diff options
author | Patrick Scott <phanna@android.com> | 2010-02-04 10:37:17 -0500 |
---|---|---|
committer | Patrick Scott <phanna@android.com> | 2010-02-04 10:39:42 -0500 |
commit | c7f5f8508d98d5952d42ed7648c2a8f30a4da156 (patch) | |
tree | dd51dbfbf6670daa61279b3a19e7b1835b301dbf /googleurl/src | |
parent | 139d8152182f9093f03d9089822b688e49fa7667 (diff) | |
download | external_chromium-c7f5f8508d98d5952d42ed7648c2a8f30a4da156.zip external_chromium-c7f5f8508d98d5952d42ed7648c2a8f30a4da156.tar.gz external_chromium-c7f5f8508d98d5952d42ed7648c2a8f30a4da156.tar.bz2 |
Initial source checkin.
The source files were determined by building net_unittests in chromium's source
tree. Some of the obvious libraries were left out (v8, gmock, gtest).
The Android.mk file has all the sources (minus unittests and tools) that were
used during net_unittests compilation. Nothing builds yet because of STL, but
that is the next task. The .cpp files will most likely not compile anyway
because of the LOCAL_CPP_EXTENSION mod. I will have to break this into multiple
projects to get around that limitation.
Diffstat (limited to 'googleurl/src')
33 files changed, 11995 insertions, 0 deletions
diff --git a/googleurl/src/gurl.cc b/googleurl/src/gurl.cc new file mode 100644 index 0000000..2dab0b2 --- /dev/null +++ b/googleurl/src/gurl.cc @@ -0,0 +1,450 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifdef WIN32 +#include <windows.h> +#else +#include <pthread.h> +#endif + +#include <algorithm> + +#include "googleurl/src/gurl.h" + +#include "base/logging.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_util.h" + +namespace { + +// External template that can handle initialization of either character type. +// The input spec is given, and the canonical version will be placed in +// |*canonical|, along with the parsing of the canonical spec in |*parsed|. +template<typename STR> +bool InitCanonical(const STR& input_spec, + std::string* canonical, + url_parse::Parsed* parsed) { + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + canonical->reserve(input_spec.size() + 32); + url_canon::StdStringCanonOutput output(canonical); + bool success = url_util::Canonicalize( + input_spec.data(), static_cast<int>(input_spec.length()), + NULL, &output, parsed); + + output.Complete(); // Must be done before using string. + return success; +} + +static std::string* empty_string = NULL; +static GURL* empty_gurl = NULL; + +#ifdef WIN32 + +// Returns a static reference to an empty string for returning a reference +// when there is no underlying string. +const std::string& EmptyStringForGURL() { + // Avoid static object construction/destruction on startup/shutdown. + if (!empty_string) { + // Create the string. Be careful that we don't break in the case that this + // is being called from multiple threads. Statics are not threadsafe. + std::string* new_empty_string = new std::string; + if (InterlockedCompareExchangePointer( + reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) { + // The old value was non-NULL, so no replacement was done. Another + // thread did the initialization out from under us. 
+ delete new_empty_string; + } + } + return *empty_string; +} + +#else + +static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT; +static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT; + +void EmptyStringForGURLOnce(void) { + empty_string = new std::string; +} + +const std::string& EmptyStringForGURL() { + // Avoid static object construction/destruction on startup/shutdown. + pthread_once(&empty_string_once, EmptyStringForGURLOnce); + return *empty_string; +} + +#endif // WIN32 + +} // namespace + +GURL::GURL() : is_valid_(false) { +} + +GURL::GURL(const GURL& other) + : spec_(other.spec_), + is_valid_(other.is_valid_), + parsed_(other.parsed_) { +} + +GURL::GURL(const std::string& url_string) { + is_valid_ = InitCanonical(url_string, &spec_, &parsed_); +} + +GURL::GURL(const string16& url_string) { + is_valid_ = InitCanonical(url_string, &spec_, &parsed_); +} + +GURL::GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid) + : spec_(canonical_spec, canonical_spec_len), + is_valid_(is_valid), + parsed_(parsed) { +#ifndef NDEBUG + // For testing purposes, check that the parsed canonical URL is identical to + // what we would have produced. Skip checking for invalid URLs have no meaning + // and we can't always canonicalize then reproducabely. 
+ if (is_valid_) { + GURL test_url(spec_); + + DCHECK(test_url.is_valid_ == is_valid_); + DCHECK(test_url.spec_ == spec_); + + DCHECK(test_url.parsed_.scheme == parsed_.scheme); + DCHECK(test_url.parsed_.username == parsed_.username); + DCHECK(test_url.parsed_.password == parsed_.password); + DCHECK(test_url.parsed_.host == parsed_.host); + DCHECK(test_url.parsed_.port == parsed_.port); + DCHECK(test_url.parsed_.path == parsed_.path); + DCHECK(test_url.parsed_.query == parsed_.query); + DCHECK(test_url.parsed_.ref == parsed_.ref); + } +#endif +} + +const std::string& GURL::spec() const { + if (is_valid_ || spec_.empty()) + return spec_; + + DCHECK(false) << "Trying to get the spec of an invalid URL!"; + return EmptyStringForGURL(); +} + +GURL GURL::Resolve(const std::string& relative) const { + return ResolveWithCharsetConverter(relative, NULL); +} +GURL GURL::Resolve(const string16& relative) const { + return ResolveWithCharsetConverter(relative, NULL); +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::ResolveWithCharsetConverter( + const std::string& relative, + url_canon::CharsetConverter* charset_converter) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + if (!url_util::ResolveRelative( + spec_.data(), static_cast<int>(spec_.length()), parsed_, + relative.data(), static_cast<int>(relative.length()), + charset_converter, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). 
+GURL GURL::ResolveWithCharsetConverter( + const string16& relative, + url_canon::CharsetConverter* charset_converter) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + if (!url_util::ResolveRelative( + spec_.data(), static_cast<int>(spec_.length()), parsed_, + relative.data(), static_cast<int>(relative.length()), + charset_converter, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + return result; +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents( + const url_canon::Replacements<char>& replacements) const { + GURL result; + + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + result.is_valid_ = url_util::ReplaceComponents( + spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents( + const url_canon::Replacements<char16>& replacements) const { + GURL result; + + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. 
+ result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + result.is_valid_ = url_util::ReplaceComponents( + spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + return result; +} + +GURL GURL::GetOrigin() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL + if (!is_valid_ || !IsStandard()) + return GURL(); + + url_canon::Replacements<char> replacements; + replacements.ClearUsername(); + replacements.ClearPassword(); + replacements.ClearPath(); + replacements.ClearQuery(); + replacements.ClearRef(); + + return ReplaceComponents(replacements); +} + +GURL GURL::GetWithEmptyPath() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. + if (!is_valid_ || !IsStandard()) + return GURL(); + + // We could optimize this since we know that the URL is canonical, and we are + // appending a canonical path, so avoiding re-parsing. + GURL other(*this); + if (parsed_.path.len == 0) + return other; + + // Clear everything after the path. + other.parsed_.query.reset(); + other.parsed_.ref.reset(); + + // Set the path, since the path is longer than one, we can just set the + // first character and resize. 
+ other.spec_[other.parsed_.path.begin] = '/'; + other.parsed_.path.len = 1; + other.spec_.resize(other.parsed_.path.begin + 1); + return other; +} + +bool GURL::IsStandard() const { + return url_util::IsStandard(spec_.data(), static_cast<int>(spec_.length()), + parsed_.scheme); +} + +bool GURL::SchemeIs(const char* lower_ascii_scheme) const { + if (parsed_.scheme.len <= 0) + return lower_ascii_scheme == NULL; + return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin, + spec_.data() + parsed_.scheme.end(), + lower_ascii_scheme); +} + +int GURL::IntPort() const { + if (parsed_.port.is_nonempty()) + return url_parse::ParsePort(spec_.data(), parsed_.port); + return url_parse::PORT_UNSPECIFIED; +} + +int GURL::EffectiveIntPort() const { + int int_port = IntPort(); + if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard()) + return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin, + parsed_.scheme.len); + return int_port; +} + +std::string GURL::ExtractFileName() const { + url_parse::Component file_component; + url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component); + return ComponentString(file_component); +} + +std::string GURL::PathForRequest() const { + DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty"; + if (parsed_.ref.len >= 0) { + // Clip off the reference when it exists. The reference starts after the # + // sign, so we have to subtract one to also remove it. + return std::string(spec_, parsed_.path.begin, + parsed_.ref.begin - parsed_.path.begin - 1); + } + + // Use everything form the path to the end. + return std::string(spec_, parsed_.path.begin); +} + +std::string GURL::HostNoBrackets() const { + // If host looks like an IPv6 literal, strip the square brackets. 
+ url_parse::Component h(parsed_.host); + if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') { + h.begin++; + h.len -= 2; + } + return ComponentString(h); +} + +bool GURL::HostIsIPAddress() const { + if (!is_valid_ || spec_.empty()) + return false; + + url_canon::RawCanonOutputT<char, 128> ignored_output; + url_canon::CanonHostInfo host_info; + url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, + &ignored_output, &host_info); + return host_info.IsIPAddress(); +} + +#ifdef WIN32 + +const GURL& GURL::EmptyGURL() { + // Avoid static object construction/destruction on startup/shutdown. + if (!empty_gurl) { + // Create the string. Be careful that we don't break in the case that this + // is being called from multiple threads. + GURL* new_empty_gurl = new GURL; + if (InterlockedCompareExchangePointer( + reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) { + // The old value was non-NULL, so no replacement was done. Another + // thread did the initialization out from under us. + delete new_empty_gurl; + } + } + return *empty_gurl; +} + +#else + +void EmptyGURLOnce(void) { + empty_gurl = new GURL; +} + +const GURL& GURL::EmptyGURL() { + // Avoid static object construction/destruction on startup/shutdown. + pthread_once(&empty_gurl_once, EmptyGURLOnce); + return *empty_gurl; +} + +#endif // WIN32 + +bool GURL::DomainIs(const char* lower_ascii_domain, + int domain_len) const { + // Return false if this URL is not valid or domain is empty. + if (!is_valid_ || !parsed_.host.is_nonempty() || !domain_len) + return false; + + // Check whether the host name is end with a dot. If yes, treat it + // the same as no-dot unless the input comparison domain is end + // with dot. + const char* last_pos = spec_.data() + parsed_.host.end() - 1; + int host_len = parsed_.host.len; + if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) { + last_pos--; + host_len--; + } + + // Return false if host's length is less than domain's length. 
+ if (host_len < domain_len) + return false; + + // Compare this url whether belong specific domain. + const char* start_pos = spec_.data() + parsed_.host.begin + + host_len - domain_len; + + if (!url_util::LowerCaseEqualsASCII(start_pos, + last_pos + 1, + lower_ascii_domain, + lower_ascii_domain + domain_len)) + return false; + + // Check whether host has right domain start with dot, make sure we got + // right domain range. For example www.google.com has domain + // "google.com" but www.iamnotgoogle.com does not. + if ('.' != lower_ascii_domain[0] && host_len > domain_len && + '.' != *(start_pos - 1)) + return false; + + return true; +} + +void GURL::Swap(GURL* other) { + spec_.swap(other->spec_); + std::swap(is_valid_, other->is_valid_); + std::swap(parsed_, other->parsed_); +} + diff --git a/googleurl/src/gurl.h b/googleurl/src/gurl.h new file mode 100644 index 0000000..36cd14c --- /dev/null +++ b/googleurl/src/gurl.h @@ -0,0 +1,372 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_GURL_H__ +#define GOOGLEURL_SRC_GURL_H__ + +#include <iostream> +#include <string> + +#include "base/string16.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_parse.h" + +class GURL { + public: + typedef url_canon::StdStringReplacements<std::string> Replacements; + typedef url_canon::StdStringReplacements<string16> ReplacementsW; + + // Creates an empty, invalid URL. + GURL(); + + // Copy construction is relatively inexpensive, with most of the time going + // to reallocating the string. It does not re-parse. + GURL(const GURL& other); + + // The narrow version requires the input be UTF-8. Invalid UTF-8 input will + // result in an invalid URL. + // + // The wide version should also take an encoding parameter so we know how to + // encode the query parameters. It is probably sufficient for the narrow + // version to assume the query parameter encoding should be the same as the + // input encoding. + explicit GURL(const std::string& url_string /*, output_param_encoding*/); + explicit GURL(const string16& url_string /*, output_param_encoding*/); + + // Constructor for URLs that have already been parsed and canonicalized. This + // is used for conversions from KURL, for example. The caller must supply all + // information associated with the URL, which must be correct and consistent. 
+ GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid); + + // Returns true when this object represents a valid parsed URL. When not + // valid, other functions will still succeed, but you will not get canonical + // data out in the format you may be expecting. Instead, we keep something + // "reasonable looking" so that the user can see how it's busted if + // displayed to them. + bool is_valid() const { + return is_valid_; + } + + // Returns true if the URL is zero-length. Note that empty URLs are also + // invalid, and is_valid() will return false for them. This is provided + // because some users may want to treat the empty case differently. + bool is_empty() const { + return spec_.empty(); + } + + // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8, + // if the URL is valid. If the URL is not valid, this will assert and return + // the empty string (for safety in release builds, to keep them from being + // misused which might be a security problem). + // + // The URL will be ASCII except the reference fragment, which may be UTF-8. + // It is guaranteed to be valid UTF-8. + // + // The exception is for empty() URLs (which are !is_valid()) but this will + // return the empty string without asserting. + // + // Used invalid_spec() below to get the unusable spec of an invalid URL. This + // separation is designed to prevent errors that may cause security problems + // that could result from the mistaken use of an invalid URL. + const std::string& spec() const; + + // Returns the potentially invalid spec for a the URL. This spec MUST NOT be + // modified or sent over the network. It is designed to be displayed in error + // messages to the user, as the apperance of the spec may explain the error. + // If the spec is valid, the valid spec will be returned. + // + // The returned string is guaranteed to be valid UTF-8. 
+ const std::string& possibly_invalid_spec() const { + return spec_; + } + + // Getter for the raw parsed structure. This allows callers to locate parts + // of the URL within the spec themselves. Most callers should consider using + // the individual component getters below. + // + // The returned parsed structure will reference into the raw spec, which may + // or may not be valid. If you are using this to index into the spec, BE + // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you + // don't do anything "important" with invalid specs. + const url_parse::Parsed& parsed_for_possibly_invalid_spec() const { + return parsed_; + } + + // Defiant equality operator! + bool operator==(const GURL& other) const { + return spec_ == other.spec_; + } + bool operator!=(const GURL& other) const { + return spec_ != other.spec_; + } + + // Allows GURL to used as a key in STL (for example, a std::set or std::map). + bool operator<(const GURL& other) const { + return spec_ < other.spec_; + } + + // Resolves a URL that's possibly relative to this object's URL, and returns + // it. Absolute URLs are also handled according to the rules of URLs on web + // pages. + // + // It may be impossible to resolve the URLs properly. If the input is not + // "standard" (SchemeIsStandard() == false) and the input looks relative, we + // can't resolve it. In these cases, the result will be an empty, invalid + // GURL. + // + // The result may also be a nonempty, invalid URL if the input has some kind + // of encoding error. In these cases, we will try to construct a "good" URL + // that may have meaning to the user, but it will be marked invalid. + // + // It is an error to resolve a URL relative to an invalid URL. The result + // will be the empty URL. 
+ GURL Resolve(const std::string& relative) const; + GURL Resolve(const string16& relative) const; + + // Like Resolve() above but takes a character set encoder which will be used + // for any query text specified in the input. The charset converter parameter + // may be NULL, in which case it will be treated as UTF-8. + // + // TODO(brettw): These should be replaced with versions that take something + // more friendly than a raw CharsetConverter (maybe like an ICU character set + // name). + GURL ResolveWithCharsetConverter( + const std::string& relative, + url_canon::CharsetConverter* charset_converter) const; + GURL ResolveWithCharsetConverter( + const string16& relative, + url_canon::CharsetConverter* charset_converter) const; + + // Creates a new GURL by replacing the current URL's components with the + // supplied versions. See the Replacements class in url_canon.h for more. + // + // These are not particularly quick, so avoid doing mutations when possible. + // Prefer the 8-bit version when possible. + // + // It is an error to replace components of an invalid URL. The result will + // be the empty URL. + // + // Note that we use the more general url_canon::Replacements type to give + // callers extra flexibility rather than our override. + GURL ReplaceComponents( + const url_canon::Replacements<char>& replacements) const; + GURL ReplaceComponents( + const url_canon::Replacements<char16>& replacements) const; + + // A helper function that is equivalent to replacing the path with a slash + // and clearing out everything after that. We sometimes need to know just the + // scheme and the authority. If this URL is not a standard URL (it doesn't + // have the regular authority and path sections), then the result will be + // an empty, invalid GURL. Note that this *does* work for file: URLs, which + // some callers may want to filter out before calling this. + // + // It is an error to get an empty path on an invalid URL. The result + // will be the empty URL. 
+ GURL GetWithEmptyPath() const; + + // A helper function to return a GURL containing just the scheme, host, + // and port from a URL. Equivalent to clearing any username and password, + // replacing the path with a slash, and clearing everything after that. If + // this URL is not a standard URL, then the result will be an empty, + // invalid GURL. If the URL has neither username nor password, this + // degenerates to GetWithEmptyPath(). + // + // It is an error to get the origin of an invalid URL. The result + // will be the empty URL. + GURL GetOrigin() const; + + // Returns true if the scheme for the current URL is a known "standard" + // scheme or there is a "://" after it. Standard schemes have an authority + // and a path section. This includes file:, which some callers may want to + // filter out explicitly by calling SchemeIsFile. + bool IsStandard() const; + + // Returns true if the given parameter (should be lower-case ASCII to match + // the canonicalized scheme) is the scheme for this URL. This call is more + // efficient than getting the scheme and comparing it because no copies or + // object constructions are done. + bool SchemeIs(const char* lower_ascii_scheme) const; + + // We often need to know if this is a file URL. File URLs are "standard", but + // are often treated separately by some programs. + bool SchemeIsFile() const { + return SchemeIs("file"); + } + + // If the scheme indicates a secure connection + bool SchemeIsSecure() const { + return SchemeIs("https"); + } + + // Returns true if the hostname is an IP address. Note: this function isn't + // as cheap as a simple getter because it re-parses the hostname to verify. + // This currently identifies only IPv4 addresses (bug 822685). + bool HostIsIPAddress() const; + + // Getters for various components of the URL. The returned string will be + // empty if the component is empty or is not present. + std::string scheme() const { // Not including the colon. See also SchemeIs. 
+ return ComponentString(parsed_.scheme); + } + std::string username() const { + return ComponentString(parsed_.username); + } + std::string password() const { + return ComponentString(parsed_.password); + } + // Note that this may be a hostname, an IPv4 address, or an IPv6 literal + // surrounded by square brackets, like "[2001:db8::1]". To exclude these + // brackets, use HostNoBrackets() below. + std::string host() const { + return ComponentString(parsed_.host); + } + std::string port() const { // Returns -1 if "default" + return ComponentString(parsed_.port); + } + std::string path() const { // Including first slash following host + return ComponentString(parsed_.path); + } + std::string query() const { // Stuff following '?' + return ComponentString(parsed_.query); + } + std::string ref() const { // Stuff following '#' + return ComponentString(parsed_.ref); + } + + // Existance querying. These functions will return true if the corresponding + // URL component exists in this URL. Note that existance is different than + // being nonempty. http://www.google.com/? has a query that just happens to + // be empty, and has_query() will return true. + bool has_scheme() const { + return parsed_.scheme.len >= 0; + } + bool has_username() const { + return parsed_.username.len >= 0; + } + bool has_password() const { + return parsed_.password.len >= 0; + } + bool has_host() const { + // Note that hosts are special, absense of host means length 0. + return parsed_.host.len > 0; + } + bool has_port() const { + return parsed_.port.len >= 0; + } + bool has_path() const { + // Note that http://www.google.com/" has a path, the path is "/". This can + // return false only for invalid or nonstandard URLs. + return parsed_.path.len >= 0; + } + bool has_query() const { + return parsed_.query.len >= 0; + } + bool has_ref() const { + return parsed_.ref.len >= 0; + } + + // Returns a parsed version of the port. 
Can also be any of the special + // values defined in Parsed for ExtractPort. + int IntPort() const; + + // Returns the port number of the url, or the default port number. + // If the scheme has no concept of port (or unknown default) returns + // PORT_UNSPECIFIED. + int EffectiveIntPort() const; + + // Extracts the filename portion of the path and returns it. The filename + // is everything after the last slash in the path. This may be empty. + std::string ExtractFileName() const; + + // Returns the path that should be sent to the server. This is the path, + // parameter, and query portions of the URL. It is guaranteed to be ASCII. + std::string PathForRequest() const; + + // Returns the host, excluding the square brackets surrounding IPv6 address + // literals. This can be useful for passing to getaddrinfo(). + std::string HostNoBrackets() const; + + // Returns true if this URL's host matches or is in the same domain as + // the given input string. For example if this URL was "www.google.com", + // this would match "com", "google.com", and "www.google.com + // (input domain should be lower-case ASCII to match the canonicalized + // scheme). This call is more efficient than getting the host and check + // whether host has the specific domain or not because no copies or + // object constructions are done. + // + // If function DomainIs has parameter domain_len, which means the parameter + // lower_ascii_domain does not gurantee to terminate with NULL character. + bool DomainIs(const char* lower_ascii_domain, int domain_len) const; + + // If function DomainIs only has parameter lower_ascii_domain, which means + // domain string should be terminate with NULL character. + bool DomainIs(const char* lower_ascii_domain) const { + return DomainIs(lower_ascii_domain, + static_cast<int>(strlen(lower_ascii_domain))); + } + + // Swaps the contents of this GURL object with the argument without doing + // any memory allocations. 
+ void Swap(GURL* other); + + // Returns a reference to a singleton empty GURL. This object is for callers + // who return references but don't have anything to return in some cases. + // This function may be called from any thread. + static const GURL& EmptyGURL(); + + private: + // Returns the substring of the input identified by the given component. + std::string ComponentString(const url_parse::Component& comp) const { + if (comp.len <= 0) + return std::string(); + return std::string(spec_, comp.begin, comp.len); + } + + // The actual text of the URL, in canonical ASCII form. + std::string spec_; + + // Set when the given URL is valid. Otherwise, we may still have a spec and + // components, but they may not identify valid resources (for example, an + // invalid port number, invalid characters in the scheme, etc.). + bool is_valid_; + + // Identified components of the canonical spec. + url_parse::Parsed parsed_; + + // TODO bug 684583: Add encoding for query params. +}; + +// Stream operator so GURL can be used in assertion statements. +inline std::ostream& operator<<(std::ostream& out, const GURL& url) { + return out << url.possibly_invalid_spec(); +} + +#endif // GOOGLEURL_SRC_GURL_H__ diff --git a/googleurl/src/gurl_test_main.cc b/googleurl/src/gurl_test_main.cc new file mode 100644 index 0000000..9a7c9f4 --- /dev/null +++ b/googleurl/src/gurl_test_main.cc @@ -0,0 +1,97 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "build/build_config.h" + +#if defined(OS_WIN) +#include <windows.h> +#endif + +#include <string> + +#include "testing/gtest/include/gtest/gtest.h" +#include "unicode/putil.h" +#include "unicode/udata.h" + +#define ICU_UTIL_DATA_SHARED 1 +#define ICU_UTIL_DATA_STATIC 2 + +#ifndef ICU_UTIL_DATA_IMPL + +#if defined(OS_WIN) +#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_SHARED +#elif defined(OS_MACOSX) +#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_STATIC +#elif defined(OS_LINUX) +#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_FILE +#endif + +#endif // ICU_UTIL_DATA_IMPL + +#if defined(OS_WIN) +#define ICU_UTIL_DATA_SYMBOL "icudt" U_ICU_VERSION_SHORT "_dat" +#define ICU_UTIL_DATA_SHARED_MODULE_NAME "icudt" U_ICU_VERSION_SHORT ".dll" +#endif + +bool InitializeICU() { +#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_SHARED) + // We expect to find the ICU data module alongside the current module. + // Because the module name is ASCII-only, "A" API should be safe. 
+ HMODULE module = LoadLibraryA(ICU_UTIL_DATA_SHARED_MODULE_NAME); + if (!module) + return false; + + FARPROC addr = GetProcAddress(module, ICU_UTIL_DATA_SYMBOL); + if (!addr) + return false; + + UErrorCode err = U_ZERO_ERROR; + udata_setCommonData(reinterpret_cast<void*>(addr), &err); + return err == U_ZERO_ERROR; +#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC) + // Mac bundles the ICU data in. + return true; +#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE) + // We expect to find the ICU data module alongside the current module. + u_setDataDirectory("."); + // Only look for the packaged data file; + // the default behavior is to look for individual files. + UErrorCode err = U_ZERO_ERROR; + udata_setFileAccess(UDATA_ONLY_PACKAGES, &err); + return err == U_ZERO_ERROR; +#endif +} + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + + InitializeICU(); + + return RUN_ALL_TESTS(); +} diff --git a/googleurl/src/gurl_unittest.cc b/googleurl/src/gurl_unittest.cc new file mode 100644 index 0000000..4e81de6 --- /dev/null +++ b/googleurl/src/gurl_unittest.cc @@ -0,0 +1,433 @@ +// Copyright 2007 Google Inc. All Rights Reserved. +// Author: brettw@google.com (Brett Wilson) + +#include "googleurl/src/gurl.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_test_utils.h" +#include "testing/gtest/include/gtest/gtest.h" + +// Some implementations of base/basictypes.h may define ARRAYSIZE. +// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro +// which is in our version of basictypes.h. 
+#ifndef ARRAYSIZE +#define ARRAYSIZE ARRAYSIZE_UNSAFE +#endif + +using url_test_utils::WStringToUTF16; +using url_test_utils::ConvertUTF8ToUTF16; + +namespace { + +template<typename CHAR> +void SetupReplacement(void (url_canon::Replacements<CHAR>::*func)(const CHAR*, + const url_parse::Component&), + url_canon::Replacements<CHAR>* replacements, + const CHAR* str) { + if (str) { + url_parse::Component comp; + if (str[0]) + comp.len = static_cast<int>(strlen(str)); + (replacements->*func)(str, comp); + } +} + +} // namespace + +// Different types of URLs should be handled differently by url_util, and +// handed off to different canonicalizers. +TEST(GURLTest, Types) { + struct TypeTest { + const char* src; + const char* expected; + } type_cases[] = { + // URLs with "://" should be treated as standard and have a hostname, even + // when the scheme is unknown. + {"something:///HOSTNAME.com/", "something://hostname.com/"}, + // In the reverse, lacking a "://" means a path URL so no canonicalization + // should happen. + {"something:HOSTNAME.com/", "something:HOSTNAME.com/"}, + {"something:/HOSTNAME.com/", "something:/HOSTNAME.com/"}, +#ifdef WIN32 + // URLs that look like absolute Windows drive specs. + {"c:\\foo.txt", "file:///C:/foo.txt"}, + {"Z|foo.txt", "file:///Z:/foo.txt"}, + {"\\\\server\\foo.txt", "file://server/foo.txt"}, + {"//server/foo.txt", "file://server/foo.txt"}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE(type_cases); i++) { + GURL gurl(type_cases[i].src); + EXPECT_STREQ(type_cases[i].expected, gurl.spec().c_str()); + } +} + +// Test the basic creation and querying of components in a GURL. We assume +// the parser is already tested and works, so we are mostly interested if the +// object does the right thing with the results. 
+TEST(GURLTest, Components) { + GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref")); + EXPECT_TRUE(url.is_valid()); + EXPECT_TRUE(url.SchemeIs("http")); + EXPECT_FALSE(url.SchemeIsFile()); + + // This is the narrow version of the URL, which should match the wide input. + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("user", url.username()); + EXPECT_EQ("pass", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("99", url.port()); + EXPECT_EQ(99, url.IntPort()); + EXPECT_EQ("/foo;bar", url.path()); + EXPECT_EQ("q=a", url.query()); + EXPECT_EQ("ref", url.ref()); +} + +TEST(GURLTest, Empty) { + GURL url; + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("", url.spec()); + + EXPECT_EQ("", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("", url.host()); + EXPECT_EQ("", url.port()); + EXPECT_EQ(url_parse::PORT_UNSPECIFIED, url.IntPort()); + EXPECT_EQ("", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Copy) { + GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref")); + + GURL url2(url); + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("http", url2.scheme()); + EXPECT_EQ("user", url2.username()); + EXPECT_EQ("pass", url2.password()); + EXPECT_EQ("google.com", url2.host()); + EXPECT_EQ("99", url2.port()); + EXPECT_EQ(99, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + // Copying of invalid URL should be invalid + GURL invalid; + GURL invalid2(invalid); + EXPECT_FALSE(invalid2.is_valid()); + EXPECT_EQ("", invalid2.spec()); + EXPECT_EQ("", invalid2.scheme()); + EXPECT_EQ("", invalid2.username()); + EXPECT_EQ("", invalid2.password()); + EXPECT_EQ("", invalid2.host()); + EXPECT_EQ("", invalid2.port()); + 
EXPECT_EQ(url_parse::PORT_UNSPECIFIED, invalid2.IntPort()); + EXPECT_EQ("", invalid2.path()); + EXPECT_EQ("", invalid2.query()); + EXPECT_EQ("", invalid2.ref()); +} + +// Given an invalid URL, we should still get most of the components. +TEST(GURLTest, Invalid) { + GURL url("http:google.com:foo"); + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("foo", url.port()); + EXPECT_EQ(url_parse::PORT_INVALID, url.IntPort()); + EXPECT_EQ("/", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Resolve) { + // The tricky cases for relative URL resolving are tested in the + // canonicalizer unit test. Here, we just test that the GURL integration + // works properly. + struct ResolveCase { + const char* base; + const char* relative; + bool expected_valid; + const char* expected; + } resolve_cases[] = { + {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"}, + {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"}, + {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"}, + {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"}, + {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"}, + // Unknown schemes with a "://" should be treated as standard. + {"somescheme://foo/", "bar", true, "somescheme://foo/bar"}, + // Unknown schemes with no "://" are not standard. + {"data:blahblah", "http://google.com/", true, "http://google.com/"}, + {"data:blahblah", "http:google.com", true, "http://google.com/"}, + {"data:/blahblah", "file.html", false, ""}, + }; + + for (size_t i = 0; i < ARRAYSIZE(resolve_cases); i++) { + // 8-bit code path. 
+ GURL input(resolve_cases[i].base); + GURL output = input.Resolve(resolve_cases[i].relative); + EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()); + EXPECT_EQ(resolve_cases[i].expected, output.spec()); + + // Wide code path. + GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base)); + GURL outputw = + input.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative)); + EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()); + EXPECT_EQ(resolve_cases[i].expected, outputw.spec()); + } +} + +TEST(GURLTest, GetOrigin) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello,world\");", ""}, + {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/"}, + {"http://user@www.google.com", "http://www.google.com/"}, + {"http://:pass@www.google.com", "http://www.google.com/"}, + {"http://:@www.google.com", "http://www.google.com/"}, + }; + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + GURL origin = url.GetOrigin(); + EXPECT_EQ(cases[i].expected, origin.spec()); + } +} + +TEST(GURLTest, GetWithEmptyPath) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello, world\");", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + GURL empty_path = url.GetWithEmptyPath(); + EXPECT_EQ(cases[i].expected, empty_path.spec()); + } +} + +TEST(GURLTest, Replacements) { + // The url canonicalizer replacement test will handle most of these case. + // The most important thing to do here is to check that the proper + // canonicalizer gets called based on the scheme of the input. 
+ struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; + } replace_cases[] = { + {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"}, + {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"}, + {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"}, +#ifdef WIN32 + {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + GURL url(cur.base); + GURL::Replacements repl; + SetupReplacement(&GURL::Replacements::SetScheme, &repl, cur.scheme); + SetupReplacement(&GURL::Replacements::SetUsername, &repl, cur.username); + SetupReplacement(&GURL::Replacements::SetPassword, &repl, cur.password); + SetupReplacement(&GURL::Replacements::SetHost, &repl, cur.host); + SetupReplacement(&GURL::Replacements::SetPort, &repl, cur.port); + SetupReplacement(&GURL::Replacements::SetPath, &repl, cur.path); + SetupReplacement(&GURL::Replacements::SetQuery, &repl, cur.query); + SetupReplacement(&GURL::Replacements::SetRef, &repl, cur.ref); + GURL output = url.ReplaceComponents(repl); + + EXPECT_EQ(replace_cases[i].expected, output.spec()); + } +} + +TEST(GURLTest, PathForRequest) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "/"}, + {"http://www.google.com/", "/"}, + {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22"}, + {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html"}, + {"http://www.google.com/foo/bar.html?query#ref", 
"/foo/bar.html?query"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + std::string path_request = url.PathForRequest(); + EXPECT_EQ(cases[i].expected, path_request); + } +} + +TEST(GURLTest, EffectiveIntPort) { + struct PortTest { + const char* spec; + int expected_int_port; + } port_tests[] = { + // http + {"http://www.google.com/", 80}, + {"http://www.google.com:80/", 80}, + {"http://www.google.com:443/", 443}, + + // https + {"https://www.google.com/", 443}, + {"https://www.google.com:443/", 443}, + {"https://www.google.com:80/", 80}, + + // ftp + {"ftp://www.google.com/", 21}, + {"ftp://www.google.com:21/", 21}, + {"ftp://www.google.com:80/", 80}, + + // gopher + {"gopher://www.google.com/", 70}, + {"gopher://www.google.com:70/", 70}, + {"gopher://www.google.com:80/", 80}, + + // file - no port + {"file://www.google.com/", url_parse::PORT_UNSPECIFIED}, + {"file://www.google.com:443/", url_parse::PORT_UNSPECIFIED}, + + // data - no port + {"data:www.google.com:90", url_parse::PORT_UNSPECIFIED}, + {"data:www.google.com", url_parse::PORT_UNSPECIFIED}, + }; + + for (size_t i = 0; i < ARRAYSIZE(port_tests); i++) { + GURL url(port_tests[i].spec); + EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort()); + } +} + +TEST(GURLTest, IPAddress) { + struct IPTest { + const char* spec; + bool expected_ip; + } ip_tests[] = { + {"http://www.google.com/", false}, + {"http://192.168.9.1/", true}, + {"http://192.168.9.1.2/", false}, + {"http://192.168.m.1/", false}, + {"http://2001:db8::1/", false}, + {"http://[2001:db8::1]/", true}, + {"", false}, + {"some random input!", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE(ip_tests); i++) { + GURL url(ip_tests[i].spec); + EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress()); + } +} + +TEST(GURLTest, HostNoBrackets) { + struct TestCase { + const char* input; + const char* expected_host; + const char* expected_plainhost; + } cases[] = { + {"http://www.google.com", 
"www.google.com", "www.google.com"}, + {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"}, + {"http://[::]/", "[::]", "::"}, + + // Don't require a valid URL, but don't crash either. + {"http://[]/", "[]", ""}, + {"http://[x]/", "[x]", "x"}, + {"http://[x/", "[x", "[x"}, + {"http://x]/", "x]", "x]"}, + {"http://[/", "[", "["}, + {"http://]/", "]", "]"}, + {"", "", ""}, + }; + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + EXPECT_EQ(cases[i].expected_host, url.host()); + EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets()); + } +} + +TEST(GURLTest, DomainIs) { + const char google_domain[] = "google.com"; + + GURL url_1("http://www.google.com:99/foo"); + EXPECT_TRUE(url_1.DomainIs(google_domain)); + + GURL url_2("http://google.com:99/foo"); + EXPECT_TRUE(url_2.DomainIs(google_domain)); + + GURL url_3("http://google.com./foo"); + EXPECT_TRUE(url_3.DomainIs(google_domain)); + + GURL url_4("http://google.com/foo"); + EXPECT_FALSE(url_4.DomainIs("google.com.")); + + GURL url_5("http://google.com./foo"); + EXPECT_TRUE(url_5.DomainIs("google.com.")); + + GURL url_6("http://www.google.com./foo"); + EXPECT_TRUE(url_6.DomainIs(".com.")); + + GURL url_7("http://www.balabala.com/foo"); + EXPECT_FALSE(url_7.DomainIs(google_domain)); + + GURL url_8("http://www.google.com.cn/foo"); + EXPECT_FALSE(url_8.DomainIs(google_domain)); + + GURL url_9("http://www.iamnotgoogle.com/foo"); + EXPECT_FALSE(url_9.DomainIs(google_domain)); + + GURL url_10("http://www.iamnotgoogle.com../foo"); + EXPECT_FALSE(url_10.DomainIs(".com")); +} + +// Newlines should be stripped from inputs. +TEST(GURLTest, Newlines) { + // Constructor. + GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n "); + EXPECT_EQ("http://www.google.com/asdf", url_1.spec()); + + // Relative path resolver. + GURL url_2 = url_1.Resolve(" \n /fo\to\r "); + EXPECT_EQ("http://www.google.com/foo", url_2.spec()); + + // Note that newlines are NOT stripped from ReplaceComponents. 
+} + +TEST(GURLTest, IsStandard) { + GURL a("http:foo/bar"); + EXPECT_TRUE(a.IsStandard()); + + GURL b("foo:bar/baz"); + EXPECT_FALSE(b.IsStandard()); + + GURL c("foo://bar/baz"); + EXPECT_TRUE(c.IsStandard()); +} diff --git a/googleurl/src/url_canon.h b/googleurl/src/url_canon.h new file mode 100644 index 0000000..143574d --- /dev/null +++ b/googleurl/src/url_canon.h @@ -0,0 +1,871 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#ifndef GOOGLEURL_SRC_URL_CANON_H__ +#define GOOGLEURL_SRC_URL_CANON_H__ + +#include <memory.h> +#include <stdlib.h> + +#include "base/string16.h" +#include "googleurl/src/url_parse.h" + +namespace url_canon { + +// Canonicalizer output ------------------------------------------------------- + +// Base class for the canonicalizer output, this maintains a buffer and +// supports simple resizing and append operations on it. +// +// It is VERY IMPORTANT that no virtual function calls be made on the common +// code path. We only have two virtual function calls, the destructor and a +// resize function that is called when the existing buffer is not big enough. +// The derived class is then in charge of setting up our buffer which we will +// manage. +template<typename T> +class CanonOutputT { + public: + CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) { + } + virtual ~CanonOutputT() { + } + + // Implemented to resize the buffer. This function should update the buffer + // pointer to point to the new buffer, and any old data up to |cur_len_| in + // the buffer must be copied over. + // + // The new size |sz| must be larger than buffer_len_. + virtual void Resize(int sz) = 0; + + // Accessor for returning a character at a given position. The input offset + // must be in the valid range. + inline char at(int offset) const { + return buffer_[offset]; + } + + // Sets the character at the given position. The given position MUST be less + // than the length(). + inline void set(int offset, int ch) { + buffer_[offset] = ch; + } + + // Returns the number of characters currently in the buffer. + inline int length() const { + return cur_len_; + } + + // Returns the current capacity of the buffer. The length() is the number of + // characters that have been declared to be written, but the capacity() is + // the number that can be written without reallocation. 
If the caller must + // write many characters at once, it can make sure there is enough capacity, + // write the data, then use set_size() to declare the new length(). + int capacity() const { + return buffer_len_; + } + + // Called by the user of this class to get the output. The output will NOT + // be NULL-terminated. Call length() to get the + // length. + const T* data() const { + return buffer_; + } + T* data() { + return buffer_; + } + + // Shortens the URL to the new length. Used for "backing up" when processing + // relative paths. This can also be used if an external function writes a lot + // of data to the buffer (when using the "Raw" version below) beyond the end, + // to declare the new length. + // + // This MUST NOT be used to expand the size of the buffer beyond capacity(). + void set_length(int new_len) { + cur_len_ = new_len; + } + + // This is the most performance critical function, since it is called for + // every character. + void push_back(T ch) { + // In VC2005, putting this common case first speeds up execution + // dramatically because this branch is predicted as taken. + if (cur_len_ < buffer_len_) { + buffer_[cur_len_] = ch; + cur_len_++; + return; + } + + // Grow the buffer to hold at least one more item. Hopefully we won't have + // to do this very often. + if (!Grow(1)) + return; + + // Actually do the insertion. + buffer_[cur_len_] = ch; + cur_len_++; + } + + // Appends the given string to the output. + void Append(const T* str, int str_len) { + if (cur_len_ + str_len > buffer_len_) { + if (!Grow(cur_len_ + str_len - buffer_len_)) + return; + } + for (int i = 0; i < str_len; i++) + buffer_[cur_len_ + i] = str[i]; + cur_len_ += str_len; + } + + protected: + // Grows the given buffer so that it can fit at least |min_additional| + // characters. Returns true if the buffer could be resized, false on OOM. + bool Grow(int min_additional) { + static const int kMinBufferLen = 16; + int new_len = (buffer_len_ == 0) ? 
kMinBufferLen : buffer_len_; + do { + if (new_len >= (1 << 30)) // Prevent overflow below. + return false; + new_len *= 2; + } while (new_len < buffer_len_ + min_additional); + Resize(new_len); + return true; + } + + T* buffer_; + int buffer_len_; + + // Used characters in the buffer. + int cur_len_; +}; + +// Simple implementation of the CanonOutput using new[]. This class +// also supports a static buffer so if it is allocated on the stack, most +// URLs can be canonicalized with no heap allocations. +template<typename T, int fixed_capacity = 1024> +class RawCanonOutputT : public CanonOutputT<T> { + public: + RawCanonOutputT() : CanonOutputT<T>() { + this->buffer_ = fixed_buffer_; + this->buffer_len_ = fixed_capacity; + } + virtual ~RawCanonOutputT() { + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + } + + virtual void Resize(int sz) { + T* new_buf = new T[sz]; + memcpy(new_buf, this->buffer_, + sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz)); + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + this->buffer_ = new_buf; + this->buffer_len_ = sz; + } + + protected: + T fixed_buffer_[fixed_capacity]; +}; + +// Normally, all canonicalization output is in narrow characters. We support +// the templates so it can also be used internally if a wide buffer is +// required. +typedef CanonOutputT<char> CanonOutput; +typedef CanonOutputT<char16> CanonOutputW; + +template<int fixed_capacity> +class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {}; +template<int fixed_capacity> +class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; + +// Character set converter ---------------------------------------------------- +// +// Converts query strings into a custom encoding. The embedder can supply an +// implementation of this class to interface with their own character set +// conversion libraries. +// +// Embedders will want to see the unit test for the ICU version. 

// Abstract interface with a single conversion hook, ConvertFromUTF16(), that
// implementations must provide.
class CharsetConverter {
 public:
  CharsetConverter() {}
  virtual ~CharsetConverter() {}

  // Converts the given input string from UTF-16 to whatever output format the
  // converter supports. This is used only for the query encoding conversion,
  // which does not fail. Instead, the converter should insert "invalid
  // character" characters in the output for invalid sequences, and do the
  // best it can.
  //
  // If the input contains a character not representable in the output
  // character set, the converter should append the HTML entity sequence in
  // decimal, (such as "&#20320;") with escaping of the ampersand, number
  // sign, and semicolon (in the previous example it would be
  // "%26%2320320%3B"). This rule is based on what IE does in this situation.
  //
  // |input| / |input_len| describe the UTF-16 text to convert and the result
  // goes to |output|.
  // NOTE(review): presumably |input_len| counts char16 units and the result
  // is appended to |output| rather than replacing it; confirm against an
  // implementation.
  virtual void ConvertFromUTF16(const char16* input,
                                int input_len,
                                CanonOutput* output) = 0;
};

// Whitespace -----------------------------------------------------------------

// Searches for whitespace that should be removed from the middle of URLs, and
// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
// are preserved, which is what most browsers do. A pointer to the output will
// be returned, and the length of that output will be in |output_len|.
//
// This should be called before parsing if whitespace removal is desired (which
// it normally is when you are canonicalizing).
//
// If no whitespace is removed, this function will not use the buffer and will
// return a pointer to the input, to avoid the extra copy. If modification is
// required, the given |buffer| will be used and the returned pointer will
// point to the beginning of the buffer.
//
// Therefore, callers should not use the buffer, since it may actually be
// empty; use the computed pointer and |*output_len| instead.
const char* RemoveURLWhitespace(const char* input, int input_len,
                                CanonOutputT<char>* buffer,
                                int* output_len);
const char16* RemoveURLWhitespace(const char16* input, int input_len,
                                  CanonOutputT<char16>* buffer,
                                  int* output_len);

// IDN ------------------------------------------------------------------------

// Converts the Unicode input representing a hostname to ASCII using IDN rules.
// The output must fall in the ASCII range, but will be encoded in UTF-16.
//
// On success, the output will be filled with the ASCII host name and it will
// return true. Unlike most other canonicalization functions, this assumes that
// the output is empty. The beginning of the host will be at offset 0, and
// the length of the output will be set to the length of the new host name.
//
// On error, returns false. The output in this case is undefined.
bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);

// Piece-by-piece canonicalizers ----------------------------------------------
//
// These individual canonicalizers append the canonicalized versions of the
// corresponding URL component to the given CanonOutput. The spec and the
// previously-identified range of that component are the input. The range of
// the canonicalized component will be written to the output component.
//
// These functions all append to the output so they can be chained. Make sure
// the output is empty when you start.
//
// These functions return boolean values indicating success. On failure, they
// will attempt to write something reasonable to the output so that, if
// displayed to the user, they will recognise it as something that's messed up.
// Nothing more should ever be done with these invalid URLs, however.

// Scheme: Appends the scheme and colon to the URL. The output component will
// indicate the range of characters up to but not including the colon.
//
// Canonical URLs always have a scheme. If the scheme is not present in the
// input, this will just write the colon to indicate an empty scheme. Does not
// append slashes which will be needed before any authority components for most
// URLs.
//
// The 8-bit version requires UTF-8 encoding.
bool CanonicalizeScheme(const char* spec,
                        const url_parse::Component& scheme,
                        CanonOutput* output,
                        url_parse::Component* out_scheme);
bool CanonicalizeScheme(const char16* spec,
                        const url_parse::Component& scheme,
                        CanonOutput* output,
                        url_parse::Component* out_scheme);

// User info: username/password. If present, this will add the delimiters so
// the output will be "<username>:<password>@" or "<username>@". Empty
// username/password pairs, or empty passwords, will get converted to
// nonexistent in the canonical version.
//
// The components for the username and password refer to ranges in the
// respective source strings. Usually, these will be the same string, which
// is legal as long as the two components don't overlap.
//
// The 8-bit version requires UTF-8 encoding.
bool CanonicalizeUserInfo(const char* username_source,
                          const url_parse::Component& username,
                          const char* password_source,
                          const url_parse::Component& password,
                          CanonOutput* output,
                          url_parse::Component* out_username,
                          url_parse::Component* out_password);
bool CanonicalizeUserInfo(const char16* username_source,
                          const url_parse::Component& username,
                          const char16* password_source,
                          const url_parse::Component& password,
                          CanonOutput* output,
                          url_parse::Component* out_username,
                          url_parse::Component* out_password);


// This structure holds detailed state exported from the IP/Host canonicalizers.
// Additional fields may be added as callers require them.
struct CanonHostInfo {
  // Starts out as NEUTRAL with zero IPv4 components and a default-constructed
  // |out_host|.
  CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}

  // Convenience function to test if family is an IP address.
  bool IsIPAddress() const { return family == IPV4 || family == IPV6; }

  // This field summarizes how the input was classified by the canonicalizer.
  enum Family {
    NEUTRAL,   // - Doesn't resemble an IP address. As far as the IP
               //   canonicalizer is concerned, it should be treated as a
               //   hostname.
    BROKEN,    // - Almost an IP, but was not canonicalized. This could be an
               //   IPv4 address where truncation occurred, or something
               //   containing the special characters :[] which did not parse
               //   as an IPv6 address. Never attempt to connect to this
               //   address, because it might actually succeed!
    IPV4,      // - Successfully canonicalized as an IPv4 address.
    IPV6,      // - Successfully canonicalized as an IPv6 address.
  };
  Family family;

  // If |family| is IPV4, then this is the number of nonempty dot-separated
  // components in the input text, from 1 to 4. If |family| is not IPV4,
  // this value is undefined.
  int num_ipv4_components;

  // Location of host within the canonicalized output.
  // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
  // CanonicalizeHostVerbose() always sets it.
  url_parse::Component out_host;
};


// Host.
//
// The 8-bit version requires UTF-8 encoding. Use this version when you only
// need to know whether canonicalization succeeded.
bool CanonicalizeHost(const char* spec,
                      const url_parse::Component& host,
                      CanonOutput* output,
                      url_parse::Component* out_host);
bool CanonicalizeHost(const char16* spec,
                      const url_parse::Component& host,
                      CanonOutput* output,
                      url_parse::Component* out_host);

// Extended version of CanonicalizeHost, which returns additional information.
// Use this when you need to know whether the hostname was an IP address.
// A successful return is indicated by host_info->family != BROKEN. See the
// definition of CanonHostInfo above for details.
+void CanonicalizeHostVerbose(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +void CanonicalizeHostVerbose(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + + +// IP addresses. +// +// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is +// an IP address, it will canonicalize it as such, appending it to |output|. +// Additional status information is returned via the |*host_info| parameter. +// See the definition of CanonHostInfo above for details. +// +// This is called AUTOMATICALLY from the host canonicalizer, which ensures that +// the input is unescaped and name-prepped, etc. It should not normally be +// necessary or wise to call this directly. +void CanonicalizeIPAddress(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); +void CanonicalizeIPAddress(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info); + +// Port: this function will add the colon for the port if a port is present. +// The caller can pass url_parse::PORT_UNSPECIFIED as the +// default_port_for_scheme argument if there is no default port. +// +// The 8-bit version requires UTF-8 encoding. +bool CanonicalizePort(const char* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); +bool CanonicalizePort(const char16* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port); + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. +int DefaultPortForScheme(const char* scheme, int scheme_len); + +// Path. If the input does not begin in a slash (including if the input is +// empty), we'll prepend a slash to the path to make it canonical. 
+// +// The 8-bit version assumes UTF-8 encoding, but does not verify the validity +// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid +// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't +// an issue. Somebody giving us an 8-bit path is responsible for generating +// the path that the server expects (we'll escape high-bit characters), so +// if something is invalid, it's their problem. +bool CanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +bool CanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Canonicalizes the input as a file path. This is like CanonicalizePath except +// that it also handles Windows drive specs. For example, the path can begin +// with "c|\" and it will get properly canonicalized to "C:/". +// The string will be appended to |*output| and |*out_path| will be updated. +// +// The 8-bit version requires UTF-8 encoding. +bool FileCanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +bool FileCanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Query: Prepends the ? if needed. +// +// The 8-bit version requires the input to be UTF-8 encoding. Incorrectly +// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode +// "invalid character." This function can not fail, we always just try to do +// our best for crazy input here since web pages can set it themselves. +// +// This will convert the given input into the output encoding that the given +// character set converter object provides. The converter will only be called +// if necessary, for ASCII input, no conversions are necessary. +// +// The converter can be NULL. In this case, the output encoding will be UTF-8. 
+void CanonicalizeQuery(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); +void CanonicalizeQuery(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query); + +// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only +// canonicalizer that does not produce ASCII output). The output is +// guaranteed to be valid UTF-8. +// +// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use +// the "Unicode replacement character" for the confusing bits and copy the rest. +void CanonicalizeRef(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); +void CanonicalizeRef(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path); + +// Full canonicalizer --------------------------------------------------------- +// +// These functions replace any string contents, rather than append as above. +// See the above piece-by-piece functions for information specific to +// canonicalizing individual components. +// +// The output will be ASCII except the reference fragment, which may be UTF-8. +// +// The 8-bit versions require UTF-8 encoding. + +// Use for standard URLs with authorities and paths. +bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool CanonicalizeStandardURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for file URLs. 
+bool CanonicalizeFileURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool CanonicalizeFileURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for path URLs such as javascript. This does not modify the path in any +// way, for example, by escaping it. +bool CanonicalizePathURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool CanonicalizePathURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Use for mailto URLs. This "canonicalizes" the url into a path and query +// component. It does not attempt to merge "to" fields. It uses UTF-8 for +// the query encoding if there is a query. This is because a mailto URL is +// really intended for an external mail program, and the encoding of a page, +// etc. which would influence a query encoding normally are irrelevant. +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool CanonicalizeMailtoURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Part replacer -------------------------------------------------------------- + +// Internal structure used for storing separate strings for each component. +// The basic canonicalization functions use this structure internally so that +// component replacement (different strings for different components) can be +// treated on the same code path as regular canonicalization (the same string +// for each component). +// +// A url_parse::Parsed structure usually goes along with this. 
Those +// components identify offsets within these strings, so that they can all be +// in the same string, or spread arbitrarily across different ones. +// +// This structure does not own any data. It is the caller's responsibility to +// ensure that the data the pointers point to stays in scope and is not +// modified. +template<typename CHAR> +struct URLComponentSource { + // Constructor normally used by callers wishing to replace components. This + // will make them all NULL, which is no replacement. The caller would then + // override the components they want to replace. + URLComponentSource() + : scheme(NULL), + username(NULL), + password(NULL), + host(NULL), + port(NULL), + path(NULL), + query(NULL), + ref(NULL) { + } + + // Constructor normally used internally to initialize all the components to + // point to the same spec. + explicit URLComponentSource(const CHAR* default_value) + : scheme(default_value), + username(default_value), + password(default_value), + host(default_value), + port(default_value), + path(default_value), + query(default_value), + ref(default_value) { + } + + const CHAR* scheme; + const CHAR* username; + const CHAR* password; + const CHAR* host; + const CHAR* port; + const CHAR* path; + const CHAR* query; + const CHAR* ref; +}; + +// This structure encapsulates information on modifying a URL. Each component +// may either be left unchanged, replaced, or deleted. +// +// By default, each component is unchanged. For those components that should be +// modified, call either Set* or Clear* to modify it. +// +// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT +// IN SCOPE BY THE CALLER for as long as this object exists! +// +// Prefer the 8-bit replacement version if possible since it is more efficient. 
+template<typename CHAR> +class Replacements { + public: + Replacements() { + } + + // Scheme + void SetScheme(const CHAR* s, const url_parse::Component& comp) { + sources_.scheme = s; + components_.scheme = comp; + } + // Note: we don't have a ClearScheme since this doesn't make any sense. + bool IsSchemeOverridden() const { return sources_.scheme != NULL; } + + // Username + void SetUsername(const CHAR* s, const url_parse::Component& comp) { + sources_.username = s; + components_.username = comp; + } + void ClearUsername() { + sources_.username = Placeholder(); + components_.username = url_parse::Component(); + } + bool IsUsernameOverridden() const { return sources_.username != NULL; } + + // Password + void SetPassword(const CHAR* s, const url_parse::Component& comp) { + sources_.password = s; + components_.password = comp; + } + void ClearPassword() { + sources_.password = Placeholder(); + components_.password = url_parse::Component(); + } + bool IsPasswordOverridden() const { return sources_.password != NULL; } + + // Host + void SetHost(const CHAR* s, const url_parse::Component& comp) { + sources_.host = s; + components_.host = comp; + } + void ClearHost() { + sources_.host = Placeholder(); + components_.host = url_parse::Component(); + } + bool IsHostOverridden() const { return sources_.host != NULL; } + + // Port + void SetPort(const CHAR* s, const url_parse::Component& comp) { + sources_.port = s; + components_.port = comp; + } + void ClearPort() { + sources_.port = Placeholder(); + components_.port = url_parse::Component(); + } + bool IsPortOverridden() const { return sources_.port != NULL; } + + // Path + void SetPath(const CHAR* s, const url_parse::Component& comp) { + sources_.path = s; + components_.path = comp; + } + void ClearPath() { + sources_.path = Placeholder(); + components_.path = url_parse::Component(); + } + bool IsPathOverridden() const { return sources_.path != NULL; } + + // Query + void SetQuery(const CHAR* s, const 
url_parse::Component& comp) { + sources_.query = s; + components_.query = comp; + } + void ClearQuery() { + sources_.query = Placeholder(); + components_.query = url_parse::Component(); + } + bool IsQueryOverridden() const { return sources_.query != NULL; } + + // Ref + void SetRef(const CHAR* s, const url_parse::Component& comp) { + sources_.ref = s; + components_.ref = comp; + } + void ClearRef() { + sources_.ref = Placeholder(); + components_.ref = url_parse::Component(); + } + bool IsRefOverridden() const { return sources_.ref != NULL; } + + // Getters for the internal data. See the variables below for how the + // information is encoded. + const URLComponentSource<CHAR>& sources() const { return sources_; } + const url_parse::Parsed& components() const { return components_; } + + private: + // Returns a pointer to a static empty string that is used as a placeholder + // to indicate a component should be deleted (see below). + const CHAR* Placeholder() { + static const CHAR empty_string = 0; + return &empty_string; + } + + // We support three states: + // + // Action | Source Component + // -----------------------+-------------------------------------------------- + // Don't change component | NULL (unused) + // Replace component | (replacement string) (replacement component) + // Delete component | (non-NULL) (invalid component: (0,-1)) + // + // We use a pointer to the empty string for the source when the component + // should be deleted. + URLComponentSource<CHAR> sources_; + url_parse::Parsed components_; +}; + +// The base must be an 8-bit canonical URL. 
+bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Replacing some parts of a file URL is not permitted. Everything except +// the host, path, query, and ref will be ignored. +bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Path URLs can only have the scheme and path replaced. All other components +// will be ignored. +bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Mailto URLs can only have the scheme, path, and query replaced. +// All other components will be ignored. 
+bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); +bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed); + +// Relative URL --------------------------------------------------------------- + +// Given an input URL or URL fragment |fragment|, determines if it is a +// relative or absolute URL and places the result into |*is_relative|. If it is +// relative, the relevant portion of the URL will be placed into +// |*relative_component| (there may have been trimmed whitespace, for example). +// This value is passed to ResolveRelativeURL. If the input is not relative, +// this value is UNDEFINED (it may be changed by the function). +// +// Returns true on success (we successfully determined the URL is relative or +// not). Failure means that the combination of URLs doesn't make any sense. +// +// The base URL should always be canonical, therefore is ASCII. +bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); +bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component); + +// Given a canonical parsed source URL, a URL fragment known to be relative, +// and the identified relevant portion of the relative URL (computed by +// IsRelativeURL), this produces a new parsed canonical URL in |output| and +// |out_parsed|. +// +// It also requires a flag indicating whether the base URL is a file: URL +// which triggers additional logic. 
+// +// The base URL should be canonical and have a host (may be empty for file +// URLs) and a path. If it doesn't have these, we can't resolve relative +// URLs off of it and will return the base as the output with an error flag. +// Because it is canonical it should also be ASCII. +// +// The query charset converter follows the same rules as CanonicalizeQuery. +// +// Returns true on success. On failure, the output will be "something +// reasonable" that will be consistent and valid, just probably not what +// was intended by the web page author or caller. +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_H__ diff --git a/googleurl/src/url_canon_etc.cc b/googleurl/src/url_canon_etc.cc new file mode 100644 index 0000000..672b187 --- /dev/null +++ b/googleurl/src/url_canon_etc.cc @@ -0,0 +1,391 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Canonicalizers for random bits that aren't big enough for their own files. + +#include <string.h> + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +// Returns true if the given character should be removed from the middle of a +// URL. +inline bool IsRemovableURLWhitespace(int ch) { + return ch == '\r' || ch == '\n' || ch == '\t'; +} + +// Backend for RemoveURLWhitespace (see declaration in url_canon.h). +// It sucks that we have to do this, since this takes about 13% of the total URL +// canonicalization time. +template<typename CHAR> +const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, + CanonOutputT<CHAR>* buffer, + int* output_len) { + // Fast verification that there's nothing that needs removal. This is the 99% + // case, so we want it to be fast and don't care about impacting the speed + // when we do find whitespace. 
+ int found_whitespace = false; + for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) + continue; + found_whitespace = true; + break; + } + + if (!found_whitespace) { + // Didn't find any whitespace, we don't need to do anything. We can just + // return the input as the output. + *output_len = input_len; + return input; + } + + // Remove the whitespace into the new buffer and return it. + for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) + buffer->push_back(input[i]); + } + *output_len = buffer->length(); + return buffer->data(); +} + +// Contains the canonical version of each possible input letter in the scheme +// (basically, lower-cased). The corresponding entry will be 0 if the letter +// is not allowed in a scheme. +const char kSchemeCanonical[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , +// @ A B C D E F G H I J K L M N O + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, +// ` a b c d e f g h i j k l m n o + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; + +// This could be a table lookup as well by setting the high bit for each +// valid character, but it's only called once per URL, and it makes the lookup +// table easier to read not having extra stuff in it. 
+inline bool IsSchemeFirstChar(unsigned char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +template<typename CHAR, typename UCHAR> +bool DoScheme(const CHAR* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme) { + if (scheme.len <= 0) { + // Scheme is unspecified or empty, convert to empty by appending a colon. + *out_scheme = url_parse::Component(output->length(), 0); + output->push_back(':'); + return true; + } + + // The output scheme starts from the current position. + out_scheme->begin = output->length(); + + bool success = true; + int end = scheme.end(); + for (int i = scheme.begin; i < end; i++) { + UCHAR ch = static_cast<UCHAR>(spec[i]); + char replacement = 0; + if (ch < 0x80) { + if (i == scheme.begin) { + // Need to do a special check for the first letter of the scheme. + if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) + replacement = kSchemeCanonical[ch]; + } else { + replacement = kSchemeCanonical[ch]; + } + } + + if (replacement) { + output->push_back(replacement); + } else if (ch == '%') { + // Canonicalizing the scheme multiple times should lead to the same + // result. Since invalid characters will be escaped, we need to preserve + // the percent to avoid multiple escaping. The scheme will be invalid. + success = false; + output->push_back('%'); + } else { + // Invalid character, store it but mark this scheme as invalid. + success = false; + + // This will escape the output and also handle encoding issues. + // Ignore the return value since we already failed. + AppendUTF8EscapedChar(spec, &i, end, output); + } + } + + // The output scheme ends with the current position, before appending + // the colon. + out_scheme->len = output->length() - out_scheme->begin; + output->push_back(':'); + return success; +} + +// The username and password components reference ranges in the corresponding +// *_spec strings. 
Typically, these specs will be the same (we're +// canonicalizing a single source string), but may be different when +// replacing components. +template<typename CHAR, typename UCHAR> +bool DoUserInfo(const CHAR* username_spec, + const url_parse::Component& username, + const CHAR* password_spec, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password) { + if (username.len <= 0 && password.len <= 0) { + // Common case: no user info. We strip empty username/passwords. + *out_username = url_parse::Component(); + *out_password = url_parse::Component(); + return true; + } + + // Write the username. + out_username->begin = output->length(); + if (username.len > 0) { + // This will escape characters not valid for the username. + AppendStringOfType(&username_spec[username.begin], username.len, + CHAR_USERINFO, output); + } + out_username->len = output->length() - out_username->begin; + + // When there is a password, we need the separator. Note that we strip + // empty but specified passwords. + if (password.len > 0) { + output->push_back(':'); + out_password->begin = output->length(); + AppendStringOfType(&password_spec[password.begin], password.len, + CHAR_USERINFO, output); + out_password->len = output->length() - out_password->begin; + } else { + *out_password = url_parse::Component(); + } + + output->push_back('@'); + return true; +} + +// Helper functions for converting port integers to strings. +inline void WritePortInt(char* output, int output_len, int port) { + _itoa_s(port, output, output_len, 10); +} +inline void WritePortInt(char16* output, int output_len, int port) { + _itow_s(port, output, output_len, 10); +} + +// This function will prepend the colon if there will be a port. 
+template<typename CHAR, typename UCHAR> +bool DoPort(const CHAR* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port) { + int port_num = url_parse::ParsePort(spec, port); + if (port_num == url_parse::PORT_UNSPECIFIED || + port_num == default_port_for_scheme) { + *out_port = url_parse::Component(); + return true; // Leave port empty. + } + + if (port_num == url_parse::PORT_INVALID) { + // Invalid port: We'll copy the text from the input so the user can see + // what the error was, and mark the URL as invalid by returning false. + output->push_back(':'); + out_port->begin = output->length(); + AppendInvalidNarrowString(spec, port.begin, port.end(), output); + out_port->len = output->length() - out_port->begin; + return false; + } + + // Convert port number back to a string. Max port value is 5 digits, and + // the Parsed::ExtractPort will have made sure the integer is in range. + const int buf_size = 6; + char buf[buf_size]; + WritePortInt(buf, buf_size, port_num); + + // Append the port number to the output, preceded by a colon. + output->push_back(':'); + out_port->begin = output->length(); + for (int i = 0; i < buf_size && buf[i]; i++) + output->push_back(buf[i]); + + out_port->len = output->length() - out_port->begin; + return true; +} + +template<typename CHAR, typename UCHAR> +void DoCanonicalizeRef(const CHAR* spec, + const url_parse::Component& ref, + CanonOutput* output, + url_parse::Component* out_ref) { + if (ref.len < 0) { + // Common case of no ref. + *out_ref = url_parse::Component(); + return; + } + + // Append the ref separator. Note that we need to do this even when the ref + // is empty but present. + output->push_back('#'); + out_ref->begin = output->length(); + + // Now iterate through all the characters, converting to UTF-8 and validating. + int end = ref.end(); + for (int i = ref.begin; i < end; i++) { + if (spec[i] == 0) { + // IE just strips NULLs, so we do too. 
+ continue; + } else if (static_cast<UCHAR>(spec[i]) < 0x20) { + // Unlike what IE seems to do, we escape control characters. This will probably + // make the reference fragment unusable on a web page, but people + // shouldn't be using control characters in their anchor names. + AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); + } else if (static_cast<UCHAR>(spec[i]) < 0x80) { + // Normal ASCII characters are just appended. + output->push_back(static_cast<char>(spec[i])); + } else { + // Non-ASCII characters are appended unescaped, but only when they are + // valid. Invalid Unicode characters are replaced with the "invalid + // character" as IE seems to. + unsigned code_point; + if (!ReadUTFChar(spec, &i, end, &code_point)) + AppendUTF8Value(kUnicodeReplacementCharacter, output); + else + AppendUTF8Value(code_point, output); + } + } + + out_ref->len = output->length() - out_ref->begin; +} + +} // namespace + +const char* RemoveURLWhitespace(const char* input, int input_len, + CanonOutputT<char>* buffer, + int* output_len) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len); +} + +const char16* RemoveURLWhitespace(const char16* input, int input_len, + CanonOutputT<char16>* buffer, + int* output_len) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len); +} + +char CanonicalSchemeChar(char16 ch) { + if (ch >= 0x80) + return 0; // Non-ASCII is not supported by schemes. 
+ return kSchemeCanonical[ch]; +} + +bool CanonicalizeScheme(const char* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme) { + return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); +} + +bool CanonicalizeScheme(const char16* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme) { + return DoScheme<char16, char16>(spec, scheme, output, out_scheme); +} + +bool CanonicalizeUserInfo(const char* username_source, + const url_parse::Component& username, + const char* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password) { + return DoUserInfo<char, unsigned char>( + username_source, username, password_source, password, + output, out_username, out_password); +} + +bool CanonicalizeUserInfo(const char16* username_source, + const url_parse::Component& username, + const char16* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password) { + return DoUserInfo<char16, char16>( + username_source, username, password_source, password, + output, out_username, out_password); +} + +bool CanonicalizePort(const char* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port) { + return DoPort<char, unsigned char>(spec, port, + default_port_for_scheme, + output, out_port); +} + +bool CanonicalizePort(const char16* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port) { + return DoPort<char16, char16>(spec, port, default_port_for_scheme, + output, out_port); +} + +void CanonicalizeRef(const char* spec, + const url_parse::Component& ref, + CanonOutput* output, + url_parse::Component* out_ref) { + DoCanonicalizeRef<char, unsigned 
char>(spec, ref, output, out_ref); +} + +void CanonicalizeRef(const char16* spec, + const url_parse::Component& ref, + CanonOutput* output, + url_parse::Component* out_ref) { + DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_fileurl.cc b/googleurl/src/url_canon_fileurl.cc new file mode 100644 index 0000000..97023eb --- /dev/null +++ b/googleurl/src/url_canon_fileurl.cc @@ -0,0 +1,215 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions for canonicalizing "file:" URLs. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + +#ifdef WIN32 + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not). +template<typename CHAR> +int FileDoDriveSpec(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo, + // (with backslashes instead of slashes as well). + int num_slashes = url_parse::CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + if (!url_parse::DoesBeginWindowsDriveSpec(spec, after_slashes, end)) + return begin; // Haven't consumed any characters + + // A drive spec is the start of a path, so we need to add a slash for the + // authority terminator (typically the third slash). + output->push_back('/'); + + // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid + // and that it is followed by a colon/pipe. 
+ + // Normalize Windows drive letters to uppercase + if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z') + output->push_back(spec[after_slashes] - 'a' + 'A'); + else + output->push_back(static_cast<char>(spec[after_slashes])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + return after_slashes + 2; +} + +#endif // WIN32 + +template<typename CHAR, typename UCHAR> +bool DoFileCanonicalizePath(const CHAR* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + // Copies and normalizes the "c:" at the beginning, if present. + out_path->begin = output->length(); + int after_drive; +#ifdef WIN32 + after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output); +#else + after_drive = path.begin; +#endif + + // Copies the rest of the path, starting from the slash following the + // drive colon (if any, Windows only), or the first slash of the path. + bool success = true; + if (after_drive < path.end()) { + // Use the regular path canonicalizer to canonicalize the rest of the + // path. Give it a fake output component to write into. DoCanonicalizeFile + // will compute the full path component. + url_parse::Component sub_path = + url_parse::MakeRange(after_drive, path.end()); + url_parse::Component fake_output_path; + success = CanonicalizePath(spec, sub_path, output, &fake_output_path); + } else { + // No input path, canonicalize to a slash. + output->push_back('/'); + } + + out_path->len = output->length() - out_path->begin; + return success; +} + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + // Things we don't set in file: URLs. 
+ new_parsed->username = url_parse::Component(); + new_parsed->password = url_parse::Component(); + new_parsed->port = url_parse::Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->Append("file://", 7); + new_parsed->scheme.len = 4; + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. + bool success = CanonicalizeHost(source.host, parsed.host, + output, &new_parsed->host); + success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path, + output, &new_parsed->path); + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + + // Ignore failure for refs since the URL can probably still be loaded. 
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace + +bool CanonicalizeFileURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeFileURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, query_converter, + output, new_parsed); +} + +bool CanonicalizeFileURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeFileURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, query_converter, + output, new_parsed); +} + +bool FileCanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + return DoFileCanonicalizePath<char, unsigned char>(spec, path, + output, out_path); +} + +bool FileCanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + return DoFileCanonicalizePath<char16, char16>(spec, path, + output, out_path); +} + +bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeFileURL<char, unsigned char>( + source, parsed, query_converter, output, new_parsed); +} + +bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + 
url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeFileURL<char, unsigned char>( + source, parsed, query_converter, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_host.cc b/googleurl/src/url_canon_host.cc new file mode 100644 index 0000000..6642004 --- /dev/null +++ b/googleurl/src/url_canon_host.cc @@ -0,0 +1,401 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/logging.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +// For reference, here's what IE supports: +// Key: 0 (disallowed: failure if present in the input) +// + (allowed either escaped or unescaped, and unmodified) +// U (allowed escaped or unescaped but always unescaped if present in +// escaped form) +// E (allowed escaped or unescaped but always escaped if present in +// unescaped form) +// % (only allowed escaped in the input, will be unmodified). +// I left blank alpha numeric characters. +// +// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f +// ----------------------------------------------- +// 0 0 E E E E E E E E E E E E E E E +// 1 E E E E E E E E E E E E E E E E +// 2 E + E E + E + + + + + + + U U 0 +// 3 % % E + E 0 <-- Those are : ; < = > ? +// 4 % +// 5 U 0 U U U <-- Those are [ \ ] ^ _ +// 6 E <-- That's ` +// 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) +// +// NOTE: I didn't actually test all the control characters. Some may be +// disallowed in the input, but they are all accepted escaped except for 0. +// I also didn't test if characters affecting HTML parsing are allowed +// unescaped, eg. (") or (#), which would indicate the beginning of the path. +// Surprisingly, space is accepted in the input and always escaped. 
+ +// This table lists the canonical version of all characters we allow in the +// input, with 0 indicating it is disallowed. We use the magic kEsc +// value to indicate that this character should be escaped. We are a little more +// restrictive than IE, but less restrictive than Firefox. +// +// Note that we disallow the % character. We will allow it when part of an +// escape sequence, of course, but this disallows "%25". Even though IE allows +// it, allowing it would put us in a funny state. If there was an invalid +// escape sequence like "%zz", we'll add "%25zz" to the output and fail. +// Allowing percents means we'll succeed a second time, so validity would change +// based on how many times you run the canonicalizer. We prefer to always report +// the same validity, so reject this. +const unsigned char kEsc = 0xff; +const unsigned char kHostCharLookup[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 , +// @ A B C D E F G H I J K L M N O + kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_', +// ` a b c d e f g h i j k l m n o + kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 }; + +const int kTempHostBufferLen = 1024; +typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer; +typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW; + +// Scans a host name and fills in the output flags according to what we find. 
+// |has_non_ascii| will be true if there are any non-7-bit characters, and +// |has_escaped| will be true if there is a percent sign. +template<typename CHAR, typename UCHAR> +void ScanHostname(const CHAR* spec, const url_parse::Component& host, + bool* has_non_ascii, bool* has_escaped) { + int end = host.end(); + *has_non_ascii = false; + *has_escaped = false; + for (int i = host.begin; i < end; i++) { + if (static_cast<UCHAR>(spec[i]) >= 0x80) + *has_non_ascii = true; + else if (spec[i] == '%') + *has_escaped = true; + } +} + +// Canonicalizes a host name that is entirely 8-bit characters (even though +// the type holding them may be 16 bits. Escaped characters will be unescaped. +// Non-7-bit characters (for example, UTF-8) will be passed unchanged. +// +// The |*has_non_ascii| flag will be true if there are non-7-bit characters in +// the output. +// +// This function is used in two situations: +// +// * When the caller knows there is no non-ASCII or percent escaped +// characters. This is what DoHost does. The result will be a completely +// canonicalized host since we know nothing weird can happen (escaped +// characters could be unescaped to non-7-bit, so they have to be treated +// with suspicion at this point). It does not use the |has_non_ascii| flag. +// +// * When the caller has an 8-bit string that may need unescaping. +// DoComplexHost calls us this situation to do unescaping and validation. +// After this, it may do other IDN operations depending on the value of the +// |*has_non_ascii| flag. +// +// The return value indicates if the output is a potentially valid host name. +template<typename INCHAR, typename OUTCHAR> +bool DoSimpleHost(const INCHAR* host, + int host_len, + CanonOutputT<OUTCHAR>* output, + bool* has_non_ascii) { + *has_non_ascii = false; + + bool success = true; + for (int i = 0; i < host_len; ++i) { + unsigned int source = host[i]; + if (source == '%') { + // Unescape first, if possible. 
+ // Source will be used only if decode operation was successful. + if (!DecodeEscaped(host, &i, host_len, + reinterpret_cast<unsigned char*>(&source))) { + // Invalid escaped character. There is nothing that can make this + // host valid. We append an escaped percent so the URL looks reasonable + // and mark as failed. + AppendEscapedChar('%', output); + success = false; + continue; + } + } + + if (source < 0x80) { + // We have ASCII input, we can use our lookup table. + unsigned char replacement = kHostCharLookup[source]; + if (!replacement) { + // Invalid character, add it as percent-escaped and mark as failed. + AppendEscapedChar(source, output); + success = false; + } else if (replacement == kEsc) { + // This character is valid but should be escaped. + AppendEscapedChar(source, output); + } else { + // Common case, the given character is valid in a hostname, the lookup + // table tells us the canonical representation of that character (lower + // cased). + output->push_back(replacement); + } + } else { + // It's a non-ascii char. Just push it to the output. + // In case where we have char16 input, and char output it's safe to + // cast char16->char only if input string was converted to ASCII. + output->push_back(static_cast<OUTCHAR>(source)); + *has_non_ascii = true; + } + } + + return success; +} + +// Canonicalizes a host that requires IDN conversion. Returns true on success +bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) { + // We need to escape URL before doing IDN conversion, since punycode strings + // cannot be escaped after they are created. + RawCanonOutputW<kTempHostBufferLen> url_escaped_host; + bool has_non_ascii; + DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii); + + StackBufferW wide_output; + if (!IDNToASCII(url_escaped_host.data(), + url_escaped_host.length(), + &wide_output)) { + // Some error, give up. This will write some reasonable looking + // representation of the string to the output. 
+ AppendInvalidNarrowString(src, 0, src_len, output); + return false; + } + + // Now we check the ASCII output like a normal host. It will also handle + // unescaping. Although we unescaped everything before this function call, if + // somebody does %00 as fullwidth, ICU will convert this to ASCII. + bool success = DoSimpleHost(wide_output.data(), + wide_output.length(), + output, &has_non_ascii); + DCHECK(!has_non_ascii); + return success; +} + +// 8-bit convert host to its ASCII version: this converts the UTF-8 input to +// UTF-16. The has_escaped flag should be set if the input string requires +// unescaping. +bool DoComplexHost(const char* host, int host_len, + bool has_non_ascii, bool has_escaped, CanonOutput* output) { + // Save the current position in the output. We may write stuff and rewind it + // below, so we need to know where to rewind to. + int begin_length = output->length(); + + // Points to the UTF-8 data we want to convert. This will either be the + // input or the unescaped version written to |*output| if necessary. + const char* utf8_source; + int utf8_source_len; + if (has_escaped) { + // Unescape before converting to UTF-16 for IDN. We write this into the + // output because it most likely does not require IDNization, and we can + // save another huge stack buffer. It will be replaced below if it requires + // IDN. This will also update our non-ASCII flag so we know whether the + // unescaped input requires IDN. + if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { + // Error with some escape sequence. We'll call the current output + // complete. DoSimpleHost will have written some "reasonable" output. + return false; + } + + // Unescaping may have left us with ASCII input, in which case the + // unescaped version we wrote to output is complete. + if (!has_non_ascii) { + return true; + } + + // Save the pointer into the data was just converted (it may be appended to + // other data in the output buffer). 
+ utf8_source = &output->data()[begin_length]; + utf8_source_len = output->length() - begin_length; + } else { + // We don't need to unescape, use input for IDNization later. (We know the + // input has non-ASCII, or the simple version would have been called + // instead of us.) + utf8_source = host; + utf8_source_len = host_len; + } + + // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion. + // Above, we may have used the output to write the unescaped values to, so + // we have to rewind it to where we started after we convert it to UTF-16. + StackBufferW utf16; + if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) { + // In this error case, the input may or may not be the output. + StackBuffer utf8; + for (int i = 0; i < utf8_source_len; i++) + utf8.push_back(utf8_source[i]); + output->set_length(begin_length); + AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output); + return false; + } + output->set_length(begin_length); + + // This will call DoSimpleHost which will do normal ASCII canonicalization + // and also check for IP addresses in the output. + return DoIDNHost(utf16.data(), utf16.length(), output); +} + +// UTF-16 convert host to its ASCII version. The set up is already ready for +// the backend, so we just pass through. The has_escaped flag should be set if +// the input string requires unescaping. +bool DoComplexHost(const char16* host, int host_len, + bool has_non_ascii, bool has_escaped, CanonOutput* output) { + if (has_escaped) { + // Yikes, we have escaped characters with wide input. The escaped + // characters should be interpreted as UTF-8. To solve this problem, + // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN. + // + // We don't bother to optimize the conversion in the ASCII case (which + // *could* just be a copy) and use the UTF-8 path, because it should be + // very rare that host names have escaped characters, and it is relatively + // fast to do the conversion anyway. 
+ StackBuffer utf8; + if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) { + AppendInvalidNarrowString(host, 0, host_len, output); + return false; + } + + // Once we convert to UTF-8, we can use the 8-bit version of the complex + // host handling code above. + return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, + has_escaped, output); + } + + // No unescaping necessary, we can safely pass the input to ICU. This + // function will only get called if we either have escaped or non-ascii + // input, so it's safe to just use ICU now. Even if the input is ASCII, + // this function will do the right thing (just slower than we could). + return DoIDNHost(host, host_len, output); +} + +template<typename CHAR, typename UCHAR> +void DoHost(const CHAR* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (host.len <= 0) { + // Empty hosts don't need anything. + host_info->family = CanonHostInfo::NEUTRAL; + host_info->out_host = url_parse::Component(); + return; + } + + bool has_non_ascii, has_escaped; + ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); + + // Keep track of output's initial length, so we can rewind later. + const int output_begin = output->length(); + + bool success; + if (!has_non_ascii && !has_escaped) { + success = DoSimpleHost(&spec[host.begin], host.len, + output, &has_non_ascii); + DCHECK(!has_non_ascii); + } else { + success = DoComplexHost(&spec[host.begin], host.len, + has_non_ascii, has_escaped, output); + } + + if (!success) { + // Canonicalization failed. Set BROKEN to notify the caller. + host_info->family = CanonHostInfo::BROKEN; + } else { + // After all the other canonicalization, check if we ended up with an IP + // address. IP addresses are small, so writing into this temporary buffer + // should not cause an allocation. 
+ RawCanonOutput<64> canon_ip; + CanonicalizeIPAddress(output->data(), + url_parse::MakeRange(output_begin, output->length()), + &canon_ip, host_info); + + // If we got an IPv4/IPv6 address, copy the canonical form back to the + // real buffer. Otherwise, it's a hostname or broken IP, in which case + // we just leave it in place. + if (host_info->IsIPAddress()) { + output->set_length(output_begin); + output->Append(canon_ip.data(), canon_ip.length()); + } + } + + host_info->out_host = url_parse::MakeRange(output_begin, output->length()); +} + +} // namespace + +bool CanonicalizeHost(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host) { + CanonHostInfo host_info; + DoHost<char, unsigned char>(spec, host, output, &host_info); + *out_host = host_info.out_host; + return (host_info.family != CanonHostInfo::BROKEN); +} + +bool CanonicalizeHost(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + url_parse::Component* out_host) { + CanonHostInfo host_info; + DoHost<char16, char16>(spec, host, output, &host_info); + *out_host = host_info.out_host; + return (host_info.family != CanonHostInfo::BROKEN); +} + +void CanonicalizeHostVerbose(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo *host_info) { + DoHost<char, unsigned char>(spec, host, output, host_info); +} + +void CanonicalizeHostVerbose(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo *host_info) { + DoHost<char16, char16>(spec, host, output, host_info); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_icu.cc b/googleurl/src/url_canon_icu.cc new file mode 100644 index 0000000..b06808c --- /dev/null +++ b/googleurl/src/url_canon_icu.cc @@ -0,0 +1,207 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// ICU integration functions. 
+ +#include <stdlib.h> +#include <string.h> +#include <unicode/ucnv.h> +#include <unicode/ucnv_cb.h> +#include <unicode/uidna.h> + +#include "googleurl/src/url_canon_icu.h" +#include "googleurl/src/url_canon_internal.h" // for _itoa_s + +#include "base/logging.h" + +namespace url_canon { + +namespace { + +// Called when converting a character that can not be represented, this will +// append an escaped version of the numerical character reference for that code +// point. It is of the form "&#1234;" and we will escape the non-digits to +// "%26%231234%3B". Why? This is what Netscape did back in the olden days. +void appendURLEscapedChar(const void* context, + UConverterFromUnicodeArgs* from_args, + const UChar* code_units, + int32_t length, + UChar32 code_point, + UConverterCallbackReason reason, + UErrorCode* err) { + if (reason == UCNV_UNASSIGNED) { + *err = U_ZERO_ERROR; + + const static int prefix_len = 6; + const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped + ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err); + + DCHECK(code_point < 0x110000); + char number[8]; // Max Unicode code point is 7 digits. + _itoa_s(code_point, number, 10); + int number_len = static_cast<int>(strlen(number)); + ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err); + + const static int postfix_len = 3; + const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped + ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err); + } +} + +// A class for scoping the installation of the invalid character callback. +class AppendHandlerInstaller { + public: + // The owner of this object must ensure that the converter is alive for the + // duration of this object's lifetime. 
+ AppendHandlerInstaller(UConverter* converter) : converter_(converter) { + UErrorCode err = U_ZERO_ERROR; + ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0, + &old_callback_, &old_context_, &err); + } + + ~AppendHandlerInstaller() { + UErrorCode err = U_ZERO_ERROR; + ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err); + } + + private: + UConverter* converter_; + + UConverterFromUCallback old_callback_; + const void* old_context_; +}; + +} // namespace + +ICUCharsetConverter::ICUCharsetConverter(UConverter* converter) + : converter_(converter) { +} + +void ICUCharsetConverter::ConvertFromUTF16(const char16* input, + int input_len, + CanonOutput* output) { + // Install our error handler. It will be called for character that can not + // be represented in the destination character set. + AppendHandlerInstaller handler(converter_); + + int begin_offset = output->length(); + int dest_capacity = output->capacity() - begin_offset; + output->set_length(output->length()); + + do { + UErrorCode err = U_ZERO_ERROR; + char* dest = &output->data()[begin_offset]; + int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity, + input, input_len, &err); + if (err != U_BUFFER_OVERFLOW_ERROR) { + output->set_length(begin_offset + required_capacity); + return; + } + + // Output didn't fit, expand + dest_capacity = required_capacity; + output->Resize(begin_offset + dest_capacity); + } while (true); +} + +// Converts the Unicode input representing a hostname to ASCII using IDN rules. +// The output must be ASCII, but is represented as wide characters. +// +// On success, the output will be filled with the ASCII host name and it will +// return true. Unlike most other canonicalization functions, this assumes that +// the output is empty. The beginning of the host will be at offset 0, and +// the length of the output will be set to the length of the new host name. +// +// On error, this will return false. 
The output in this case is undefined. +bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) { + DCHECK(output->length() == 0); // Output buffer is assumed empty. + while (true) { + // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate + // the spec (which do exist). This does not present any risk and is a + // little more future proof. + UErrorCode err = U_ZERO_ERROR; + int num_converted = uidna_IDNToASCII(src, src_len, output->data(), + output->capacity(), + UIDNA_ALLOW_UNASSIGNED, NULL, &err); + if (err == U_ZERO_ERROR) { + output->set_length(num_converted); + return true; + } + if (err != U_BUFFER_OVERFLOW_ERROR) + return false; // Unknown error, give up. + + // Not enough room in our buffer, expand. + output->Resize(output->capacity() * 2); + } +} + +bool ReadUTFChar(const char* str, int* begin, int length, + unsigned* code_point_out) { + int code_point; // Avoids warning when U8_NEXT writes -1 to it. + U8_NEXT(str, *begin, length, code_point); + *code_point_out = static_cast<unsigned>(code_point); + + // The ICU macro above moves to the next char, we want to point to the last + // char consumed. + (*begin)--; + + // Validate the decoded value. + if (U_IS_UNICODE_CHAR(code_point)) + return true; + *code_point_out = kUnicodeReplacementCharacter; + return false; +} + +bool ReadUTFChar(const char16* str, int* begin, int length, + unsigned* code_point) { + if (U16_IS_SURROGATE(str[*begin])) { + if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || + !U16_IS_TRAIL(str[*begin + 1])) { + // Invalid surrogate pair. + *code_point = kUnicodeReplacementCharacter; + return false; + } else { + // Valid surrogate pair. + *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]); + (*begin)++; + } + } else { + // Not a surrogate, just one 16-bit word. + *code_point = str[*begin]; + } + + if (U_IS_UNICODE_CHAR(*code_point)) + return true; + + // Invalid code point. 
+ *code_point = kUnicodeReplacementCharacter; + return false; +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_icu.h b/googleurl/src/url_canon_icu.h new file mode 100644 index 0000000..3980663 --- /dev/null +++ b/googleurl/src/url_canon_icu.h @@ -0,0 +1,63 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// ICU integration functions. 
+ +#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__ +#define GOOGLEURL_SRC_URL_CANON_ICU_H__ + +#include "googleurl/src/url_canon.h" + +typedef struct UConverter UConverter; + +namespace url_canon { + +// An implementation of CharsetConverter that implementations can use to +// interface the canonicalizer with ICU's conversion routines. +class ICUCharsetConverter : public CharsetConverter { + public: + // Constructs a converter using an already-existing ICU character set + // converter. This converter is NOT owned by this object; the lifetime must + // be managed by the creator such that it is alive as long as this is. + ICUCharsetConverter(UConverter* converter); + + virtual ~ICUCharsetConverter() {} + + virtual void ConvertFromUTF16(const char16* input, + int input_len, + CanonOutput* output); + + private: + // The ICU converter, not owned by this class. + UConverter* converter_; +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_ICU_H__ diff --git a/googleurl/src/url_canon_internal.cc b/googleurl/src/url_canon_internal.cc new file mode 100644 index 0000000..6b776bc --- /dev/null +++ b/googleurl/src/url_canon_internal.cc @@ -0,0 +1,427 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <cstdio> +#include <errno.h> +#include <stdlib.h> +#include <string> + +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +template<typename CHAR, typename UCHAR> +void DoAppendStringOfType(const CHAR* source, int length, + SharedCharTypes type, + CanonOutput* output) { + for (int i = 0; i < length; i++) { + if (static_cast<UCHAR>(source[i]) >= 0x80) { + // ReadChar will fill the code point with kUnicodeReplacementCharacter + // when the input is invalid, which is what we want. + unsigned code_point; + ReadUTFChar(source, &i, length, &code_point); + AppendUTF8EscapedValue(code_point, output); + } else { + // Just append the 7-bit character, possibly escaping it. + unsigned char uch = static_cast<unsigned char>(source[i]); + if (!IsCharOfType(uch, type)) + AppendEscapedChar(uch, output); + else + output->push_back(uch); + } + } +} + +// This function assumes the input values are all contained in 8-bit, +// although it allows any type. Returns true if input is valid, false if not. 
+template<typename CHAR, typename UCHAR> +void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end, + CanonOutput* output) { + for (int i = begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(spec[i]); + if (uch >= 0x80) { + // Handle UTF-8/16 encodings. This call will correctly handle the error + // case by appending the invalid character. + AppendUTF8EscapedChar(spec, &i, end, output); + } else if (uch <= ' ' || uch == 0x7f) { + // This function is for error handling, so we escape all control + // characters and spaces, but not anything else since we lack + // context to do something more specific. + AppendEscapedChar(static_cast<unsigned char>(uch), output); + } else { + output->push_back(static_cast<char>(uch)); + } + } +} + +// Overrides one component, see the url_canon::Replacements structure for +// what the various combionations of source pointer and component mean. +void DoOverrideComponent(const char* override_source, + const url_parse::Component& override_component, + const char** dest, + url_parse::Component* dest_component) { + if (override_source) { + *dest = override_source; + *dest_component = override_component; + } +} + +// Similar to DoOverrideComponent except that it takes a UTF-16 input and does +// not actually set the output character pointer. +// +// The input is converted to UTF-8 at the end of the given buffer as a temporary +// holding place. The component indentifying the portion of the buffer used in +// the |utf8_buffer| will be specified in |*dest_component|. +// +// This will not actually set any |dest| pointer like DoOverrideComponent +// does because all of the pointers will point into the |utf8_buffer|, which +// may get resized while we're overriding a subsequent component. Instead, the +// caller should use the beginning of the |utf8_buffer| as the string pointer +// for all components once all overrides have been prepared. 
+bool PrepareUTF16OverrideComponent( + const char16* override_source, + const url_parse::Component& override_component, + CanonOutput* utf8_buffer, + url_parse::Component* dest_component) { + bool success = true; + if (override_source) { + if (!override_component.is_valid()) { + // Non-"valid" component (means delete), so we need to preserve that. + *dest_component = url_parse::Component(); + } else { + // Convert to UTF-8. + dest_component->begin = utf8_buffer->length(); + success = ConvertUTF16ToUTF8(&override_source[override_component.begin], + override_component.len, utf8_buffer); + dest_component->len = utf8_buffer->length() - dest_component->begin; + } + } + return success; +} + +} // namespace + +// See the header file for this array's declaration. +const unsigned char kSharedCharTypeTable[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f + 0, // 0x20 ' ' (escape spaces in queries) + CHAR_QUERY | CHAR_USERINFO, // 0x21 ! + 0, // 0x22 " + 0, // 0x23 # (invalid in query since it marks the ref) + CHAR_QUERY | CHAR_USERINFO, // 0x24 $ + CHAR_QUERY | CHAR_USERINFO, // 0x25 % + CHAR_QUERY | CHAR_USERINFO, // 0x26 & + CHAR_QUERY | CHAR_USERINFO, // 0x27 ' + CHAR_QUERY | CHAR_USERINFO, // 0x28 ( + CHAR_QUERY | CHAR_USERINFO, // 0x29 ) + CHAR_QUERY | CHAR_USERINFO, // 0x2a * + CHAR_QUERY | CHAR_USERINFO, // 0x2b + + CHAR_QUERY | CHAR_USERINFO, // 0x2c , + CHAR_QUERY | CHAR_USERINFO, // 0x2d - + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x2e . 
+ CHAR_QUERY, // 0x2f / + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x30 0 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x31 1 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x32 2 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x33 3 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x34 4 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x35 5 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x36 6 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x37 7 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC, // 0x38 8 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC, // 0x39 9 + CHAR_QUERY, // 0x3a : + CHAR_QUERY, // 0x3b ; + 0, // 0x3c < (Try to prevent certain types of XSS.) + CHAR_QUERY, // 0x3d = + 0, // 0x3e > (Try to prevent certain types of XSS.) + CHAR_QUERY, // 0x3f ? 
+ CHAR_QUERY, // 0x40 @ + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x41 A + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x42 B + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x43 C + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x44 D + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x45 E + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x46 F + CHAR_QUERY | CHAR_USERINFO, // 0x47 G + CHAR_QUERY | CHAR_USERINFO, // 0x48 H + CHAR_QUERY | CHAR_USERINFO, // 0x49 I + CHAR_QUERY | CHAR_USERINFO, // 0x4a J + CHAR_QUERY | CHAR_USERINFO, // 0x4b K + CHAR_QUERY | CHAR_USERINFO, // 0x4c L + CHAR_QUERY | CHAR_USERINFO, // 0x4d M + CHAR_QUERY | CHAR_USERINFO, // 0x4e N + CHAR_QUERY | CHAR_USERINFO, // 0x4f O + CHAR_QUERY | CHAR_USERINFO, // 0x50 P + CHAR_QUERY | CHAR_USERINFO, // 0x51 Q + CHAR_QUERY | CHAR_USERINFO, // 0x52 R + CHAR_QUERY | CHAR_USERINFO, // 0x53 S + CHAR_QUERY | CHAR_USERINFO, // 0x54 T + CHAR_QUERY | CHAR_USERINFO, // 0x55 U + CHAR_QUERY | CHAR_USERINFO, // 0x56 V + CHAR_QUERY | CHAR_USERINFO, // 0x57 W + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x58 X + CHAR_QUERY | CHAR_USERINFO, // 0x59 Y + CHAR_QUERY | CHAR_USERINFO, // 0x5a Z + CHAR_QUERY, // 0x5b [ + CHAR_QUERY, // 0x5c '\' + CHAR_QUERY, // 0x5d ] + CHAR_QUERY, // 0x5e ^ + CHAR_QUERY | CHAR_USERINFO, // 0x5f _ + CHAR_QUERY, // 0x60 ` + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x61 a + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x62 b + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x63 c + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x64 d + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x65 e + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x66 f + CHAR_QUERY | CHAR_USERINFO, // 0x67 g + CHAR_QUERY | CHAR_USERINFO, // 0x68 h + CHAR_QUERY | CHAR_USERINFO, // 0x69 i + CHAR_QUERY | CHAR_USERINFO, // 0x6a j + CHAR_QUERY | CHAR_USERINFO, // 0x6b k + CHAR_QUERY | CHAR_USERINFO, // 0x6c l + 
CHAR_QUERY | CHAR_USERINFO, // 0x6d m + CHAR_QUERY | CHAR_USERINFO, // 0x6e n + CHAR_QUERY | CHAR_USERINFO, // 0x6f o + CHAR_QUERY | CHAR_USERINFO, // 0x70 p + CHAR_QUERY | CHAR_USERINFO, // 0x71 q + CHAR_QUERY | CHAR_USERINFO, // 0x72 r + CHAR_QUERY | CHAR_USERINFO, // 0x73 s + CHAR_QUERY | CHAR_USERINFO, // 0x74 t + CHAR_QUERY | CHAR_USERINFO, // 0x75 u + CHAR_QUERY | CHAR_USERINFO, // 0x76 v + CHAR_QUERY | CHAR_USERINFO, // 0x77 w + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x78 x + CHAR_QUERY | CHAR_USERINFO, // 0x79 y + CHAR_QUERY | CHAR_USERINFO, // 0x7a z + CHAR_QUERY, // 0x7b { + CHAR_QUERY, // 0x7c | + CHAR_QUERY, // 0x7d } + CHAR_QUERY | CHAR_USERINFO, // 0x7e ~ + 0, // 0x7f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 - 0xbf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff +}; + +const char kHexCharLookup[0x10] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', +}; + +const char kCharToHexLookup[8] = { + 0, // 0x00 - 0x1f + '0', // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39 + 'A' - 10, // 0x40 - 0x5f: letters A - F are 0x41 - 0x46 + 'a' - 10, // 0x60 - 0x7f: letters a - f are 0x61 - 0x66 + 0, // 0x80 - 0x9F + 0, // 0xA0 - 0xBF + 0, // 0xC0 - 0xDF + 0, // 0xE0 - 0xFF +}; + +const char16 kUnicodeReplacementCharacter = 0xfffd; + +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output) { + DoAppendStringOfType<char, unsigned char>(source, length, type, output); +} + +void AppendStringOfType(const char16* source, int length, + SharedCharTypes type, + CanonOutput* 
output) { + DoAppendStringOfType<char16, char16>(source, length, type, output); +} + +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output) { + DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output); +} + +void AppendInvalidNarrowString(const char16* spec, int begin, int end, + CanonOutput* output) { + DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output); +} + +bool ConvertUTF16ToUTF8(const char16* input, int input_len, + CanonOutput* output) { + bool success = true; + for (int i = 0; i < input_len; i++) { + unsigned code_point; + success &= ReadUTFChar(input, &i, input_len, &code_point); + AppendUTF8Value(code_point, output); + } + return success; +} + +bool ConvertUTF8ToUTF16(const char* input, int input_len, + CanonOutputT<char16>* output) { + bool success = true; + for (int i = 0; i < input_len; i++) { + unsigned code_point; + success &= ReadUTFChar(input, &i, input_len, &code_point); + AppendUTF16Value(code_point, output); + } + return success; +} + +void SetupOverrideComponents(const char* base, + const Replacements<char>& repl, + URLComponentSource<char>* source, + url_parse::Parsed* parsed) { + // Get the source and parsed structures of the things we are replacing. + const URLComponentSource<char>& repl_source = repl.sources(); + const url_parse::Parsed& repl_parsed = repl.components(); + + DoOverrideComponent(repl_source.scheme, repl_parsed.scheme, + &source->scheme, &parsed->scheme); + DoOverrideComponent(repl_source.username, repl_parsed.username, + &source->username, &parsed->username); + DoOverrideComponent(repl_source.password, repl_parsed.password, + &source->password, &parsed->password); + + // Our host should be empty if not present, so override the default setup. 
+ DoOverrideComponent(repl_source.host, repl_parsed.host, + &source->host, &parsed->host); + if (parsed->host.len == -1) + parsed->host.len = 0; + + DoOverrideComponent(repl_source.port, repl_parsed.port, + &source->port, &parsed->port); + DoOverrideComponent(repl_source.path, repl_parsed.path, + &source->path, &parsed->path); + DoOverrideComponent(repl_source.query, repl_parsed.query, + &source->query, &parsed->query); + DoOverrideComponent(repl_source.ref, repl_parsed.ref, + &source->ref, &parsed->ref); +} + +bool SetupUTF16OverrideComponents(const char* base, + const Replacements<char16>& repl, + CanonOutput* utf8_buffer, + URLComponentSource<char>* source, + url_parse::Parsed* parsed) { + bool success = true; + + // Get the source and parsed structures of the things we are replacing. + const URLComponentSource<char16>& repl_source = repl.sources(); + const url_parse::Parsed& repl_parsed = repl.components(); + + success &= PrepareUTF16OverrideComponent( + repl_source.scheme, repl_parsed.scheme, + utf8_buffer, &parsed->scheme); + success &= PrepareUTF16OverrideComponent( + repl_source.username, repl_parsed.username, + utf8_buffer, &parsed->username); + success &= PrepareUTF16OverrideComponent( + repl_source.password, repl_parsed.password, + utf8_buffer, &parsed->password); + success &= PrepareUTF16OverrideComponent( + repl_source.host, repl_parsed.host, + utf8_buffer, &parsed->host); + success &= PrepareUTF16OverrideComponent( + repl_source.port, repl_parsed.port, + utf8_buffer, &parsed->port); + success &= PrepareUTF16OverrideComponent( + repl_source.path, repl_parsed.path, + utf8_buffer, &parsed->path); + success &= PrepareUTF16OverrideComponent( + repl_source.query, repl_parsed.query, + utf8_buffer, &parsed->query); + success &= PrepareUTF16OverrideComponent( + repl_source.ref, repl_parsed.ref, + utf8_buffer, &parsed->ref); + + // PrepareUTF16OverrideComponent will not have set the data pointer since the + // buffer could be resized, invalidating the pointers. 
We set the data + // pointers for affected components now that the buffer is finalized. + if (repl_source.scheme) source->scheme = utf8_buffer->data(); + if (repl_source.username) source->username = utf8_buffer->data(); + if (repl_source.password) source->password = utf8_buffer->data(); + if (repl_source.host) source->host = utf8_buffer->data(); + if (repl_source.port) source->port = utf8_buffer->data(); + if (repl_source.path) source->path = utf8_buffer->data(); + if (repl_source.query) source->query = utf8_buffer->data(); + if (repl_source.ref) source->ref = utf8_buffer->data(); + + return success; +} + +#ifndef WIN32 + +int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) { + const char* format_str; + if (radix == 10) + format_str = "%d"; + else if (radix == 16) + format_str = "%x"; + else + return EINVAL; + + int written = snprintf(buffer, size_in_chars, format_str, value); + if (static_cast<size_t>(written) >= size_in_chars) { + // Output was truncated, or written was negative. + return EINVAL; + } + return 0; +} + +int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) { + if (radix != 10) + return EINVAL; + + // No more than 12 characters will be required for a 32-bit integer. + // Add an extra byte for the terminating null. + char temp[13]; + int written = snprintf(temp, sizeof(temp), "%d", value); + if (static_cast<size_t>(written) >= size_in_chars) { + // Output was truncated, or written was negative. + return EINVAL; + } + + for (int i = 0; i < written; ++i) { + buffer[i] = static_cast<char16>(temp[i]); + } + buffer[written] = '\0'; + return 0; +} + +#endif // !WIN32 + +} // namespace url_canon diff --git a/googleurl/src/url_canon_internal.h b/googleurl/src/url_canon_internal.h new file mode 100644 index 0000000..4b1e45a --- /dev/null +++ b/googleurl/src/url_canon_internal.h @@ -0,0 +1,460 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is intended to be included in another C++ file where the character +// types are defined. This allows us to write mostly generic code, but not have +// template bloat because everything is inlined when anybody calls any of our +// functions.
+ +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ + +#include <stdlib.h> + +#include "googleurl/src/url_canon.h" + +namespace url_canon { + +// Character type handling ----------------------------------------------------- + +// Bits that identify different character types. These types identify different +// bits that are set for each 8-bit character in the kSharedCharTypeTable. +enum SharedCharTypes { + // Characters that do not require escaping in queries. Characters that do + // not have this flag will be escaped, see url_canon_query.cc + CHAR_QUERY = 1, + + // Valid in the username/password field. + CHAR_USERINFO = 2, + + // Valid in a IPv4 address (digits plus dot and 'x' for hex). + CHAR_IPV4 = 4, + + // Valid in an ASCII-representation of a hex digit (as in %-escaped). + CHAR_HEX = 8, + + // Valid in an ASCII-representation of a decimal digit. + CHAR_DEC = 16, + + // Valid in an ASCII-representation of an octal digit. + CHAR_OCT = 32, +}; + +// This table contains the flags in SharedCharTypes for each 8-bit character. +// Some canonicalization functions have their own specialized lookup table. +// For those with simple requirements, we have collected the flags in one +// place so there are fewer lookup tables to load into the CPU cache. +// +// Using an unsigned char type has a small but measurable performance benefit +// over using a 32-bit number. +extern const unsigned char kSharedCharTypeTable[0x100]; + +// More readable wrappers around the character type lookup table. 
+inline bool IsCharOfType(unsigned char c, SharedCharTypes type) { + return !!(kSharedCharTypeTable[c] & type); +} +inline bool IsQueryChar(unsigned char c) { + return IsCharOfType(c, CHAR_QUERY); +} +inline bool IsIPv4Char(unsigned char c) { + return IsCharOfType(c, CHAR_IPV4); +} +inline bool IsHexChar(unsigned char c) { + return IsCharOfType(c, CHAR_HEX); +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output); +void AppendStringOfType(const char16* source, int length, + SharedCharTypes type, + CanonOutput* output); + +// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit +// that will be used to represent it. +extern const char kHexCharLookup[0x10]; + +// This lookup table allows fast conversion between ASCII hex letters and their +// corresponding numerical value. The 8-bit range is divided up into 8 +// regions of 0x20 characters each. Each of the three character types (numbers, +// uppercase, lowercase) falls into different regions of this range. The table +// contains the amount to subtract from characters in that range to get at +// the corresponding numerical value. +// +// See HexDigitToValue for the lookup. +extern const char kCharToHexLookup[8]; + +// Assumes the input is a valid hex digit! Call IsHexChar before using this. +inline unsigned char HexCharToValue(unsigned char c) { + return c - kCharToHexLookup[c / 0x20]; +} + +// Indicates if the given character is a dot or dot equivalent, returning the +// number of characters taken by it. This will be one for a literal dot, 3 for +// an escaped dot. If the character is not a dot, this will return 0. 
+template<typename CHAR> +inline int IsDot(const CHAR* spec, int offset, int end) { + if (spec[offset] == '.') { + return 1; + } else if (spec[offset] == '%' && offset + 3 <= end && + spec[offset + 1] == '2' && + (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) { + // Found "%2e" + return 3; + } + return 0; +} + +// Returns the canonicalized version of the input character according to scheme +// rules. This is implemented alongside the scheme canonicalizer, and is +// required for relative URL resolving to test for scheme equality. +// +// Returns 0 if the input character is not a valid scheme character. +char CanonicalSchemeChar(char16 ch); + +// Write a single character, escaped, to the output. This always escapes: it +// does no checking that thee character requires escaping. +// Escaping makes sense only 8 bit chars, so code works in all cases of +// input parameters (8/16bit). +template<typename UINCHAR, typename OUTCHAR> +inline void AppendEscapedChar(UINCHAR ch, + CanonOutputT<OUTCHAR>* output) { + output->push_back('%'); + output->push_back(kHexCharLookup[ch >> 4]); + output->push_back(kHexCharLookup[ch & 0xf]); +} + +// The character we'll substitute for undecodable or invalid characters. +extern const char16 kUnicodeReplacementCharacter; + +// UTF-8 functions ------------------------------------------------------------ + +// Reads one character in UTF-8 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-byte ASCII character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. 
+bool ReadUTFChar(const char* str, int* begin, int length, + unsigned* code_point_out); + +// Generic To-UTF-8 converter. This will call the given append method for each +// character that should be appended, with the given output method. Wrappers +// are provided below for escaped and non-escaped versions of this. +template<class Output, void Appender(unsigned char, Output*)> +inline void DoAppendUTF8(unsigned char_value, Output* output) { + if (char_value <= 0x7f) { + Appender(static_cast<unsigned char>(char_value), output); + } else if (char_value <= 0x7ff) { + // 110xxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0xffff) { + // 1110xxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x1fffff) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x10FFFF) { // Max unicode code point. 
+ // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xf8 | (char_value >> 24)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 18) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else { + // Invalid UTF-8 character (>20 bits) + } +} + +// Helper used by AppendUTF8Value below. We use an unsigned parameter so there +// are no funny sign problems with the input, but then have to convert it to +// a regular char for appending. +inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { + output->push_back(static_cast<char>(ch)); +} + +// Writes the given character to the output as UTF-8. This does NO checking +// of the validity of the unicode characters; the caller should ensure that +// the value it is appending is valid to append. +inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); +} + +// Writes the given character to the output as UTF-8, escaping ALL +// characters (even when they are ASCII). This does NO checking of the +// validity of the unicode characters; the caller should ensure that the value +// it is appending is valid to append. +inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); +} + +// UTF-16 functions ----------------------------------------------------------- + +// Reads one character in UTF-16 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. 
+// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-16-bit-word character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. +bool ReadUTFChar(const char16* str, int* begin, int length, + unsigned* code_point); + +// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. +inline void AppendUTF16Value(unsigned code_point, + CanonOutputT<char16>* output) { + if (code_point > 0xffff) { + output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); + output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); + } else { + output->push_back(static_cast<char16>(code_point)); + } +} + +// Escaping functions --------------------------------------------------------- + +// Writes the given character to the output as UTF-8, escaped. Call this +// function only when the input is wide. Returns true on success. Failure +// means there was some problem with the encoding, we'll still try to +// update the |*begin| pointer and add a placeholder character to the +// output so processing can continue. +// +// We will append the character starting at ch[begin] with the buffer ch +// being |length|. |*begin| will be updated to point to the last character +// consumed (we may consume more than one for UTF-16) so that if called in +// a loop, incrementing the pointer will move to the next character. +// +// Every single output character will be escaped. This means that if you +// give it an ASCII character as input, it will be escaped. Some code uses +// this when it knows that a character is invalid according to its rules +// for validity. If you don't want escaping for ASCII characters, you will +// have to filter them out prior to calling this function. +// +// Assumes that ch[begin] is within range in the array, but does not assume +// that any following characters are. 
+inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length, + CanonOutput* output) { + // UTF-16 input. Readchar16 will handle invalid characters for us and give + // us the kUnicodeReplacementCharacter, so we don't have to do special + // checking after failure, just pass through the failure to the caller. + unsigned char_value; + bool success = ReadUTFChar(str, begin, length, &char_value); + AppendUTF8EscapedValue(char_value, output); + return success; +} + +// Handles UTF-8 input. See the wide version above for usage. +inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, + CanonOutput* output) { + // ReadUTF8Char will handle invalid characters for us and give us the + // kUnicodeReplacementCharacter, so we don't have to do special checking + // after failure, just pass through the failure to the caller. + unsigned ch; + bool success = ReadUTFChar(str, begin, length, &ch); + AppendUTF8EscapedValue(ch, output); + return success; +} + +// Given a '%' character at |*begin| in the string |spec|, this will decode +// the escaped value and put it into |*unescaped_value| on success (returns +// true). On failure, this will return false, and will not write into +// |*unescaped_value|. +// +// |*begin| will be updated to point to the last character of the escape +// sequence so that when called with the index of a for loop, the next time +// through it will point to the next character to be considered. On failure, +// |*begin| will be unchanged. +inline bool Is8BitChar(char c) { + return true; // this case is specialized to avoid a warning +} +inline bool Is8BitChar(char16 c) { + return c <= 255; +} + +template<typename CHAR> +inline bool DecodeEscaped(const CHAR* spec, int* begin, int end, + unsigned char* unescaped_value) { + if (*begin + 3 > end || + !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) { + // Invalid escape sequence because there's not enough room, or the + // digits are not ASCII. 
+ return false; + } + + unsigned char first = static_cast<unsigned char>(spec[*begin + 1]); + unsigned char second = static_cast<unsigned char>(spec[*begin + 2]); + if (!IsHexChar(first) || !IsHexChar(second)) { + // Invalid hex digits, fail. + return false; + } + + // Valid escape sequence. + *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second); + *begin += 2; + return true; +} + +// Appends the given substring to the output, escaping "some" characters that +// it feels may not be safe. It assumes the input values are all contained in +// 8-bit although it allows any type. +// +// This is used in error cases to append invalid output so that it looks +// approximately correct. Non-error cases should not call this function since +// the escaping rules are not guaranteed! +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output); +void AppendInvalidNarrowString(const char16* spec, int begin, int end, + CanonOutput* output); + +// Misc canonicalization helpers ---------------------------------------------- + +// Converts between UTF-8 and UTF-16, returning true on successful conversion. +// The output will be appended to the given canonicalizer output (so make sure +// it's empty if you want to replace). +// +// On invalid input, this will still write as much output as possible, +// replacing the invalid characters with the "invalid character". It will +// return false in the failure case, and the caller should not continue as +// normal. +bool ConvertUTF16ToUTF8(const char16* input, int input_len, + CanonOutput* output); +bool ConvertUTF8ToUTF16(const char* input, int input_len, + CanonOutputT<char16>* output); + +// Converts from UTF-16 to 8-bit using the character set converter. If the +// converter is NULL, this will use UTF-8. 
+void ConvertUTF16ToQueryEncoding(const char16* input, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output); + +// Applies the replacements to the given component source. The component source +// should be pre-initialized to the "old" base. That is, all pointers will +// point to the spec of the old URL, and all of the Parsed components will +// be indices into that string. +// +// The pointers and components in the |source| for all non-NULL strings in the +// |repl| (replacements) will be updated to reference those strings. +// Canonicalizing with the new |source| and |parsed| can then combine URL +// components from many different strings. +void SetupOverrideComponents(const char* base, + const Replacements<char>& repl, + URLComponentSource<char>* source, + url_parse::Parsed* parsed); + +// Like the above 8-bit version, except that it additionally converts the +// UTF-16 input to UTF-8 before doing the overrides. +// +// The given utf8_buffer is used to store the converted components. They will +// be appended one after another, with the parsed structure identifying the +// appropriate substrings. This buffer is a parameter because the source has +// no storage, so the buffer must have the same lifetime as the source +// parameter owned by the caller. +// +// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of +// |source| will point into this buffer, which could be invalidated if +// additional data is added and the CanonOutput resizes its buffer. +// +// Returns true on success. False means that the input was not valid UTF-16, +// although we will have still done the override with "invalid characters" in +// place of errors. 
+bool SetupUTF16OverrideComponents(const char* base, + const Replacements<char16>& repl, + CanonOutput* utf8_buffer, + URLComponentSource<char>* source, + url_parse::Parsed* parsed); + +// Implemented in url_canon_path.cc, these are required by the relative URL +// resolver as well, so we declare them here. +bool CanonicalizePartialPath(const char* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output); +bool CanonicalizePartialPath(const char16* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output); + +#ifndef WIN32 + +// Implementations of Windows' int-to-string conversions +int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix); +int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix); + +// Secure template overloads for these functions +template<size_t N> +inline int _itoa_s(int value, char (&buffer)[N], int radix) { + return _itoa_s(value, buffer, N, radix); +} + +template<size_t N> +inline int _itow_s(int value, char16 (&buffer)[N], int radix) { + return _itow_s(value, buffer, N, radix); +} + +// _strtoui64 and strtoull behave the same +inline unsigned long long _strtoui64(const char* nptr, + char** endptr, int base) { + return strtoull(nptr, endptr, base); +} + +#endif // WIN32 + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__ diff --git a/googleurl/src/url_canon_internal_file.h b/googleurl/src/url_canon_internal_file.h new file mode 100644 index 0000000..63a9c5b --- /dev/null +++ b/googleurl/src/url_canon_internal_file.h @@ -0,0 +1,157 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// As with url_canon_internal.h, this file is intended to be included in +// another C++ file where the template types are defined. This allows the +// programmer to use this to use these functions for their own strings +// types, without bloating the code by having inline templates used in +// every call site. +// +// *** This file must be included after url_canon_internal as we depend on some +// functions in it. *** + +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ + +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +using namespace url_canon; + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. 
If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not). +template<typename CHAR> +static int FileDoDriveSpec(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo, + // (with backslashes instead of slashes as well). + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end)) + return begin; // Haven't consumed any characters + + // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid + // and that it is followed by a colon/pipe. + + // Normalize Windows drive letters to uppercase + if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z') + output->push_back(spec[after_slashes] - 'a' + 'A'); + else + output->push_back(static_cast<char>(spec[after_slashes])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + output->push_back('/'); + return after_slashes + 2; +} + +// FileDoDriveSpec will have already added the first backslash, so we need to +// write everything following the slashes using the path canonicalizer. +template<typename CHAR, typename UCHAR> +static void FileDoPath(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // Normalize the number of slashes after the drive letter. The path + // canonicalizer expects the input to begin in a slash already so + // doesn't check. We want to handle no-slashes + int num_slashes = CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + // Now use the regular path canonicalizer to canonicalize the rest of the + // path. We supply it with the path following the slashes. It won't prepend + // a slash because it assumes any nonempty path already starts with one. 
+ // We explicitly filter out calls with no path here to prevent that case. + ParsedURL::Component sub_path(after_slashes, end - after_slashes); + if (sub_path.len > 0) { + // Give it a fake output component to write into. DoCanonicalizeFile will + // compute the full path component. + ParsedURL::Component fake_output_path; + URLCanonInternal<CHAR, UCHAR>::DoPath( + spec, sub_path, output, &fake_output_path); + } +} + +template<typename CHAR, typename UCHAR> +static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source, + const ParsedURL& parsed, + CanonOutput* output, + ParsedURL* new_parsed) { + // Things we don't set in file: URLs. + new_parsed->username = ParsedURL::Component(0, -1); + new_parsed->password = ParsedURL::Component(0, -1); + new_parsed->port = ParsedURL::Component(0, -1); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->push_back('f'); + output->push_back('i'); + output->push_back('l'); + output->push_back('e'); + new_parsed->scheme.len = output->length() - new_parsed->scheme.begin; + output->push_back(':'); + + // Write the separator for the host. + output->push_back('/'); + output->push_back('/'); + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. + bool success = URLCanonInternal<CHAR, UCHAR>::DoHost( + source.host, parsed.host, output, &new_parsed->host); + + // Write a separator for the start of the path. We'll ignore any slashes + // already at the beginning of the path. + new_parsed->path.begin = output->length(); + output->push_back('/'); + + // Copies and normalizes the "c:" at the beginning, if present. 
+ int after_drive = FileDoDriveSpec(source.path, parsed.path.begin, + parsed.path.end(), output); + + // Copies the rest of the path + FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output); + new_parsed->path.len = output->length() - new_parsed->path.begin; + + // Things following the path we can use the standard canonicalizers for. + success &= URLCanonInternal<CHAR, UCHAR>::DoQuery( + source.query, parsed.query, output, &new_parsed->query); + success &= URLCanonInternal<CHAR, UCHAR>::DoRef( + source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ diff --git a/googleurl/src/url_canon_ip.cc b/googleurl/src/url_canon_ip.cc new file mode 100644 index 0000000..d84ff7d --- /dev/null +++ b/googleurl/src/url_canon_ip.cc @@ -0,0 +1,734 @@ +// Copyright 2009, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "googleurl/src/url_canon_ip.h" + +#include <stdlib.h> + +#include "base/basictypes.h" +#include "base/logging.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +// Converts one of the character types that represent a numerical base to the +// corresponding base. +int BaseForType(SharedCharTypes type) { + switch (type) { + case CHAR_HEX: + return 16; + case CHAR_DEC: + return 10; + case CHAR_OCT: + return 8; + default: + return 0; + } +} + +template<typename CHAR, typename UCHAR> +bool DoFindIPv4Components(const CHAR* spec, + const url_parse::Component& host, + url_parse::Component components[4]) { + int cur_component = 0; // Index of the component we're working on. + int cur_component_begin = host.begin; // Start of the current component. + int end = host.end(); + for (int i = host.begin; /* nothing */; i++) { + if (i == end || spec[i] == '.') { + // Found the end of the current component. + int component_len = i - cur_component_begin; + components[cur_component] = + url_parse::Component(cur_component_begin, component_len); + + // The next component starts after the dot. + cur_component_begin = i + 1; + cur_component++; + + // Don't allow empty components (two dots in a row), except we may + // allow an empty component at the end (this would indicate that the + // input ends in a dot). 
We also want to error if the component is + // empty and it's the only component (cur_component == 1). + if (component_len == 0 && (i != end || cur_component == 1)) + return false; + + if (i == end) + break; // End of the input. + + if (cur_component == 4) { + // Anything else after the 4th component is an error unless it is a + // dot that would otherwise be treated as the end of input. + if (spec[i] == '.' && i + 1 == end) + break; + return false; + } + } else if (static_cast<UCHAR>(spec[i]) >= 0x80 || + !IsIPv4Char(static_cast<unsigned char>(spec[i]))) { + // Invalid character for an IPv4 address. + return false; + } + } + + // Fill in any unused components. + while (cur_component < 4) + components[cur_component++] = url_parse::Component(); + return true; +} + +// Converts an IPv4 component to a 32-bit number, while checking for overflow. +// +// Possible return values: +// - IPV4 - The number was valid, and did not overflow. +// - BROKEN - The input was numeric, but too large for a 32-bit field. +// - NEUTRAL - Input was not numeric. +// +// The input is assumed to be ASCII. FindIPv4Components should have stripped +// out any input that is greater than 7 bits. The components are assumed +// to be non-empty. +template<typename CHAR> +CanonHostInfo::Family IPv4ComponentToNumber( + const CHAR* spec, + const url_parse::Component& component, + uint32* number) { + // Figure out the base + SharedCharTypes base; + int base_prefix_len = 0; // Size of the prefix for this base. + if (spec[component.begin] == '0') { + // Either hex or octal, or a standalone zero (treated as decimal). + if (component.len == 1) { + base = CHAR_DEC; + } else if (spec[component.begin + 1] == 'X' || + spec[component.begin + 1] == 'x') { + base = CHAR_HEX; + base_prefix_len = 2; + } else { + base = CHAR_OCT; + base_prefix_len = 1; + } + } else { + base = CHAR_DEC; + } + + // Extend the prefix to consume all leading zeros. 
+ while (base_prefix_len < component.len && + spec[component.begin + base_prefix_len] == '0') + base_prefix_len++; + + // Put the component, minus any base prefix, into a NULL-terminated buffer so + // we can call the standard library. Because leading zeros have already been + // discarded, filling the entire buffer is guaranteed to trigger the 32-bit + // overflow check. + const int kMaxComponentLen = 16; + char buf[kMaxComponentLen + 1]; // digits + '\0' + int dest_i = 0; + for (int i = component.begin + base_prefix_len; i < component.end(); i++) { + // We know the input is 7-bit, so convert to narrow (if this is the wide + // version of the template) by casting. + char input = static_cast<char>(spec[i]); + + // Validate that this character is OK for the given base. + if (!IsCharOfType(input, base)) + return CanonHostInfo::NEUTRAL; + + // Fill the buffer, if there's space remaining. This check allows us to + // verify that all characters are numeric, even those that don't fit. + if (dest_i < kMaxComponentLen) + buf[dest_i++] = input; + } + + buf[dest_i] = '\0'; + + // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal + // number can overflow a 64-bit number in <= 16 characters). + uint64 num = _strtoui64(buf, NULL, BaseForType(base)); + + // Check for 32-bit overflow. + if (num > kuint32max) + return CanonHostInfo::BROKEN; + + // No overflow. Success! + *number = static_cast<uint32>(num); + return CanonHostInfo::IPV4; +} + +// Writes the given address (with each character representing one dotted +// part of an IPv4 address) to the output, and updating |*out_host| to +// identify the added portion. 
+void AppendIPv4Address(const unsigned char address[4], + CanonOutput* output, + url_parse::Component* out_host) { + out_host->begin = output->length(); + for (int i = 0; i < 4; i++) { + char str[16]; + _itoa_s(address[i], str, 10); + + for (int ch = 0; str[ch] != 0; ch++) + output->push_back(str[ch]); + + if (i != 3) + output->push_back('.'); + } + out_host->len = output->length() - out_host->begin; +} + +// See declaration of IPv4AddressToNumber for documentation. +template<typename CHAR> +CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components) { + // The identified components. Not all may exist. + url_parse::Component components[4]; + if (!FindIPv4Components(spec, host, components)) + return CanonHostInfo::NEUTRAL; + + // Convert existing components to digits. Values up to + // |existing_components| will be valid. + uint32 component_values[4]; + int existing_components = 0; + for (int i = 0; i < 4; i++) { + if (components[i].len <= 0) + continue; + CanonHostInfo::Family family = IPv4ComponentToNumber( + spec, components[i], &component_values[existing_components]); + + // Stop if we hit an invalid non-empty component. + if (family != CanonHostInfo::IPV4) + return family; + + existing_components++; + } + + // Use that sequence of numbers to fill out the 4-component IP address. + + // First, process all components but the last, while making sure each fits + // within an 8-bit field. + for (int i = 0; i < existing_components - 1; i++) { + if (component_values[i] > kuint8max) + return CanonHostInfo::BROKEN; + address[i] = static_cast<unsigned char>(component_values[i]); + } + + // Next, consume the last component to fill in the remaining bytes. 
+ uint32 last_value = component_values[existing_components - 1]; + for (int i = 3; i >= existing_components - 1; i--) { + address[i] = static_cast<unsigned char>(last_value); + last_value >>= 8; + } + + // If the last component has residual bits, report overflow. + if (last_value != 0) + return CanonHostInfo::BROKEN; + + // Tell the caller how many components we saw. + *num_ipv4_components = existing_components; + + // Success! + return CanonHostInfo::IPV4; +} + +// Return true if we've made a final IPV4/BROKEN decision, false if the result +// is NEUTRAL, and we could use a second opinion. +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeIPv4Address(const CHAR* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + unsigned char address[4]; + host_info->family = IPv4AddressToNumber( + spec, host, address, &host_info->num_ipv4_components); + + switch (host_info->family) { + case CanonHostInfo::IPV4: + // Definitely an IPv4 address. + AppendIPv4Address(address, output, &host_info->out_host); + return true; + case CanonHostInfo::BROKEN: + // Definitely broken. + return true; + default: + // Could be IPv6 or a hostname. + return false; + } +} + +// Helper class that describes the main components of an IPv6 input string. 
+// See the following examples to understand how it breaks up an input string: +// +// [Example 1]: input = "[::aa:bb]" +// ==> num_hex_components = 2 +// ==> hex_components[0] = Component(3,2) "aa" +// ==> hex_components[1] = Component(6,2) "bb" +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(0, -1) +// +// [Example 2]: input = "[1:2::3:4:5]" +// ==> num_hex_components = 5 +// ==> hex_components[0] = Component(1,1) "1" +// ==> hex_components[1] = Component(3,1) "2" +// ==> hex_components[2] = Component(6,1) "3" +// ==> hex_components[3] = Component(8,1) "4" +// ==> hex_components[4] = Component(10,1) "5" +// ==> index_of_contraction = 2 +// ==> ipv4_component = Component(0, -1) +// +// [Example 3]: input = "[::ffff:192.168.0.1]" +// ==> num_hex_components = 1 +// ==> hex_components[0] = Component(3,4) "ffff" +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(8, 11) "192.168.0.1" +// +// [Example 4]: input = "[1::]" +// ==> num_hex_components = 1 +// ==> hex_components[0] = Component(1,1) "1" +// ==> index_of_contraction = 1 +// ==> ipv4_component = Component(0, -1) +// +// [Example 5]: input = "[::192.168.0.1]" +// ==> num_hex_components = 0 +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(3, 11) "192.168.0.1" +// +struct IPv6Parsed { + // Zero-out the parse information. + void reset() { + num_hex_components = 0; + index_of_contraction = -1; + ipv4_component.reset(); + } + + // There can be up to 8 hex components (colon separated) in the literal. + url_parse::Component hex_components[8]; + + // The count of hex components present. Ranges from [0,8]. + int num_hex_components; + + // The index of the hex component that the "::" contraction precedes, or + // -1 if there is no contraction. + int index_of_contraction; + + // The range of characters which are an IPv4 literal. + url_parse::Component ipv4_component; +}; + +// Parse the IPv6 input string. 
If parsing succeeded returns true and fills +// |parsed| with the information. If parsing failed (because the input is +// invalid) returns false. +template<typename CHAR, typename UCHAR> +bool DoParseIPv6(const CHAR* spec, + const url_parse::Component& host, + IPv6Parsed* parsed) { + // Zero-out the info. + parsed->reset(); + + if (!host.is_nonempty()) + return false; + + // The index for start and end of address range (no brackets). + int begin = host.begin; + int end = host.end(); + + int cur_component_begin = begin; // Start of the current component. + + // Scan through the input, searching for hex components, "::" contractions, + // and IPv4 components. + for (int i = begin; /* i <= end */; i++) { + bool is_colon = spec[i] == ':'; + bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':'; + + // We reached the end of the current component if we encounter a colon + // (separator between hex components, or start of a contraction), or end of + // input. + if (is_colon || i == end) { + int component_len = i - cur_component_begin; + + // A component should not have more than 4 hex digits. + if (component_len > 4) + return false; + + // Don't allow empty components. + if (component_len == 0) { + // The exception is when contractions appear at beginning of the + // input or at the end of the input. + if (!((is_contraction && i == begin) || (i == end && + parsed->index_of_contraction == parsed->num_hex_components))) + return false; + } + + // Add the hex component we just found to running list. + if (component_len > 0) { + // Can't have more than 8 components! + if (parsed->num_hex_components >= 8) + return false; + + parsed->hex_components[parsed->num_hex_components++] = + url_parse::Component(cur_component_begin, component_len); + } + } + + if (i == end) + break; // Reached the end of the input, DONE. + + // We found a "::" contraction. + if (is_contraction) { + // There can be at most one contraction in the literal. 
+ if (parsed->index_of_contraction != -1) + return false; + parsed->index_of_contraction = parsed->num_hex_components; + ++i; // Consume the colon we peeked. + } + + if (is_colon) { + // Colons are separators between components, keep track of where the + // current component started (after this colon). + cur_component_begin = i + 1; + } else { + if (static_cast<UCHAR>(spec[i]) >= 0x80) + return false; // Not ASCII. + + if (!IsHexChar(static_cast<unsigned char>(spec[i]))) { + // Regular components are hex numbers. It is also possible for + // a component to be an IPv4 address in dotted form. + if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) { + // Since IPv4 address can only appear at the end, assume the rest + // of the string is an IPv4 address. (We will parse this separately + // later). + parsed->ipv4_component = url_parse::Component( + cur_component_begin, end - cur_component_begin); + break; + } else { + // The character was neither a hex digit, nor an IPv4 character. + return false; + } + } + } + } + + return true; +} + +// Verifies the parsed IPv6 information, checking that the various components +// add up to the right number of bits (hex components are 16 bits, while +// embedded IPv4 formats are 32 bits, and contractions are placeholders for +// 16 or more bits). Returns true if sizes match up, false otherwise. On +// success writes the length of the contraction (if any) to +// |out_num_bytes_of_contraction|. +bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed, + int* out_num_bytes_of_contraction) { + // Each group of four hex digits contributes 16 bits. + int num_bytes_without_contraction = parsed.num_hex_components * 2; + + // If an IPv4 address was embedded at the end, it contributes 32 bits. + if (parsed.ipv4_component.is_valid()) + num_bytes_without_contraction += 4; + + // If there was a "::" contraction, its size is going to be: + // MAX([16bits], [128bits] - num_bytes_without_contraction). 
+ int num_bytes_of_contraction = 0; + if (parsed.index_of_contraction != -1) { + num_bytes_of_contraction = 16 - num_bytes_without_contraction; + if (num_bytes_of_contraction < 2) + num_bytes_of_contraction = 2; + } + + // Check that the numbers add up. + if (num_bytes_without_contraction + num_bytes_of_contraction != 16) + return false; + + *out_num_bytes_of_contraction = num_bytes_of_contraction; + return true; +} + +// Converts a hex component into a number. This cannot fail since the caller has +// already verified that each character in the string was a hex digit, and +// that there were no more than 4 characters. +template<typename CHAR> +uint16 IPv6HexComponentToNumber(const CHAR* spec, + const url_parse::Component& component) { + DCHECK(component.len <= 4); + + // Copy the hex string into a C-string. + char buf[5]; + for (int i = 0; i < component.len; ++i) + buf[i] = static_cast<char>(spec[component.begin + i]); + buf[component.len] = '\0'; + + // Convert it to a number (overflow is not possible, since with 4 hex + // characters we can at most have a 16 bit number). + return static_cast<uint16>(_strtoui64(buf, NULL, 16)); +} + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +template<typename CHAR, typename UCHAR> +bool DoIPv6AddressToNumber(const CHAR* spec, + const url_parse::Component& host, + unsigned char address[16]) { + // Make sure the component is bounded by '[' and ']'. + int end = host.end(); + if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']') + return false; + + // Exclude the square brackets. + url_parse::Component ipv6_comp(host.begin + 1, host.len - 2); + + // Parse the IPv6 address -- identify where all the colon separated hex + // components are, the "::" contraction, and the embedded IPv4 address. 
+ IPv6Parsed ipv6_parsed; + if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed)) + return false; + + // Do some basic size checks to make sure that the address doesn't + // specify more than 128 bits or fewer than 128 bits. This also resolves + // how many zero bytes the "::" contraction represents. + int num_bytes_of_contraction; + if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction)) + return false; + + int cur_index_in_address = 0; + + // Loop through each hex component and the contraction, in order. + for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) { + // Append the contraction if it appears before this component. + if (i == ipv6_parsed.index_of_contraction) { + for (int j = 0; j < num_bytes_of_contraction; ++j) + address[cur_index_in_address++] = 0; + } + // Append the hex component's value. + if (i != ipv6_parsed.num_hex_components) { + // Get the 16-bit value for this hex component. + uint16 number = IPv6HexComponentToNumber<CHAR>( + spec, ipv6_parsed.hex_components[i]); + // Append to |address|, in network byte order. + address[cur_index_in_address++] = (number & 0xFF00) >> 8; + address[cur_index_in_address++] = (number & 0x00FF); + } + } + + // If there was an IPv4 section, convert it into a 32-bit number and append + // it to |address|. + if (ipv6_parsed.ipv4_component.is_valid()) { + // We only allow the embedded IPv4 syntax to be used for "mapped" and + // "compat" formats: + // "mapped" ==> 0:0:0:0:0:ffff:<IPv4-literal> + // "compat" ==> 0:0:0:0:0:0000:<IPv4-literal> + for (int j = 0; j < 10; ++j) { + if (address[j] != 0) + return false; + } + if (!((address[10] == 0 && address[11] == 0) || + (address[10] == 0xFF && address[11] == 0xFF))) + return false; + + // Append the 32-bit number to |address|. 
+ int ignored_num_ipv4_components; + if (CanonHostInfo::IPV4 != + IPv4AddressToNumber(spec, + ipv6_parsed.ipv4_component, + &address[cur_index_in_address], + &ignored_num_ipv4_components)) + return false; + } + + return true; +} + +// Searches for the longest sequence of zeros in |address|, and writes the +// range into |contraction_range|. The run of zeros must be longer than 16 bits +// (more than one zero 16-bit group), and if there is a tie the first is chosen. +void ChooseIPv6ContractionRange(const unsigned char address[16], + url_parse::Component* contraction_range) { + // The longest run of zeros in |address| seen so far. + url_parse::Component max_range; + + // The current run of zeros in |address| being iterated over. + url_parse::Component cur_range; + + for (int i = 0; i < 16; i += 2) { + // Test for 16 bits worth of zero. + bool is_zero = (address[i] == 0 && address[i + 1] == 0); + + if (is_zero) { + // Add the zero to the current range (or start a new one). + if (!cur_range.is_valid()) + cur_range = url_parse::Component(i, 0); + cur_range.len += 2; + } + + if (!is_zero || i == 14) { + // Just completed a run of zeros. If the run is greater than 16 bits, + // it is a candidate for the contraction. + if (cur_range.len > 2 && cur_range.len > max_range.len) { + max_range = cur_range; + } + cur_range.reset(); + } + } + *contraction_range = max_range; +} + +// Return true if we've made a final IPV6/BROKEN decision, false if the result +// is NEUTRAL, and we could use a second opinion. +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeIPv6Address(const CHAR* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + // Turn the IP address into a 128 bit number. + unsigned char address[16]; + if (!IPv6AddressToNumber(spec, host, address)) { + // If it's not an IPv6 address, scan for characters that should *only* + // exist in an IPv6 address. 
+ for (int i = host.begin; i < host.end(); i++) { + switch (spec[i]) { + case '[': + case ']': + case ':': + host_info->family = CanonHostInfo::BROKEN; + return true; + } + } + + // No invalid characters. Could still be IPv4 or a hostname. + host_info->family = CanonHostInfo::NEUTRAL; + return false; + } + + host_info->out_host.begin = output->length(); + output->push_back('['); + + // We will now output the address according to the rules in: + // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4 + + // Start by finding where to place the "::" contraction (if any). + url_parse::Component contraction_range; + ChooseIPv6ContractionRange(address, &contraction_range); + + for (int i = 0; i <= 14;) { + // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive. + DCHECK(i % 2 == 0); + if (i == contraction_range.begin && contraction_range.len > 0) { + // Jump over the contraction. + if (i == 0) + output->push_back(':'); + output->push_back(':'); + i = contraction_range.end(); + } else { + // Consume the next 16 bits from |address|. + int x = address[i] << 8 | address[i + 1]; + + i += 2; + + // Stringify the 16 bit number (at most requires 4 hex digits). + char str[5]; + _itoa_s(x, str, 16); + for (int ch = 0; str[ch] != 0; ++ch) + output->push_back(str[ch]); + + // Put a colon after each number, except the last. 
+ if (i < 16) + output->push_back(':'); + } + } + + output->push_back(']'); + host_info->out_host.len = output->length() - host_info->out_host.begin; + + host_info->family = CanonHostInfo::IPV6; + return true; +} + +} // namespace + +bool FindIPv4Components(const char* spec, + const url_parse::Component& host, + url_parse::Component components[4]) { + return DoFindIPv4Components<char, unsigned char>(spec, host, components); +} + +bool FindIPv4Components(const char16* spec, + const url_parse::Component& host, + url_parse::Component components[4]) { + return DoFindIPv4Components<char16, char16>(spec, host, components); +} + +void CanonicalizeIPAddress(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address<char, unsigned char>( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address<char, unsigned char>( + spec, host, output, host_info)) + return; +} + +void CanonicalizeIPAddress(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address<char16, char16>( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address<char16, char16>( + spec, host, output, host_info)) + return; +} + +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components); +} + +CanonHostInfo::Family IPv4AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber<char16>( + spec, host, address, num_ipv4_components); +} + +bool IPv6AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address); +} + +bool 
IPv6AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber<char16, char16>(spec, host, address); +} + + +} // namespace url_canon diff --git a/googleurl/src/url_canon_ip.h b/googleurl/src/url_canon_ip.h new file mode 100644 index 0000000..6ce069d --- /dev/null +++ b/googleurl/src/url_canon_ip.h @@ -0,0 +1,98 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace url_canon {

// Searches the host name for the portions of the IPv4 address. On success,
// each component will be placed into |components| and it will return true.
// It will return false if the host can not be separated as an IPv4 address
// or if there are any non-7-bit characters or other characters that can not
// be in an IP address. (This is important so we fail as early as possible for
// common non-IP hostnames.)
//
// Not all components may exist. If there are only 3 components, for example,
// the last one will have a length of -1 or 0 to indicate it does not exist.
//
// Note that many platforms' inet_addr will ignore everything after a space
// in certain circumstances if the stuff before the space looks like an IP
// address. IE6 is included in this. We do NOT handle this case. In many cases,
// the browser's canonicalization will get run before this which converts
// spaces to %20 (in the case of IE7) or rejects them (in the case of
// Mozilla), so this code path never gets hit. Our host canonicalization will
// notice these spaces and escape them, which will make IP address finding
// fail. This seems like better behavior than stripping after a space.
//
// Two overloads are provided: one for 8-bit |spec| and one for UTF-16 |spec|.
bool FindIPv4Components(const char* spec,
                        const url_parse::Component& host,
                        url_parse::Component components[4]);
bool FindIPv4Components(const char16* spec,
                        const url_parse::Component& host,
                        url_parse::Component components[4]);

// Converts an IPv4 address to a 32-bit number (network byte order), written
// into the 4-byte |address| buffer.
//
// Possible return values:
//   IPV4    - IPv4 address was successfully parsed.
//   BROKEN  - Input was formatted like an IPv4 address, but overflow occurred
//             during parsing.
//   NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
//             It might be an IPv6 address, or a hostname.
//
// On success, |num_ipv4_components| will be populated with the number of
// components in the IPv4 address.
CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
                                          const url_parse::Component& host,
                                          unsigned char address[4],
                                          int* num_ipv4_components);
CanonHostInfo::Family IPv4AddressToNumber(const char16* spec,
                                          const url_parse::Component& host,
                                          unsigned char address[4],
                                          int* num_ipv4_components);

// Converts an IPv6 address to a 128-bit number (network byte order), written
// into the 16-byte |address| buffer, returning true on success. False means
// that the input was not a valid IPv6 address.
//
// NOTE that |host| is expected to be surrounded by square brackets.
// i.e. "[::1]" rather than "::1".
bool IPv6AddressToNumber(const char* spec,
                         const url_parse::Component& host,
                         unsigned char address[16]);
bool IPv6AddressToNumber(const char16* spec,
                         const url_parse::Component& host,
                         unsigned char address[16]);

}  // namespace url_canon
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions for canonicalizing "mailto:" URLs. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + + // mailto: only uses {scheme, path, query} -- clear the rest. + new_parsed->username = url_parse::Component(); + new_parsed->password = url_parse::Component(); + new_parsed->host = url_parse::Component(); + new_parsed->port = url_parse::Component(); + new_parsed->ref = url_parse::Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). 
+ new_parsed->scheme.begin = output->length(); + output->Append("mailto:", 7); + new_parsed->scheme.len = 6; + + bool success = true; + + // Path + if (parsed.path.is_valid()) { + new_parsed->path.begin = output->length(); + + // Copy the path using path URL's more lax escaping rules. + // We convert to UTF-8 and escape non-ASCII, but leave all + // ASCII characters alone. + int end = parsed.path.end(); + for (int i = parsed.path.begin; i < end; ++i) { + UCHAR uch = static_cast<UCHAR>(source.path[i]); + if (uch < 0x20 || uch >= 0x80) + success &= AppendUTF8EscapedChar(source.path, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + + new_parsed->path.len = output->length() - new_parsed->path.begin; + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query -- always use the default utf8 charset converter. + CanonicalizeQuery(source.query, parsed.query, NULL, + output, &new_parsed->query); + + return success; +} + +} // namespace + +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeMailtoURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, output, new_parsed); +} + +bool CanonicalizeMailtoURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeMailtoURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeMailtoURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const 
char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeMailtoURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_path.cc b/googleurl/src/url_canon_path.cc new file mode 100644 index 0000000..98ca40b --- /dev/null +++ b/googleurl/src/url_canon_path.cc @@ -0,0 +1,380 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
// Per-character handling flags stored in kPathCharLookup. SPECIAL is the bit
// that routes a character to the slow path in DoPartialPath; ESCAPE and
// INVALID are composite values that include it.
enum CharacterFlags {
  // Pass through unchanged, whether escaped or unescaped. This doesn't
  // actually set anything so you can't OR it to check, it's just to make the
  // table below more clear when neither ESCAPE nor UNESCAPE is set.
  PASS = 0,

  // This character requires special handling in DoPartialPath. Doing this test
  // first allows us to filter out the common cases of regular characters that
  // can be directly copied.
  SPECIAL = 1,

  // This character must be escaped in the canonical output. Note that all
  // escaped chars also have the "special" bit set so that the code that looks
  // for this is triggered. Not valid with PASS or UNESCAPE.
  ESCAPE_BIT = 2,
  ESCAPE = ESCAPE_BIT | SPECIAL,

  // This character must be unescaped in canonical output. Not valid with
  // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these
  // characters unescaped, they should just be copied.
  UNESCAPE = 4,

  // This character is disallowed in URLs. Note that the "special" bit is also
  // set to trigger handling.
  INVALID_BIT = 8,
  INVALID = INVALID_BIT | SPECIAL,
};
Note some flags are more +// than one bits because they also turn on the "special" flag. Special is the +// only flag that may be combined with others. +// +// This table is designed to match exactly what IE does with the characters. +// +// Dot is even more special, and the escaped version is handled specially by +// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape" +// bit is never handled (we just need the "special") bit. +const unsigned char kPathCharLookup[0x100] = { +// NULL control chars... + INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// control chars... + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// ' ' ! " # $ % & ' ( ) * + , - . / + ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE, +// @ A B C D E F G H I J K L M N O + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// P Q R S T U V W X Y Z [ \ ] ^ _ + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE, +// ` a b c d e f g h i j k l m n o + ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// p q r s t u v w x y z { | } ~ <NBSP> + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE, +// ...all the high-bit characters are escaped + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, 
ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE}; + +enum DotDisposition { + // The given dot is just part of a filename and is not special. + NOT_A_DIRECTORY, + + // The given dot is the current directory. + DIRECTORY_CUR, + + // The given dot is the first of a double dot that should take us up one. + DIRECTORY_UP +}; + +// When the path resolver finds a dot, this function is called with the +// character following that dot to see what it is. The return value +// indicates what type this dot is (see above). This code handles the case +// where the dot is at the end of the input. +// +// |*consumed_len| will contain the number of characters in the input that +// express what we found. +// +// If the input is "../foo", |after_dot| = 1, |end| = 6, and +// at the end, |*consumed_len| = 2 for the "./" this function consumed. The +// original dot length should be handled by the caller. +template<typename CHAR> +DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot, + int end, int* consumed_len) { + if (after_dot == end) { + // Single dot at the end. 
+ *consumed_len = 0; + return DIRECTORY_CUR; + } + if (url_parse::IsURLSlash(spec[after_dot])) { + // Single dot followed by a slash. + *consumed_len = 1; // Consume the slash + return DIRECTORY_CUR; + } + + int second_dot_len = IsDot(spec, after_dot, end); + if (second_dot_len) { + int after_second_dot = after_dot + second_dot_len; + if (after_second_dot == end) { + // Double dot at the end. + *consumed_len = second_dot_len; + return DIRECTORY_UP; + } + if (url_parse::IsURLSlash(spec[after_second_dot])) { + // Double dot followed by a slash. + *consumed_len = second_dot_len + 1; + return DIRECTORY_UP; + } + } + + // The dots are followed by something else, not a directory. + *consumed_len = 0; + return NOT_A_DIRECTORY; +} + +// Rewinds the output to the previous slash. It is assumed that the output +// ends with a slash and this doesn't count (we call this when we are +// appending directory paths, so the previous path component has and ending +// slash). +// +// This will stop at the first slash (assumed to be at position +// |path_begin_in_output| and not go any higher than that. Some web pages +// do ".." too many times, so we need to handle that brokenness. +// +// It searches for a literal slash rather than including a backslash as well +// because it is run only on the canonical output. +// +// The output is guaranteed to end in a slash when this function completes. +void BackUpToPreviousSlash(int path_begin_in_output, + CanonOutput* output) { + DCHECK(output->length() > 0); + + int i = output->length() - 1; + DCHECK(output->at(i) == '/'); + if (i == path_begin_in_output) + return; // We're at the first slash, nothing to do. + + // Now back up (skipping the trailing slash) until we find another slash. + i--; + while (output->at(i) != '/' && i > path_begin_in_output) + i--; + + // Now shrink the output to just include that last slash we found. + output->set_length(i + 1); +} + +// Appends the given path to the output. 
// Appends the given path to the output. It assumes that if the input path
// starts with a slash, it should be copied to the output. If no path has
// already been appended to the output (the case when not resolving
// relative URLs), the path should begin with a slash.
//
// If there are already path components (this mode is used when appending
// relative paths for resolving), it assumes that the output already has
// a trailing slash and that if the input begins with a slash, it should be
// copied to the output.
//
// We do not collapse multiple slashes in a row to a single slash. It seems
// no web browsers do this, and we don't want incompatibilities, even though
// it would be correct for most systems.
//
// |path_begin_in_output| is the position in |output| where the path starts;
// ".." handling never backs up past it. Returns false if an invalid character
// (raw or escaped) was encountered or AppendUTF8EscapedChar reported failure;
// output is still produced in that case.
template<typename CHAR, typename UCHAR>
bool DoPartialPath(const CHAR* spec,
                   const url_parse::Component& path,
                   int path_begin_in_output,
                   CanonOutput* output) {
  int end = path.end();

  bool success = true;
  for (int i = path.begin; i < end; i++) {
    UCHAR uch = static_cast<UCHAR>(spec[i]);
    if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) {
      // We only need to test wide input for having non-ASCII characters. For
      // narrow input, we'll always just use the lookup table. We don't try to
      // do anything tricky with decoding/validating UTF-8. This function will
      // read one or two UTF-16 characters and append the output as UTF-8. This
      // call will be removed in 8-bit mode.
      success &= AppendUTF8EscapedChar(spec, &i, end, output);
    } else {
      // Normal ASCII character or 8-bit input, use the lookup table.
      unsigned char out_ch = static_cast<unsigned char>(uch);
      unsigned char flags = kPathCharLookup[out_ch];
      if (flags & SPECIAL) {
        // Needs special handling of some sort.
        int dotlen;
        if ((dotlen = IsDot(spec, i, end)) > 0) {
          // See if this dot was preceded by a slash in the output. We
          // assume that when canonicalizing paths, they will always
          // start with a slash and not a dot, so we don't have to
          // bounds check the output.
          //
          // Note that we check this in the case of dots so we don't have to
          // special case slashes. Since slashes are much more common than
          // dots, this actually increases performance measurably (though
          // slightly).
          DCHECK(output->length() > path_begin_in_output);
          if (output->length() > path_begin_in_output &&
              output->at(output->length() - 1) == '/') {
            // Slash followed by a dot, check to see if this means relative.
            // In every case below, |i| is left on the last input character
            // handled, because the loop's i++ then moves to the next one.
            int consumed_len;
            switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
                                           &consumed_len)) {
              case NOT_A_DIRECTORY:
                // Copy the dot to the output, it means nothing special.
                output->push_back('.');
                i += dotlen - 1;
                break;
              case DIRECTORY_CUR:  // Current directory, just skip the input.
                i += dotlen + consumed_len - 1;
                break;
              case DIRECTORY_UP:
                BackUpToPreviousSlash(path_begin_in_output, output);
                i += dotlen + consumed_len - 1;
                break;
            }
          } else {
            // This dot is not preceded by a slash, it is just part of some
            // file name.
            output->push_back('.');
            i += dotlen - 1;
          }

        } else if (out_ch == '\\') {
          // Convert backslashes to forward slashes
          output->push_back('/');

        } else if (out_ch == '%') {
          // Handle escape sequences.
          unsigned char unescaped_value;
          if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
            // Valid escape sequence, see if we keep, reject, or unescape it.
            char unescaped_flags = kPathCharLookup[unescaped_value];

            if (unescaped_flags & UNESCAPE) {
              // This escaped value shouldn't be escaped, copy it.
              output->push_back(unescaped_value);
            } else if (unescaped_flags & INVALID_BIT) {
              // Invalid escaped character, copy it and remember the error.
              output->push_back('%');
              output->push_back(static_cast<char>(spec[i - 1]));
              output->push_back(static_cast<char>(spec[i]));
              success = false;
            } else {
              // Valid escaped character but we should keep it escaped. We
              // don't want to change the case of any hex letters in case
              // the server is sensitive to that, so we just copy the two
              // characters without checking (DecodeEscaped will have advanced
              // to the last character of the pair).
              output->push_back('%');
              output->push_back(static_cast<char>(spec[i - 1]));
              output->push_back(static_cast<char>(spec[i]));
            }
          } else {
            // Invalid escape sequence. IE7 rejects any URLs with such
            // sequences, while Firefox, IE6, and Safari all pass it through
            // unchanged. We are more permissive, unlike IE7. I don't think
            // this can cause significant problems; if it does, we should
            // change to be more like IE7.
            output->push_back('%');
          }

        } else if (flags & INVALID_BIT) {
          // For NULLs, etc. fail. The character is still written, in escaped
          // form.
          AppendEscapedChar(out_ch, output);
          success = false;

        } else if (flags & ESCAPE_BIT) {
          // This character should be escaped.
          AppendEscapedChar(out_ch, output);
        }
      } else {
        // Nothing special about this character, just append it.
        output->push_back(out_ch);
      }
    }
  }
  return success;
}

// Canonicalizes a complete path component. For a non-empty |path| it makes
// sure the output path starts with a slash, delegates to DoPartialPath, and
// records the written range in |out_path|. Returns DoPartialPath's result, or
// true when |path| is empty.
template<typename CHAR, typename UCHAR>
bool DoPath(const CHAR* spec,
            const url_parse::Component& path,
            CanonOutput* output,
            url_parse::Component* out_path) {
  bool success = true;
  if (path.len > 0) {
    out_path->begin = output->length();

    // Write out an initial slash if the input has none. If we just parse a URL
    // and then canonicalize it, it will of course have a slash already. This
    // check is for the replacement and relative URL resolving cases of file
    // URLs.
    if (!url_parse::IsURLSlash(spec[path.begin]))
      output->push_back('/');

    success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output);
    out_path->len = output->length() - out_path->begin;
  } else {
    // No input, canonical path is a slash.
    // NOTE(review): the slash is appended to |output|, but |out_path| is set
    // to a default-constructed Component rather than a range covering that
    // slash -- confirm that callers expect this.
    output->push_back('/');
    *out_path = url_parse::Component();
  }
  return success;
}

}  // namespace

// 8-bit overload; forwards to DoPath.
bool CanonicalizePath(const char* spec,
                      const url_parse::Component& path,
                      CanonOutput* output,
                      url_parse::Component* out_path) {
  return DoPath<char, unsigned char>(spec, path, output, out_path);
}

// UTF-16 overload; forwards to DoPath.
bool CanonicalizePath(const char16* spec,
                      const url_parse::Component& path,
                      CanonOutput* output,
                      url_parse::Component* out_path) {
  return DoPath<char16, char16>(spec, path, output, out_path);
}

// 8-bit overload; forwards to DoPartialPath.
bool CanonicalizePartialPath(const char* spec,
                             const url_parse::Component& path,
                             int path_begin_in_output,
                             CanonOutput* output) {
  return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output,
                                            output);
}

// UTF-16 overload; forwards to DoPartialPath.
bool CanonicalizePartialPath(const char16* spec,
                             const url_parse::Component& path,
                             int path_begin_in_output,
                             CanonOutput* output) {
  return DoPartialPath<char16, char16>(spec, path, path_begin_in_output,
                                       output);
}

}  // namespace url_canon
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions for canonicalizing "path" URLs. Not to be confused with the path +// of a URL, these are URLs that have no authority section, only a path. For +// example, "javascript:" and "data:". + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + // Scheme: this will append the colon. + bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + // We assume there's no authority for path URLs. Note that hosts should never + // have -1 length. + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->host.reset(); + new_parsed->port.reset(); + + if (parsed.path.is_valid()) { + // Copy the path using path URL's more lax escaping rules (think for + // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all + // ASCII characters alone. This helps readability of JavaStript. 
+ new_parsed->path.begin = output->length(); + int end = parsed.path.end(); + for (int i = parsed.path.begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(source.path[i]); + if (uch < 0x20 || uch >= 0x80) + success &= AppendUTF8EscapedChar(source.path, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + new_parsed->path.len = output->length() - new_parsed->path.begin; + } else { + // Empty path. + new_parsed->path.reset(); + } + + // Assume there's no query or ref. + new_parsed->query.reset(); + new_parsed->ref.reset(); + + return success; +} + +} // namespace + +bool CanonicalizePathURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizePathURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, output, new_parsed); +} + +bool CanonicalizePathURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizePathURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, output, new_parsed); +} + +bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizePathURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizePathURL<char, unsigned char>( + source, 
parsed, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_query.cc b/googleurl/src/url_canon_query.cc new file mode 100644 index 0000000..cee8774 --- /dev/null +++ b/googleurl/src/url_canon_query.cc @@ -0,0 +1,189 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +// Query canonicalization in IE +// ---------------------------- +// IE is very permissive for query parameters specified in links on the page +// (in contrast to links that it constructs itself based on form data). It does +// not unescape any character. It does not reject any escape sequence (be they +// invalid like "%2y" or freaky like %00). +// +// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09), +// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier +// layer since they are removed from all portions of the URL). All other +// characters are passed unmodified. Invalid UTF-16 sequences are preserved as +// well, with each character in the input being converted to UTF-8. It is the +// server's job to make sense of this invalid query. +// +// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page) +// are converted to the invalid character and sent as unescaped UTF-8 (0xef, +// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these +// strings before the URL handler ever sees them. +// +// Our query canonicalization +// -------------------------- +// We escape all non-ASCII characters and control characters, like Firefox. +// This is more conformant to the URL spec, and there do not seem to be many +// problems relating to Firefox's behavior. +// +// Like IE, we will never unescape (although the application may want to try +// unescaping to present the user with a more understandable URL). We will +// replace all invalid sequences (including invalid UTF-16 sequences, which IE +// doesn't) with the "invalid character," and we will escape it. + +namespace url_canon { + +namespace { + +// Returns true if the characters starting at |begin| and going until |end| +// (non-inclusive) are all representable in 7-bits. 
+template<typename CHAR, typename UCHAR> +bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) { + int end = query.end(); + for (int i = query.begin; i < end; i++) { + if (static_cast<UCHAR>(spec[i]) >= 0x80) + return false; + } + return true; +} + +// Appends the given string to the output, escaping characters that are not +// valid query characters (as decided by IsQueryChar). This version will accept +// 8 or 16 bit characters, but assumes that they have only 7-bit values. It +// also assumes that all UTF-8 values are correct, so doesn't bother checking. +template<typename CHAR> +void AppendRaw8BitQueryString(const CHAR* source, int length, + CanonOutput* output) { + for (int i = 0; i < length; i++) { + if (!IsQueryChar(static_cast<unsigned char>(source[i]))) + AppendEscapedChar(static_cast<unsigned char>(source[i]), output); + else // Doesn't need escaping. + output->push_back(static_cast<char>(source[i])); + } +} + +// Runs the converter on the given UTF-8 input. Since the converter expects +// UTF-16, we have to convert first. The converter must be non-NULL. +void RunConverter(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + // This function will replace any misencoded values with the invalid + // character. This is what we want so we don't have to check for error. + RawCanonOutputW<1024> utf16; + ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16); + converter->ConvertFromUTF16(utf16.data(), utf16.length(), output); +} + +// Runs the converter with the given UTF-16 input. We don't have to do +// anything, but this overloaded function allows us to use the same code +// for both UTF-8 and UTF-16 input. 
+void RunConverter(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + converter->ConvertFromUTF16(&spec[query.begin], query.len, output); +} + +// Appends the |query| range of |spec| to |output|, escaping as needed. +// All-ASCII input is appended directly; otherwise |converter| is run first +// when it is non-NULL, and our own UTF-8 conversion is used when it is NULL. +template<typename CHAR, typename UCHAR> +void DoConvertToQueryEncoding(const CHAR* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + if (IsAllASCII<CHAR, UCHAR>(spec, query)) { + // Easy: the input can just be appended with no character set conversions. + AppendRaw8BitQueryString(&spec[query.begin], query.len, output); + + } else { + // Harder: convert to the proper encoding first. + if (converter) { + // Run the converter to get an 8-bit string, then append it, escaping + // necessary values. + RawCanonOutput<1024> eight_bit; + RunConverter(spec, query, converter, &eight_bit); + AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output); + + } else { + // No converter, do our own UTF-8 conversion. + AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output); + } + } +} + +// Canonicalizes |query| into |output|. A query with negative length writes +// nothing and yields an empty |out_query|; otherwise a '?' is written and +// |out_query| covers the text that follows it. +template<typename CHAR, typename UCHAR> +void DoCanonicalizeQuery(const CHAR* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query) { + if (query.len < 0) { + *out_query = url_parse::Component(); + return; + } + + output->push_back('?'); + out_query->begin = output->length(); + + DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output); + + out_query->len = output->length() - out_query->begin; +} + +} // namespace + +void CanonicalizeQuery(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query) { + DoCanonicalizeQuery<char, unsigned char>(spec, query, converter, + output, out_query); +} + +void CanonicalizeQuery(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* 
out_query) { + DoCanonicalizeQuery<char16, char16>(spec, query, converter, + output, out_query); +} + +void ConvertUTF16ToQueryEncoding(const char16* input, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + DoConvertToQueryEncoding<char16, char16>(input, query, + converter, output); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_relative.cc b/googleurl/src/url_canon_relative.cc new file mode 100644 index 0000000..446b951 --- /dev/null +++ b/googleurl/src/url_canon_relative.cc @@ -0,0 +1,571 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Canonicalizer functions for working with and resolving relative URLs. + +#include "base/logging.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + +// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug +// 379034), whereas IE is case-insensitive. +// +// We choose to be more permissive like IE. We don't need to worry about +// unescaping or anything here: neither IE nor Firefox allows this. We also +// don't have to worry about invalid scheme characters since we are comparing +// against the canonical scheme of the base. +// +// The base URL should always be canonical, therefore is ASCII. +template<typename CHAR> +bool AreSchemesEqual(const char* base, + const url_parse::Component& base_scheme, + const CHAR* cmp, + const url_parse::Component& cmp_scheme) { + if (base_scheme.len != cmp_scheme.len) + return false; + for (int i = 0; i < base_scheme.len; i++) { + // We assume the base is already canonical, so we don't have to + // canonicalize it. + if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) != + base[base_scheme.begin + i]) + return false; + } + return true; +} + +#ifdef WIN32 + +// Here, we also allow Windows paths to be represented as "/C:/" so we can be +// consistent about URL paths beginning with slashes. 
This function is like +// DoesBeginWindowsDriveSpec except that it also requires a slash at the +// beginning. +template<typename CHAR> +bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + if (start_offset >= spec_len) + return false; + return url_parse::IsURLSlash(spec[start_offset]) && + url_parse::DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len); +} + +#endif // WIN32 + +// See IsRelativeURL in the header file for usage. +template<typename CHAR> +bool DoIsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const CHAR* url, + int url_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component) { + *is_relative = false; // So we can default later to not relative. + + // Trim whitespace and construct a new range for the substring. + int begin = 0; + url_parse::TrimURL(url, &begin, &url_len); + if (begin >= url_len) { + // Empty URLs are relative, but do nothing. + *relative_component = url_parse::Component(begin, 0); + *is_relative = true; + return true; + } + +#ifdef WIN32 + // We special case paths like "C:\foo" so they can link directly to the + // file on Windows (IE compatibility). The security domain stuff should + // prevent a link like this from actually being followed if it's on a + // web page. + // + // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/" + // as relative, as this will just replace the path when the base scheme + // is a file and the answer will still be correct. + // + // We require strict backslashes when detecting UNC since two forward + // slashes should be treated as a relative URL with a hostname. + if (url_parse::DoesBeginWindowsDriveSpec(url, begin, url_len) || + url_parse::DoesBeginUNCPath(url, begin, url_len, true)) + return true; +#endif // WIN32 + + // See if we've got a scheme, if not, we know this is a relative URL. + // BUT: Just because we have a scheme, doesn't make it absolute. 
+ // "http:foo.html" is a relative URL with path "foo.html". If the scheme is + // empty, we treat it as relative (":foo") like IE does. + url_parse::Component scheme; + if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) { + // Don't allow relative URLs if the base scheme doesn't support it. + if (!is_base_hierarchical) + return false; + + *relative_component = url_parse::MakeRange(begin, url_len); + *is_relative = true; + return true; + } + + // If the scheme isn't valid, then it's relative. + int scheme_end = scheme.end(); + for (int i = scheme.begin; i < scheme_end; i++) { + if (!CanonicalSchemeChar(url[i])) { + *relative_component = url_parse::MakeRange(begin, url_len); + *is_relative = true; + return true; + } + } + + // If the scheme is not the same, then we can't count it as relative. + if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme)) + return true; + + // When the scheme that they both share is not hierarchical, treat the + // incoming scheme as absolute (this way with the base of "data:foo", + // "data:bar" will be reported as absolute. + if (!is_base_hierarchical) + return true; + + // ExtractScheme guarantees that the colon immediately follows what it + // considers to be the scheme. CountConsecutiveSlashes will handle the + // case where the begin offset is the end of the input. + int colon_offset = scheme.end(); + int num_slashes = url_parse::CountConsecutiveSlashes(url, colon_offset + 1, + url_len); + + if (num_slashes == 0 || num_slashes == 1) { + // No slashes means it's a relative path like "http:foo.html". One slash + // is an absolute path. "http:/home/foo.html" + *is_relative = true; + *relative_component = url_parse::MakeRange(colon_offset + 1, url_len); + return true; + } + + // Two or more slashes after the scheme we treat as absolute. + return true; +} + +// Copies all characters in the range [begin, end) of |spec| to the output, +// up until and including the last slash. 
There should be a slash in the +// range, if not, nothing will be copied. +// +// The input is assumed to be canonical, so we search only for exact slashes +// and not backslashes as well. We also know that it's ASCII. +void CopyToLastSlash(const char* spec, + int begin, + int end, + CanonOutput* output) { + // Find the last slash. + int last_slash = -1; + for (int i = end - 1; i >= begin; i--) { + if (spec[i] == '/') { + last_slash = i; + break; + } + } + if (last_slash < 0) + return; // No slash. + + // Copy. + for (int i = begin; i <= last_slash; i++) + output->push_back(spec[i]); +} + +// Copies a single component from the source to the output. This is used +// when resolving relative URLs and a given component is unchanged. Since the +// source should already be canonical, we don't have to do anything special, +// and the input is ASCII. +void CopyOneComponent(const char* source, + const url_parse::Component& source_component, + CanonOutput* output, + url_parse::Component* output_component) { + if (source_component.len < 0) { + // This component is not present. + *output_component = url_parse::Component(); + return; + } + + output_component->begin = output->length(); + int source_end = source_component.end(); + for (int i = source_component.begin; i < source_end; i++) + output->push_back(source[i]); + output_component->len = output->length() - output_component->begin; +} + +#ifdef WIN32 + +// Called on Windows when the base URL is a file URL, this will copy the "C:" +// to the output, if there is a drive letter and if that drive letter is not +// being overridden by the relative URL. Otherwise, do nothing. +// +// It will return the index of the beginning of the next character in the +// base to be processed: if there is a "C:", the slash after it, or if +// there is no drive letter, the slash at the beginning of the path, or +// the end of the base. This can be used as the starting offset for further +// path processing. 
+template<typename CHAR> +int CopyBaseDriveSpecIfNecessary(const char* base_url, + int base_path_begin, + int base_path_end, + const CHAR* relative_url, + int path_start, + int relative_url_len, + CanonOutput* output) { + if (base_path_begin >= base_path_end) + return base_path_begin; // No path. + + // If the relative begins with a drive spec, don't do anything. The existing + // drive spec in the base will be replaced. + if (url_parse::DoesBeginWindowsDriveSpec(relative_url, + path_start, relative_url_len)) { + return base_path_begin; // Relative URL path is "C:/foo" + } + + // The path should begin with a slash (as all canonical paths do). We check + // if it is followed by a drive letter and copy it. + if (DoesBeginSlashWindowsDriveSpec(base_url, + base_path_begin, + base_path_end)) { + // Copy the two-character drive spec to the output. It will now look like + // "file:///C:" so the rest of it can be treated like a standard path. + output->push_back('/'); + output->push_back(base_url[base_path_begin + 1]); + output->push_back(base_url[base_path_begin + 2]); + return base_path_begin + 3; + } + + return base_path_begin; +} + +#endif // WIN32 + +// A subroutine of DoResolveRelativeURL, this resolves the URL knowing that +// the input is a relative path or less (query or ref). +template<typename CHAR> +bool DoResolveRelativePath(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + bool success = true; + + // We know the authority section didn't change, copy it to the output. We + // also know we have a path so can copy up to there. + url_parse::Component path, query, ref; + url_parse::ParsePathInternal(relative_url, + relative_component, + &path, + &query, + &ref); + // Canonical URLs always have a path, so we can use that offset. 
+ output->Append(base_url, base_parsed.path.begin); + + if (path.len > 0) { + // The path is replaced or modified. + int true_path_begin = output->length(); + + // For file: URLs on Windows, we don't want to treat the drive letter and + // colon as part of the path for relative file resolution when the + // incoming URL does not provide a drive spec. We save the true path + // beginning so we can fix it up after we are done. + int base_path_begin = base_parsed.path.begin; +#ifdef WIN32 + if (base_is_file) { + base_path_begin = CopyBaseDriveSpecIfNecessary( + base_url, base_parsed.path.begin, base_parsed.path.end(), + relative_url, relative_component.begin, relative_component.end(), + output); + // Now the output looks like either "file://" or "file:///C:" + // and we can start appending the rest of the path. |base_path_begin| + // points to the character in the base that comes next. + } +#endif // WIN32 + + if (url_parse::IsURLSlash(relative_url[path.begin])) { + // Easy case: the path is an absolute path on the server, so we can + // just replace everything from the path on with the new versions. + // Since the input should be canonical hierarchical URL, we should + // always have a path. + success &= CanonicalizePath(relative_url, path, + output, &out_parsed->path); + } else { + // Relative path, replace the query, and reference. We take the + // original path with the file part stripped, and append the new path. + // The canonicalizer will take care of resolving ".." and "." + int path_begin = output->length(); + CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(), + output); + success &= CanonicalizePartialPath(relative_url, path, path_begin, + output); + out_parsed->path = url_parse::MakeRange(path_begin, output->length()); + + // Copy the rest of the stuff after the path from the relative path. + } + + // Finish with the query and reference part (these can't fail). 
+ CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + + // Fix the path beginning to add back the "C:" we may have written above. + out_parsed->path = url_parse::MakeRange(true_path_begin, + out_parsed->path.end()); + return success; + } + + // If we get here, the path is unchanged: copy to output. + CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path); + + if (query.is_valid()) { + // Just the query specified, replace the query and reference (ignore + // failures for refs) + CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // If we get here, the query is unchanged: copy to output. Note that the + // range of the query parameter doesn't include the question mark, so we + // have to add it manually if there is a component. + if (base_parsed.query.is_valid()) + output->push_back('?'); + CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query); + + if (ref.is_valid()) { + // Just the reference specified: replace it (ignoring failures). + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // We should always have something to do in this function, the caller checks + // that some component is being replaced. + DCHECK(false) << "Not reached"; + return success; +} + +// Resolves a relative URL that contains a host. Typically, these will +// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which +// should be kept from the original URL is the scheme. 
+template<typename CHAR> +bool DoResolveRelativeHost(const char* base_url, + const url_parse::Parsed& base_parsed, + const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Parse the relative URL, just like we would for anything following a + // scheme. Pass the whole |relative_url| with absolute offsets: the resulting + // components are handed to |replacements| below together with + // |relative_url|, so they must index into it rather than into a substring. + url_parse::Parsed relative_parsed; // Everything but the scheme is valid. + url_parse::ParseAfterScheme(relative_url, relative_component.end(), + relative_component.begin, &relative_parsed); + + // Now we can just use the replacement function to replace all the necessary + // parts of the old URL with the new one. + Replacements<CHAR> replacements; + replacements.SetUsername(relative_url, relative_parsed.username); + replacements.SetPassword(relative_url, relative_parsed.password); + replacements.SetHost(relative_url, relative_parsed.host); + replacements.SetPort(relative_url, relative_parsed.port); + replacements.SetPath(relative_url, relative_parsed.path); + replacements.SetQuery(relative_url, relative_parsed.query); + replacements.SetRef(relative_url, relative_parsed.ref); + + return ReplaceStandardURL(base_url, base_parsed, replacements, + query_converter, output, out_parsed); +} + +// Resolves a relative URL that happens to be an absolute file path. Examples +// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo". +template<typename CHAR> +bool DoResolveAbsoluteFile(const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Parse the file URL. The file URL parsing function uses the same logic + // as we do for determining if the file is absolute, in which case it will + // not bother to look for a scheme. 
+ url_parse::Parsed relative_parsed; + url_parse::ParseFileURL(&relative_url[relative_component.begin], + relative_component.len, &relative_parsed); + + return CanonicalizeFileURL(&relative_url[relative_component.begin], + relative_component.len, relative_parsed, + query_converter, output, out_parsed); +} + +// TODO(brettw) treat two slashes as root like Mozilla for FTP? +template<typename CHAR> +bool DoResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Starting point for our output parsed. We'll fix what we change. + *out_parsed = base_parsed; + + // Sanity check: the input should have a host or we'll break badly below. + // We can only resolve relative URLs with base URLs that have hosts and + // paths (even the default path of "/" is OK). + // + // We allow hosts with no length so we can handle file URLs, for example. + if (base_parsed.path.len <= 0) { + // On error, return the input (resolving a relative URL on a non-relative + // base = the base). + int base_len = base_parsed.Length(); + for (int i = 0; i < base_len; i++) + output->push_back(base_url[i]); + return false; + } + + if (relative_component.len <= 0) { + // Empty relative URL, make no changes. + int base_len = base_parsed.Length(); + for (int i = 0; i < base_len; i++) + output->push_back(base_url[i]); + return true; + } + + int num_slashes = url_parse::CountConsecutiveSlashes( + relative_url, relative_component.begin, relative_component.end()); + +#ifdef WIN32 + // On Windows, two slashes for a file path (regardless of which direction + // they are) means that it's UNC. Two backslashes on any base scheme mean + // that it's an absolute UNC path (we use the base_is_file flag to control + // how strict the UNC finder is). 
+ // + // We also allow Windows absolute drive specs on any scheme (for example + // "c:\foo") like IE does. There must be no preceeding slashes in this + // case (we reject anything like "/c:/foo") because that should be treated + // as a path. For file URLs, we allow any number of slashes since that would + // be setting the path. + // + // This assumes the absolute path resolver handles absolute URLs like this + // properly. url_util::DoCanonicalize does this. + int after_slashes = relative_component.begin + num_slashes; + if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin, + relative_component.end(), !base_is_file) || + ((num_slashes == 0 || base_is_file) && + url_parse::DoesBeginWindowsDriveSpec(relative_url, after_slashes, + relative_component.end()))) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#else + // Other platforms need explicit handling for file: URLs with multiple + // slashes because the generic scheme parsing always extracts a host, but a + // file: URL only has a host if it has exactly 2 slashes. This also + // handles the special case where the URL is only slashes, since that + // doesn't have a host part either. + if (base_is_file && + (num_slashes > 2 || num_slashes == relative_component.len)) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#endif + + // Any other double-slashes mean that this is relative to the scheme. + if (num_slashes >= 2) { + return DoResolveRelativeHost(base_url, base_parsed, + relative_url, relative_component, + query_converter, output, out_parsed); + } + + // When we get here, we know that the relative URL is on the same host. 
+ return DoResolveRelativePath(base_url, base_parsed, base_is_file, + relative_url, relative_component, + query_converter, output, out_parsed); +} + +} // namespace + +bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component) { + return DoIsRelativeURL<char>( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component) { + return DoIsRelativeURL<char16>( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + return DoResolveRelativeURL<char>( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + return DoResolveRelativeURL<char16>( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_stdstring.h b/googleurl/src/url_canon_stdstring.h new file mode 100644 index 0000000..2241eb1 --- /dev/null +++ b/googleurl/src/url_canon_stdstring.h @@ -0,0 +1,133 @@ +// 
Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This header file defines a canonicalizer output method class for STL +// strings. Because the canonicalizer tries not to be dependent on the STL, +// we have segregated it here. + +#ifndef GOOGLEURL_SRC_URL_CANON_STRING_H__ +#define GOOGLEURL_SRC_URL_CANON_STRING_H__ + +#include <string> +#include "googleurl/src/url_canon.h" + +namespace url_canon { + +// Write into a std::string given in the constructor. 
This object odes not own +// the string itself, and the user must ensure that the string stays alive +// throughout the lifetime of this object. +// +// The given string will be appended to; any existing data in the string will +// be preserved. The caller should reserve() the amount of data in the string +// they expect to be written. We will resize if necessary, but that's slow. +// +// Note that when canonicalization is complete, the string will likely have +// unused space at the end because we make the string very big to start out +// with (by |initial_size|). This ends up being important because resize +// operations are slow, and because the base class needs to write directly +// into the buffer. +// +// Therefore, the user should call Complete() before using the string that +// this class wrote into. +class StdStringCanonOutput : public CanonOutput { + public: + StdStringCanonOutput(std::string* str) + : CanonOutput(), + str_(str) { + cur_len_ = static_cast<int>(str_->size()); // Append to existing data. + str_->resize(str_->capacity()); + buffer_ = &(*str_)[0]; + buffer_len_ = static_cast<int>(str_->size()); + } + virtual ~StdStringCanonOutput() { + // Nothing to do, we don't own the string. + } + + // Must be called after writing has completed but before the string is used. + void Complete() { + str_->resize(cur_len_); + buffer_len_ = cur_len_; + } + + virtual void Resize(int sz) { + str_->resize(sz); + buffer_ = &(*str_)[0]; + buffer_len_ = sz; + } + + protected: + std::string* str_; +}; + +// An extension of the Replacements class that allows the setters to use +// standard strings. +// +// The strings passed as arguments are not copied and must remain valid until +// this class goes out of scope. 
+template<typename STR> +class StdStringReplacements : + public url_canon::Replacements<typename STR::value_type> { + public: + void SetSchemeStr(const STR& s) { + this->SetScheme(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetUsernameStr(const STR& s) { + this->SetUsername(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPasswordStr(const STR& s) { + this->SetPassword(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetHostStr(const STR& s) { + this->SetHost(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPortStr(const STR& s) { + this->SetPort(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPathStr(const STR& s) { + this->SetPath(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetQueryStr(const STR& s) { + this->SetQuery(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetRefStr(const STR& s) { + this->SetRef(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_STRING_H__ diff --git a/googleurl/src/url_canon_stdurl.cc b/googleurl/src/url_canon_stdurl.cc new file mode 100644 index 0000000..41a8fa9 --- /dev/null +++ b/googleurl/src/url_canon_stdurl.cc @@ -0,0 +1,202 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions to canonicalize "standard" URLs, which are ones that have an +// authority section including a host name. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + // Scheme: this will append the colon. + bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + // Authority (username, password, host, port) + bool have_authority; + if (parsed.username.is_valid() || parsed.password.is_valid() || + parsed.host.is_nonempty() || parsed.port.is_valid()) { + have_authority = true; + + // Only write the authority separators when we have a scheme. 
+ if (parsed.scheme.is_valid()) { + output->push_back('/'); + output->push_back('/'); + } + + // User info: the canonicalizer will handle the : and @. + success &= CanonicalizeUserInfo(source.username, parsed.username, + source.password, parsed.password, + output, + &new_parsed->username, + &new_parsed->password); + + success &= CanonicalizeHost(source.host, parsed.host, + output, &new_parsed->host); + + // Host must not be empty for standard URLs. + if (!parsed.host.is_nonempty()) + success = false; + + // Port: the port canonicalizer will handle the colon. + int default_port = DefaultPortForScheme( + &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len); + success &= CanonicalizePort(source.port, parsed.port, default_port, + output, &new_parsed->port); + } else { + // No authority, clear the components. + have_authority = false; + new_parsed->host.reset(); + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->port.reset(); + success = false; // Standard URLs must have an authority. + } + + // Path + if (parsed.path.is_valid()) { + success &= CanonicalizePath(source.path, parsed.path, + output, &new_parsed->path); + } else if (have_authority || + parsed.query.is_valid() || parsed.ref.is_valid()) { + // When we have an empty path, make up a path when we have an authority + // or something following the path. The only time we allow an empty + // output path is when there is nothing else. + new_parsed->path = url_parse::Component(output->length(), 1); + output->push_back('/'); + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + + // Ref: ignore failure for this, since the page can probably still be loaded. 
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace + + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. +int DefaultPortForScheme(const char* scheme, int scheme_len) { + int default_port = url_parse::PORT_UNSPECIFIED; + switch (scheme_len) { + case 4: + if (!strncmp(scheme, "http", scheme_len)) + default_port = 80; + break; + case 5: + if (!strncmp(scheme, "https", scheme_len)) + default_port = 443; + break; + case 3: + if (!strncmp(scheme, "ftp", scheme_len)) + default_port = 21; + else if (!strncmp(scheme, "wss", scheme_len)) + default_port = 443; + break; + case 6: + if (!strncmp(scheme, "gopher", scheme_len)) + default_port = 70; + break; + case 2: + if (!strncmp(scheme, "ws", scheme_len)) + default_port = 80; + break; + } + return default_port; +} + +bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeStandardURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, query_converter, + output, new_parsed); +} + +bool CanonicalizeStandardURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeStandardURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, query_converter, + output, new_parsed); +} + +bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeStandardURL<char, unsigned char>( + source, 
parsed, query_converter, output, new_parsed); +} + +// For 16-bit replacements, we turn all the replacements into UTF-8 so the +// regular codepath can be used. +bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeStandardURL<char, unsigned char>( + source, parsed, query_converter, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc new file mode 100644 index 0000000..c5be423 --- /dev/null +++ b/googleurl/src/url_canon_unittest.cc @@ -0,0 +1,1936 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <errno.h>
#include <unicode/ucnv.h>

#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_icu.h"
#include "googleurl/src/url_canon_internal.h"
#include "googleurl/src/url_canon_stdstring.h"
#include "googleurl/src/url_parse.h"
#include "googleurl/src/url_test_utils.h"
#include "testing/gtest/include/gtest/gtest.h"

// Some implementations of base/basictypes.h may define ARRAYSIZE.
// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
// which is in our version of basictypes.h.
#ifndef ARRAYSIZE
#define ARRAYSIZE ARRAYSIZE_UNSAFE
#endif

using url_test_utils::WStringToUTF16;
using url_test_utils::ConvertUTF8ToUTF16;
using url_test_utils::ConvertUTF16ToUTF8;
using url_canon::CanonHostInfo;

namespace {

// One table-driven test case for a single-component canonicalizer taking
// 8-bit input. The fields are filled by brace initialization in the test
// tables, so their order must not change.
struct ComponentCase {
  // Raw component text handed to the canonicalizer.
  const char* input;
  // Exact text the canonicalizer is expected to write to its output.
  const char* expected;
  // Expected begin/len of the component within the written output.
  url_parse::Component expected_component;
  // Expected return value of the canonicalizer.
  bool expected_success;
};

// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
// treat each input as optional, and will only try processing if non-NULL.
// The output is always 8-bit.
struct DualComponentCase {
  // 8-bit form of the input, or NULL to skip the 8-bit run.
  const char* input8;
  // Wide form of the input, or NULL to skip the 16-bit run.
  const wchar_t* input16;
  const char* expected;
  url_parse::Component expected_component;
  bool expected_success;
};

// Test cases for CanonicalizeIPAddress(). The inputs are identical to
// DualComponentCase, but the output has extra CanonHostInfo fields.
+struct IPAddressCase { + const char* input8; + const wchar_t* input16; + const char* expected; + url_parse::Component expected_component; + + // CanonHostInfo fields, for verbose output. + CanonHostInfo::Family expected_family; + int expected_num_ipv4_components; +}; + +struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; +}; + +// Wrapper around a UConverter object that managers creation and destruction. +class UConvScoper { + public: + explicit UConvScoper(const char* charset_name) { + UErrorCode err = U_ZERO_ERROR; + converter_ = ucnv_open(charset_name, &err); + } + + ~UConvScoper() { + if (converter_) + ucnv_close(converter_); + } + + // Returns the converter object, may be NULL. + UConverter* converter() const { return converter_; } + + private: + UConverter* converter_; +}; + +// Magic string used in the replacements code that tells SetupReplComp to +// call the clear function. +const char kDeleteComp[] = "|"; + +// Sets up a replacement for a single component. This is given pointers to +// the set and clear function for the component being replaced, and will +// either set the component (if it exists) or clear it (if the replacement +// string matches kDeleteComp). +// +// This template is currently used only for the 8-bit case, and the strlen +// causes it to fail in other cases. It is left a template in case we have +// tests for wide replacements. 
+template<typename CHAR> +void SetupReplComp( + void (url_canon::Replacements<CHAR>::*set)(const CHAR*, + const url_parse::Component&), + void (url_canon::Replacements<CHAR>::*clear)(), + url_canon::Replacements<CHAR>* rep, + const CHAR* str) { + if (str && str[0] == kDeleteComp[0]) { + (rep->*clear)(); + } else if (str) { + (rep->*set)(str, url_parse::Component(0, static_cast<int>(strlen(str)))); + } +} + +} // namespace + +TEST(URLCanonTest, UTF) { + // Low-level test that we handle reading, canonicalization, and writing + // UTF-8/UTF-16 strings properly. + struct UTFCase { + const char* input8; + const wchar_t* input16; + bool expected_success; + const char* output; + } utf_cases[] = { + // Valid canonical input should get passed through & escaped. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"}, + // Test a characer that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"}, + // Non-shortest-form UTF-8 are invalid. The bad char should be replaced + // with the invalid character (EF BF DB in UTF-8). + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"}, + // Invalid UTF-8 sequences should be marked as invalid (the first + // sequence is truncated). + {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"}, + // Character going off the end. + {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"}, + // ...same with low surrogates with no high surrogate. + {"\xed\xb0\x80", L"\xdc00", false, "%EF%BF%BD"}, + // Test a UTF-8 encoded surrogate value is marked as invalid. 
+ // ED A0 80 = U+D800 + {"\xed\xa0\x80", NULL, false, "%EF%BF%BD"}, + }; + + std::string out_str; + for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) { + if (utf_cases[i].input8) { + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + int input_len = static_cast<int>(strlen(utf_cases[i].input8)); + bool success = true; + for (int ch = 0; ch < input_len; ch++) { + success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len, + &output); + } + output.Complete(); + EXPECT_EQ(utf_cases[i].expected_success, success); + EXPECT_EQ(std::string(utf_cases[i].output), out_str); + } + if (utf_cases[i].input16) { + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + string16 input_str(WStringToUTF16(utf_cases[i].input16)); + int input_len = static_cast<int>(input_str.length()); + bool success = true; + for (int ch = 0; ch < input_len; ch++) { + success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len, + &output); + } + output.Complete(); + EXPECT_EQ(utf_cases[i].expected_success, success); + EXPECT_EQ(std::string(utf_cases[i].output), out_str); + } + + if (utf_cases[i].input8 && utf_cases[i].input16 && + utf_cases[i].expected_success) { + // Check that the UTF-8 and UTF-16 inputs are equivalent. + + // UTF-16 -> UTF-8 + std::string input8_str(utf_cases[i].input8); + string16 input16_str(WStringToUTF16(utf_cases[i].input16)); + EXPECT_EQ(input8_str, ConvertUTF16ToUTF8(input16_str)); + + // UTF-8 -> UTF-16 + EXPECT_EQ(input16_str, ConvertUTF8ToUTF16(input8_str)); + } + } +} + +TEST(URLCanonTest, ICUCharsetConverter) { + struct ICUCase { + const wchar_t* input; + const char* encoding; + const char* expected; + } icu_cases[] = { + // UTF-8. + {L"Hello, world", "utf-8", "Hello, world"}, + {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"}, + // Non-BMP UTF-8. 
+ {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"}, + // Big5 + {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"}, + // Unrepresentable character in the destination set. + {L"hello\x4f60\x06de\x597dworld", "big5", "hello\xa7\x41%26%231758%3B\xa6\x6eworld"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) { + UConvScoper conv(icu_cases[i].encoding); + ASSERT_TRUE(conv.converter() != NULL); + url_canon::ICUCharsetConverter converter(conv.converter()); + + std::string str; + url_canon::StdStringCanonOutput output(&str); + + string16 input_str(WStringToUTF16(icu_cases[i].input)); + int input_len = static_cast<int>(input_str.length()); + converter.ConvertFromUTF16(input_str.c_str(), input_len, &output); + output.Complete(); + + EXPECT_STREQ(icu_cases[i].expected, str.c_str()); + } + + // Test string sizes around the resize boundary for the output to make sure + // the converter resizes as needed. + const int static_size = 16; + UConvScoper conv("utf-8"); + ASSERT_TRUE(conv.converter()); + url_canon::ICUCharsetConverter converter(conv.converter()); + for (int i = static_size - 2; i <= static_size + 2; i++) { + // Make a string with the appropriate length. + string16 input; + for (int ch = 0; ch < i; ch++) + input.push_back('a'); + + url_canon::RawCanonOutput<static_size> output; + converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()), + &output); + EXPECT_EQ(input.length(), static_cast<size_t>(output.length())); + } +} + +TEST(URLCanonTest, Scheme) { + // Here, we're mostly testing that unusual characters are handled properly. + // The canonicalizer doesn't do any parsing or whitespace detection. It will + // also do its best on error, and will escape funny sequences (these won't be + // valid schemes and it will return error). + // + // Note that the canonicalizer will append a colon to the output to separate + // out the rest of the URL, which is not present in the input. 
We check, + // however, that the output range includes everything but the colon. + ComponentCase scheme_cases[] = { + {"http", "http:", url_parse::Component(0, 4), true}, + {"HTTP", "http:", url_parse::Component(0, 4), true}, + {" HTTP ", "%20http%20:", url_parse::Component(0, 10), false}, + {"htt: ", "htt%3A%20:", url_parse::Component(0, 9), false}, + {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", url_parse::Component(0, 22), false}, + // Don't re-escape something already escaped. Note that it will + // "canonicalize" the 'A' to 'a', but that's OK. + {"ht%3Atp", "ht%3atp:", url_parse::Component(0, 7), false}, + }; + + std::string out_str; + + for (size_t i = 0; i < arraysize(scheme_cases); i++) { + int url_len = static_cast<int>(strlen(scheme_cases[i].input)); + url_parse::Component in_comp(0, url_len); + url_parse::Component out_comp; + + out_str.clear(); + url_canon::StdStringCanonOutput output1(&out_str); + bool success = url_canon::CanonicalizeScheme(scheme_cases[i].input, + in_comp, &output1, &out_comp); + output1.Complete(); + + EXPECT_EQ(scheme_cases[i].expected_success, success); + EXPECT_EQ(std::string(scheme_cases[i].expected), out_str); + EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version + out_str.clear(); + url_canon::StdStringCanonOutput output2(&out_str); + + string16 wide_input(ConvertUTF8ToUTF16(scheme_cases[i].input)); + in_comp.len = static_cast<int>(wide_input.length()); + success = url_canon::CanonicalizeScheme(wide_input.c_str(), in_comp, + &output2, &out_comp); + output2.Complete(); + + EXPECT_EQ(scheme_cases[i].expected_success, success); + EXPECT_EQ(std::string(scheme_cases[i].expected), out_str); + EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); + } + + // Test the case where the scheme is declared nonexistant, it should be 
+ // converted into an empty scheme. + url_parse::Component out_comp; + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + EXPECT_TRUE(url_canon::CanonicalizeScheme("", url_parse::Component(0, -1), + &output, &out_comp)); + output.Complete(); + + EXPECT_EQ(std::string(":"), out_str); + EXPECT_EQ(0, out_comp.begin); + EXPECT_EQ(0, out_comp.len); +} + +TEST(URLCanonTest, Host) { + IPAddressCase host_cases[] = { + // Basic canonicalization, uppercase should be converted to lowercase. + {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1}, + // Spaces and some other characters should be escaped. + {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", url_parse::Component(0, 22), CanonHostInfo::NEUTRAL, -1}, + // Exciting different types of spaces! + {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), CanonHostInfo::NEUTRAL, -1}, + // Other types of space (no-break, zero-width, zero-width-no-break) are + // name-prepped away to nothing. + {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1}, + // Ideographic full stop (full-width period for Chinese, etc.) should be + // treated as a dot. + {NULL, L"www.foo\x3002"L"bar.com", "www.foo.bar.com", url_parse::Component(0, 15), CanonHostInfo::NEUTRAL, -1}, + // Invalid unicode characters should fail... + // ...In wide input, ICU will barf and we'll end up with the input as + // escaped UTF-8 (the invalid character should be replaced with the + // replacement character). + {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1}, + // ...This is the same as previous but with with escaped. 
+ {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1}, + // Test name prepping, fullwidth input should be converted to ASCII and NOT + // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16. + {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", url_parse::Component(0, 6), CanonHostInfo::NEUTRAL, -1}, + // Test that fullwidth escaped values are properly name-prepped, + // then converted or rejected. + // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1}, + {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1}, + // ...%00 in fullwidth should fail (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1}, + {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1}, + // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN + {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1}, + // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped + // UTF-8 (wide case). The output should be equivalent to the true wide + // character input above). + {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1}, + // Invalid escaped characters should fail and the percents should be + // escaped. + {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", url_parse::Component(0, 10), CanonHostInfo::BROKEN, -1}, + // If we get an invalid character that has been escaped. 
+ {"%25", L"%25", "%25", url_parse::Component(0, 3), CanonHostInfo::BROKEN, -1}, + {"hello%00", L"hello%00", "hello%00", url_parse::Component(0, 8), CanonHostInfo::BROKEN, -1}, + // Escaped numbers should be treated like IP addresses if they are. + {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + // Invalid escaping should trigger the regular host error handling. + {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", url_parse::Component(0, 17), CanonHostInfo::BROKEN, -1}, + // Something that isn't exactly an IP should get treated as a host and + // spaces escaped. + {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", url_parse::Component(0, 19), CanonHostInfo::NEUTRAL, -1}, + // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP. + // These are "0Xc0.0250.01" in fullwidth. + {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + // Broken IP addresses get marked as such. + {"192.168.0.257", L"192.168.0.257", "192.168.0.257", url_parse::Component(0, 13), CanonHostInfo::BROKEN, -1}, + {"[google.com]", L"[google.com]", "[google.com]", url_parse::Component(0, 12), CanonHostInfo::BROKEN, -1}, + // Cyrillic letter followed buy ( should return punicode for ( escaped before punicode string was created. I.e. + // if ( is escaped after punicode is created we would get xn--%28-8tb (incorrect). + {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", url_parse::Component(0, 11), CanonHostInfo::NEUTRAL, -1}, + }; + + // CanonicalizeHost() non-verbose. 
+ std::string out_str; + for (size_t i = 0; i < arraysize(host_cases); i++) { + // Narrow version. + if (host_cases[i].input8) { + int host_len = static_cast<int>(strlen(host_cases[i].input8)); + url_parse::Component in_comp(0, host_len); + url_parse::Component out_comp; + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + bool success = url_canon::CanonicalizeHost(host_cases[i].input8, in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN, + success); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len); + } + + // Wide version. + if (host_cases[i].input16) { + string16 input16(WStringToUTF16(host_cases[i].input16)); + int host_len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, host_len); + url_parse::Component out_comp; + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + bool success = url_canon::CanonicalizeHost(input16.c_str(), in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN, + success); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len); + } + } + + // CanonicalizeHostVerbose() + for (size_t i = 0; i < arraysize(host_cases); i++) { + // Narrow version. 
+ if (host_cases[i].input8) { + int host_len = static_cast<int>(strlen(host_cases[i].input8)); + url_parse::Component in_comp(0, host_len); + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + CanonHostInfo host_info; + + url_canon::CanonicalizeHostVerbose(host_cases[i].input8, in_comp, + &output, &host_info); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len); + if (host_cases[i].expected_family == CanonHostInfo::IPV4) { + EXPECT_EQ(host_cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } + + // Wide version. + if (host_cases[i].input16) { + string16 input16(WStringToUTF16(host_cases[i].input16)); + int host_len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, host_len); + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + CanonHostInfo host_info; + + url_canon::CanonicalizeHostVerbose(input16.c_str(), in_comp, + &output, &host_info); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len); + if (host_cases[i].expected_family == CanonHostInfo::IPV4) { + EXPECT_EQ(host_cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } + } +} + +TEST(URLCanonTest, IPv4) { + IPAddressCase cases[] = { + // Empty is not an IP address. + {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {".", L".", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Regular IP addresses in different bases. 
+ {"192.168.0.1", L"192.168.0.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + // Non-IP addresses due to invalid characters. + {"192.168.9.com", L"192.168.9.com", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Invalid characters for the base should be rejected. + {"19a.168.0.1", L"19a.168.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"0308.0250.00.01", L"0308.0250.00.01", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // If there are not enough components, the last one should fill them out. + {"192", L"192", "0.0.0.192", url_parse::Component(0, 9), CanonHostInfo::IPV4, 1}, + {"0xC0a80001", L"0xC0a80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1}, + {"030052000001", L"030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1}, + {"000030052000001", L"000030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1}, + {"192.168", L"192.168", "192.0.0.168", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2}, + {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2}, + {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2}, + {"192.168.1", L"192.168.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + // Too many components means not an IP address. + {"192.168.0.0.1", L"192.168.0.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // We allow a single trailing dot. 
+ {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + {"192.168.0.1. hello", L"192.168.0.1. hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"192.168.0.1..", L"192.168.0.1..", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Two dots in a row means not an IP address. + {"192.168..1", L"192.168..1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Any numerical overflow should be marked as BROKEN. + {"0x100.0", L"0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0x100.0.0", L"0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0x100.0.0.0", L"0x100.0.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0x100.0.0", L"0.0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0.0x100.0", L"0.0.0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0.0.0x100", L"0.0.0.0x100", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0.0x10000", L"0.0.0x10000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0x1000000", L"0.0x1000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0x100000000", L"0x100000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Repeat the previous tests, minus 1, to verify boundaries. 
+ {"0xFF.0", L"0xFF.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 2}, + {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 3}, + {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", url_parse::Component(0, 13), CanonHostInfo::IPV4, 2}, + {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", url_parse::Component(0, 15), CanonHostInfo::IPV4, 1}, + // Old truncation tests. They're all "BROKEN" now. + {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"192.168.0.257", L"192.168.0.257", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"192.168.0xa20001", L"192.168.0xa20001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"192.015052000001", L"192.015052000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0X12C0a80001", L"0X12C0a80001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"276.1.2", L"276.1.2", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Spaces should be rejected. + {"192.168.0.1 hello", L"192.168.0.1 hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Very large numbers. 
+ {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", url_parse::Component(0, 11), CanonHostInfo::BROKEN, -1}, + // A number has no length limit, but long numbers can still overflow. + {"00000000000000000001", L"00000000000000000001", "0.0.0.1", url_parse::Component(0, 7), CanonHostInfo::IPV4, 1}, + {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // If a long component is non-numeric, it's a hostname, *not* a broken IP. + {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Truncation of all zeros should still result in 0. + {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", url_parse::Component(0, 7), CanonHostInfo::IPV4, 4}, + }; + + for (size_t i = 0; i < arraysize(cases); i++) { + // 8-bit version. + url_parse::Component component(0, + static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_canon::CanonHostInfo host_info; + url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1, + &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + + // 16-bit version. 
+ string16 input16(WStringToUTF16(cases[i].input16)); + component = url_parse::Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + url_canon::StdStringCanonOutput output2(&out_str2); + url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2, + &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } +} + +TEST(URLCanonTest, IPv6) { + IPAddressCase cases[] = { + // Empty is not an IP address. + {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Non-IPs with [:] characters are marked BROKEN. + {":", L":", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[", L"[", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:", L"[:", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"]", L"]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {":]", L":]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[]", L"[]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:]", L"[:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Regular IP address is invalid without bounding '[' and ']'. + {"2001:db8::1", L"2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[2001:db8::1", L"[2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"2001:db8::1]", L"2001:db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Regular IP addresses. 
+ {"[::]", L"[::]", "[::]", url_parse::Component(0,4), CanonHostInfo::IPV6, -1}, + {"[::1]", L"[::1]", "[::1]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1}, + {"[1::]", L"[1::]", "[1::]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1}, + {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", url_parse::Component(0,10), CanonHostInfo::IPV6, -1}, + {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1}, + + // Leading zeros should be stripped. + {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1}, + + // Upper case letters should be lowercased. + {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", url_parse::Component(0,20), CanonHostInfo::IPV6, -1}, + + // The same address can be written with different contractions, but should + // get canonicalized to the same thing. + {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1}, + {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1}, + + // IPv4 addresses + // Only mapped and compat addresses can have IPv4 syntax embedded. + {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // IPv4 with last component missing. + {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1}, + + // IPv4 using hex. + // TODO(eroman): Should this format be disallowed? 
+ {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1}, + + // There may be zeros surrounding the "::" contraction. + {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1}, + + {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", url_parse::Component(0,13), CanonHostInfo::IPV6, -1}, + + // Can only have one "::" contraction in an IPv6 string literal. + {"[2001::db8::1]", L"[2001::db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // No more than 2 consecutive ':'s. + {"[2001:db8:::1]", L"[2001:db8:::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:::]", L"[:::]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Non-IP addresses due to invalid characters. + {"[2001::.com]", L"[2001::.com]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // If there are not enough components, the last one should fill them out. + // ... omitted at this time ... + // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses. + {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Too many bits (even though 8 components, the last one holds 32 bits). + {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Too many bits specified -- the contraction would have to be zero-length + // to not exceed 128 bits. + {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // The contraction is for 16 bits of zero. 
+ {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1}, + + // Cannot have a trailing colon. + {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Cannot have negative numbers. + {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Scope ID -- the URL may contain an optional ["%" <scope_id>] section. + // The scope_id should be included in the canonicalized URL, and is an + // unsigned decimal number. + + // Invalid because no ID was given after the percent. + + // Don't allow scope-id + {"[1::%1]", L"[1::%1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1::%eth0]", L"[1::%eth0]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1::%]", L"[1::%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[%]", L"[%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[::%:]", L"[::%:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Don't allow leading or trailing colons. + {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // We allow a single trailing dot. + // ... omitted at this time ... + // Two dots in a row means not an IP address. + {"[::192.168..1]", L"[::192.168..1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Any non-first components get truncated to one byte. + // ... omitted at this time ... + // Spaces should be rejected. 
+ {"[::1 hello]", L"[::1 hello]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + }; + + for (size_t i = 0; i < arraysize(cases); i++) { + // 8-bit version. + url_parse::Component component(0, + static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_canon::CanonHostInfo host_info; + url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1, + &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + + // 16-bit version. + string16 input16(WStringToUTF16(cases[i].input16)); + component = url_parse::Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + url_canon::StdStringCanonOutput output2(&out_str2); + url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2, + &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + } +} + +TEST(URLCanonTest, UserInfo) { + // Note that the canonicalizer should escape and treat empty components as + // not being there. + + // We actually parse a full input URL so we can get the initial components. 
+ struct UserComponentCase { + const char* input; + const char* expected; + url_parse::Component expected_username; + url_parse::Component expected_password; + bool expected_success; + } user_info_cases[] = { + {"http://user:pass@host.com/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true}, + {"http://@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true}, + {"http://:@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true}, + {"http://foo:@host.com/", "foo@", url_parse::Component(0, 3), url_parse::Component(0, -1), true}, + {"http://:foo@host.com/", ":foo@", url_parse::Component(0, 0), url_parse::Component(1, 3), true}, + {"http://^ :$\t@host.com/", "%5E%20:$%09@", url_parse::Component(0, 6), url_parse::Component(7, 4), true}, + {"http://user:pass@/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true}, + {"http://%2540:bar@domain.com/", "%2540:bar@", url_parse::Component(0, 5), url_parse::Component(6, 3), true }, + + // IE7 compatibility: old versions allowed backslashes in usernames, but + // IE7 does not. We disallow it as well. 
+ {"ftp://me\\mydomain:pass@foo.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true}, + }; + + for (size_t i = 0; i < ARRAYSIZE(user_info_cases); i++) { + int url_len = static_cast<int>(strlen(user_info_cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(user_info_cases[i].input, url_len, &parsed); + url_parse::Component out_user, out_pass; + std::string out_str; + url_canon::StdStringCanonOutput output1(&out_str); + + bool success = url_canon::CanonicalizeUserInfo(user_info_cases[i].input, + parsed.username, + user_info_cases[i].input, + parsed.password, + &output1, &out_user, + &out_pass); + output1.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + + // Now try the wide version + out_str.clear(); + url_canon::StdStringCanonOutput output2(&out_str); + string16 wide_input(ConvertUTF8ToUTF16(user_info_cases[i].input)); + success = url_canon::CanonicalizeUserInfo(wide_input.c_str(), + parsed.username, + wide_input.c_str(), + parsed.password, + &output2, &out_user, &out_pass); + output2.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + } +} + +TEST(URLCanonTest, Port) { + // We only need to test that the number gets properly put into the output + // buffer. 
The parser unit tests will test scanning the number correctly. + // + // Note that the CanonicalizePort will always prepend a colon to the output + // to separate it from the colon that it assumes precedes it. + struct PortCase { + const char* input; + int default_port; + const char* expected; + url_parse::Component expected_component; + bool expected_success; + } port_cases[] = { + // Invalid input should be copied w/ failure. + {"as df", 80, ":as%20df", url_parse::Component(1, 7), false}, + {"-2", 80, ":-2", url_parse::Component(1, 2), false}, + // Default port should be omitted. + {"80", 80, "", url_parse::Component(0, -1), true}, + {"8080", 80, ":8080", url_parse::Component(1, 4), true}, + // PORT_UNSPECIFIED should mean always keep the port. + {"80", url_parse::PORT_UNSPECIFIED, ":80", url_parse::Component(1, 2), true}, + }; + + for (size_t i = 0; i < ARRAYSIZE(port_cases); i++) { + int url_len = static_cast<int>(strlen(port_cases[i].input)); + url_parse::Component in_comp(0, url_len); + url_parse::Component out_comp; + std::string out_str; + url_canon::StdStringCanonOutput output1(&out_str); + bool success = url_canon::CanonicalizePort(port_cases[i].input, in_comp, + port_cases[i].default_port, + &output1, &out_comp); + output1.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version + out_str.clear(); + url_canon::StdStringCanonOutput output2(&out_str); + string16 wide_input(ConvertUTF8ToUTF16(port_cases[i].input)); + success = url_canon::CanonicalizePort(wide_input.c_str(), in_comp, + port_cases[i].default_port, + &output2, &out_comp); + output2.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, 
out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + } +} + +TEST(URLCanonTest, Path) { + DualComponentCase path_cases[] = { + // ----- path collapsing tests ----- + {"/././foo", L"/././foo", "/foo", url_parse::Component(0, 4), true}, + {"/./.foo", L"/./.foo", "/.foo", url_parse::Component(0, 5), true}, + {"/foo/.", L"/foo/.", "/foo/", url_parse::Component(0, 5), true}, + {"/foo/./", L"/foo/./", "/foo/", url_parse::Component(0, 5), true}, + // double dots followed by a slash or the end of the string count + {"/foo/bar/..", L"/foo/bar/..", "/foo/", url_parse::Component(0, 5), true}, + {"/foo/bar/../", L"/foo/bar/../", "/foo/", url_parse::Component(0, 5), true}, + // don't count double dots when they aren't followed by a slash + {"/foo/..bar", L"/foo/..bar", "/foo/..bar", url_parse::Component(0, 10), true}, + // some in the middle + {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", url_parse::Component(0, 8), true}, + {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", url_parse::Component(0, 2), true}, + // we should not be able to go above the root + {"/foo/../../..", L"/foo/../../..", "/", url_parse::Component(0, 1), true}, + {"/foo/../../../ton", L"/foo/../../../ton", "/ton", url_parse::Component(0, 4), true}, + // escaped dots should be unescaped and treated the same as dots + {"/foo/%2e", L"/foo/%2e", "/foo/", url_parse::Component(0, 5), true}, + {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", url_parse::Component(0, 8), true}, + {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", url_parse::Component(0, 6), true}, + // Multiple slashes in a row should be preserved and treated like empty + // directory names. 
+ {"////../..", L"////../..", "//", url_parse::Component(0, 2), true}, + + // ----- escaping tests ----- + {"/foo", L"/foo", "/foo", url_parse::Component(0, 4), true}, + // Valid escape sequence + {"/%20foo", L"/%20foo", "/%20foo", url_parse::Component(0, 7), true}, + // Invalid escape sequence we should pass through unchanged. + {"/foo%", L"/foo%", "/foo%", url_parse::Component(0, 5), true}, + {"/foo%2", L"/foo%2", "/foo%2", url_parse::Component(0, 6), true}, + // Invalid escape sequence: bad characters should be treated the same as + // the surrounding text, not as escaped (in this case, UTF-8). + {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", url_parse::Component(0, 10), true}, + {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", url_parse::Component(0, 16), true}, + {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", url_parse::Component(0, 22), true}, + // Regular characters that are escaped should be unescaped + {"/foo%41%7a", L"/foo%41%7a", "/fooAz", url_parse::Component(0, 6), true}, + // Funny characters that are unescaped should be escaped + {"/foo\x09\x91%91", NULL, "/foo%09%91%91", url_parse::Component(0, 13), true}, + {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", url_parse::Component(0, 16), true}, + // Invalid characters that are escaped should cause a failure. + {"/foo%00%51", L"/foo%00%51", "/foo%00Q", url_parse::Component(0, 8), false}, + // Some characters should be passed through unchanged regardless of esc. + {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", url_parse::Component(0, 13), true}, + // Characters that are properly escaped should not have the case changed + // of hex letters. 
+ {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", url_parse::Component(0, 13), true}, + // Funny characters that are unescaped should be escaped + {"/foo\tbar", L"/foo\tbar", "/foo%09bar", url_parse::Component(0, 10), true}, + // Backslashes should get converted to forward slashes + {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", url_parse::Component(0, 8), true}, + // Hashes found in paths (possibly only when the caller explicitly sets + // the path on an already-parsed URL) should be escaped. + {"/foo#bar", L"/foo#bar", "/foo%23bar", url_parse::Component(0, 10), true}, + // %7f should be allowed and %3D should not be unescaped (these were wrong + // in a previous version). + {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true}, + // @ should be unescaped. + {"/@asdf%40", L"/@asdf%40", "/@asdf@", url_parse::Component(0, 7), true}, + + // ----- encoding tests ----- + // Basic conversions + {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", url_parse::Component(0, 37), true}, + // Invalid unicode characters should fail. We only do validation on + // UTF-16 input, so this doesn't happen on 8-bit. 
+ {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", url_parse::Component(0, 13), true}, + {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", url_parse::Component(0, 13), false}, + }; + + for (size_t i = 0; i < arraysize(path_cases); i++) { + if (path_cases[i].input8) { + int len = static_cast<int>(strlen(path_cases[i].input8)); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizePath(path_cases[i].input8, in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + + if (path_cases[i].input16) { + string16 input16(WStringToUTF16(path_cases[i].input16)); + int len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + + bool success = url_canon::CanonicalizePath(input16.c_str(), in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + } + + // Manual test: embedded NULLs should be escaped and the URL should be marked + // as invalid. 
+ const char path_with_null[] = "/ab\0c"; + url_parse::Component in_comp(0, 5); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizePath(path_with_null, in_comp, + &output, &out_comp); + output.Complete(); + EXPECT_FALSE(success); + EXPECT_EQ("/ab%00c", out_str); +} + +TEST(URLCanonTest, Query) { + struct QueryCase { + const char* input8; + const wchar_t* input16; + const char* encoding; + const char* expected; + } query_cases[] = { + // Regular ASCII case in some different encodings. + {"foo=bar", L"foo=bar", NULL, "?foo=bar"}, + {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"}, + {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"}, + {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"}, + // Allow question marks in the query without escaping + {"as?df", L"as?df", NULL, "?as?df"}, + // Always escape '#' since it would mark the ref. + {"as#df", L"as#df", NULL, "?as%23df"}, + // Escape some questionable 8-bit characters, but never unescape. + {"\x02hello\x7f bye", L"\x02hello\x7f bye", NULL, "?%02hello%7F%20bye"}, + {"%40%41123", L"%40%41123", NULL, "?%40%41123"}, + // Chinese input/output + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", NULL, "?q=%E4%BD%A0%E5%A5%BD"}, + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", "?q=%C4%E3%BA%C3"}, + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"}, + // Unencodable character in the destination character set should be + // escaped. The escape sequence unescapes to be the entity name: + // "?q=Chinese&#65319;" + {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", "?q=Chinese%26%2365319%3B"}, + // Invalid UTF-8/16 input should be replaced with invalid characters. + {"q=\xed\xed", L"q=\xd800\xd800", NULL, "?q=%EF%BF%BD%EF%BF%BD"}, + // Don't allow < or > because sometimes they are used for XSS if the + // URL is echoed in content. Firefox does this, IE doesn't. 
+ {"q=<asdf>", L"q=<asdf>", NULL, "?q=%3Casdf%3E"}, + // Escape double quotemarks in the query. + {"q=\"asdf\"", L"q=\"asdf\"", NULL, "?q=%22asdf%22"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) { + url_parse::Component out_comp; + + UConvScoper conv(query_cases[i].encoding); + ASSERT_TRUE(!query_cases[i].encoding || conv.converter()); + url_canon::ICUCharsetConverter converter(conv.converter()); + + // Map NULL to a NULL converter pointer. + url_canon::ICUCharsetConverter* conv_pointer = &converter; + if (!query_cases[i].encoding) + conv_pointer = NULL; + + if (query_cases[i].input8) { + int len = static_cast<int>(strlen(query_cases[i].input8)); + url_parse::Component in_comp(0, len); + std::string out_str; + + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeQuery(query_cases[i].input8, in_comp, + conv_pointer, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + + if (query_cases[i].input16) { + string16 input16(WStringToUTF16(query_cases[i].input16)); + int len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, len); + std::string out_str; + + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeQuery(input16.c_str(), in_comp, + conv_pointer, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + } + + // Extra test for input with embedded NULL; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Component out_comp; + url_canon::CanonicalizeQuery("a \x00z\x01", url_parse::Component(0, 5), NULL, + &output, &out_comp); + output.Complete(); + EXPECT_EQ("?a%20%00z%01", out_str); +} + +TEST(URLCanonTest, Ref) { + // Refs are trivial, it just checks the encoding. + DualComponentCase ref_cases[] = { + // Regular one, we shouldn't escape spaces, et al. 
+ {"hello, world", L"hello, world", "#hello, world", url_parse::Component(1, 12), true}, + // UTF-8/wide input should be preserved + {"\xc2\xa9", L"\xa9", "#\xc2\xa9", url_parse::Component(1, 2), true}, + // Test a character that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#\xF0\x90\x8C\x80ss", url_parse::Component(1, 6), true}, + // Escaping should be preserved unchanged, even invalid ones + {"%41%a", L"%41%a", "#%41%a", url_parse::Component(1, 5), true}, + // Invalid UTF-8/16 input should be flagged and the input made valid + {"\xc2", NULL, "#\xef\xbf\xbd", url_parse::Component(1, 3), true}, + {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", url_parse::Component(1, 6), true}, + // Test a Unicode invalid character. + {"a\xef\xb7\x90", L"a\xfdd0", "#a\xef\xbf\xbd", url_parse::Component(1, 4), true}, + // Refs can have # signs and we should preserve them. + {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", url_parse::Component(1, 9), true}, + {"#asdf", L"#asdf", "##asdf", url_parse::Component(1, 5), true}, + }; + + for (size_t i = 0; i < arraysize(ref_cases); i++) { + // 8-bit input + if (ref_cases[i].input8) { + int len = static_cast<int>(strlen(ref_cases[i].input8)); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeRef(ref_cases[i].input8, in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + + // 16-bit input + if (ref_cases[i].input16) { + string16 input16(WStringToUTF16(ref_cases[i].input16)); + int len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + 
url_canon::CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + } + + // Try one with an embedded NULL. It should be stripped. + const char null_input[5] = "ab\x00z"; + url_parse::Component null_input_component(0, 4); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeRef(null_input, null_input_component, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(1, out_comp.begin); + EXPECT_EQ(3, out_comp.len); + EXPECT_EQ("#abz", out_str); +} + +TEST(URLCanonTest, CanonicalizeStandardURL) { + // The individual component canonicalize tests should have caught the cases + // for each of those components. Here, we just need to test that the various + // parts are included or excluded properly, and have the correct separators. + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + } cases[] = { + {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#", true}, + {"http://[www.google.com]/", "http://[www.google.com]/", false}, + {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", false}, + {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", true}, + {"www.google.com", ":www.google.com/", true}, + {"http://192.0x00A80001", "http://192.168.0.1/", true}, + {"http://www/foo%2Ehtml", "http://www/foo.html", true}, + {"http://user:pass@/", "http://user:pass@/", false}, + {"http://%25DOMAIN:foobar@foodomain.com/", "http://%25DOMAIN:foobar@foodomain.com/", true}, + + // Backslashes should get converted to forward slashes. + {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true}, + + // Busted refs shouldn't make the whole thing fail. 
+ {"http://www.google.com/asdf#\xc2", "http://www.google.com/asdf#\xef\xbf\xbd", true}, + + // Basic port tests. + {"http://foo:80/", "http://foo/", true}, + {"http://foo:81/", "http://foo:81/", true}, + {"httpa://foo:80/", "httpa://foo:80/", true}, + {"http://foo:-80/", "http://foo:-80/", false}, + + {"https://foo:443/", "https://foo/", true}, + {"https://foo:80/", "https://foo:80/", true}, + {"ftp://foo:21/", "ftp://foo/", true}, + {"ftp://foo:80/", "ftp://foo:80/", true}, + {"gopher://foo:70/", "gopher://foo/", true}, + {"gopher://foo:443/", "gopher://foo:443/", true}, + {"ws://foo:80/", "ws://foo/", true}, + {"ws://foo:81/", "ws://foo:81/", true}, + {"ws://foo:443/", "ws://foo:443/", true}, + {"ws://foo:815/", "ws://foo:815/", true}, + {"wss://foo:80/", "wss://foo:80/", true}, + {"wss://foo:81/", "wss://foo:81/", true}, + {"wss://foo:443/", "wss://foo/", true}, + {"wss://foo:815/", "wss://foo:815/", true}, + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(cases[i].input, url_len, &parsed); + + url_parse::Parsed out_parsed; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizeStandardURL( + cases[i].input, url_len, parsed, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + } +} + +// The codepath here is the same as for regular canonicalization, so we just +// need to test that things are replaced or not correctly. +TEST(URLCanonTest, ReplaceStandardURL) { + ReplaceCase replace_cases[] = { + // Common case of truncating the path. 
+ {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"}, + // Replace everything + {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"}, + // Replace nothing + {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; // Clean up syntax. + + // Note that for the scheme we pass in a different clear function since + // there is no function to clear the scheme. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplaceStandardURL(replace_cases[i].base, parsed, + r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } + + // The path pointer should be ignored if the address is invalid. 
+ { + const char src[] = "http://www.google.com/here_is_the_path"; + int src_len = static_cast<int>(strlen(src)); + + url_parse::Parsed parsed; + url_parse::ParseStandardURL(src, src_len, &parsed); + + // Replace the path to 0 length string. By using 1 as the string address, + // the test should get an access violation if it tries to dereference it. + url_canon::Replacements<char> r; + r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component(0, 0)); + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_parse::Parsed new_parsed; + url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output1, &new_parsed); + output1.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str1.c_str()); + + // Same with an "invalid" path. + r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component()); + std::string out_str2; + url_canon::StdStringCanonOutput output2(&out_str2); + url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output2, &new_parsed); + output2.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str2.c_str()); + } +} + +TEST(URLCanonTest, ReplaceFileURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, + // Replace nothing + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"}, + // Clear non-path components (common) + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"}, + // Replace path with something that doesn't begin with a slash and make + // sure it get added properly. 
+ {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"}, + {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParseFileURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; // Clean up syntax. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplaceFileURL(cur.base, parsed, + r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplacePathURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"}, + // Replace nothing + {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"}, + // Replace one or the 
other + {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"}, + {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"}, + {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParsePathURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; // Clean up syntax. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplacePathURL(cur.base, parsed, + r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplaceMailtoURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"}, + // Replace nothing + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"}, + // Replace the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"}, + // Replace the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"}, + // 
Replace the path and query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"}, + // Set the query to empty (should leave trailing question mark) + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"}, + // Clear the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"}, + // Clear the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"}, + // Clear the path + query + {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"}, + // Setting the ref should have no effect + {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParseMailtoURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplaceMailtoURL(cur.base, parsed, + r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, CanonicalizeFileURL) { + struct URLCase { + const char* input; + const char* expected; + bool 
expected_success; + url_parse::Component expected_host; + url_parse::Component expected_path; + } cases[] = { +#ifdef _WIN32 + // Windows-style paths + {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)}, + {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)}, + {"file:", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)}, + {"file:UNChost/path", "file://unchost/path", true, url_parse::Component(7, 7), url_parse::Component(14, 5)}, + // CanonicalizeFileURL supports absolute Windows style paths for IE + // compatability. Note that the caller must decide that this is a file + // URL itself so it can call the file canonicalizer. This is usually + // done automatically as part of relative URL resolving. + {"c:\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"/C|\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"//C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"//server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)}, + {"\\\\server\\file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)}, + {"/\\server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)}, + // We should preserve the number of slashes after the colon for IE + // compatability, except when there is none, in which case we should + // add one. 
+ {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)}, + {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)}, + // Three slashes should be non-UNC, even if there is no drive spec (IE + // does this, which makes the resulting request invalid). + {"file:///foo/bar.txt", "file:///foo/bar.txt", true, url_parse::Component(), url_parse::Component(7, 12)}, + // TODO(brettw) we should probably fail for invalid host names, which + // would change the expected result on this test. We also currently allow + // colon even though it's probably invalid, because its currently the + // "natural" result of the way the canonicalizer is written. There doesn't + // seem to be a strong argument for why allowing it here would be bad, so + // we just tolerate it and the load will fail later. + {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, url_parse::Component(7, 2), url_parse::Component(9, 16)}, + {"file:filer/home\\me", "file://filer/home/me", true, url_parse::Component(7, 5), url_parse::Component(12, 8)}, + // Make sure relative paths can't go above the "C:" + {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, url_parse::Component(), url_parse::Component(7, 12)}, + // Busted refs shouldn't make the whole thing fail. + {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, url_parse::Component(), url_parse::Component(7, 8)}, +#else + // Unix-style paths + {"file:///home/me", "file:///home/me", true, url_parse::Component(), url_parse::Component(7, 8)}, + // Windowsy ones should get still treated as Unix-style. 
+ {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)}, + {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)}, + // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html) + {"//", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)}, + {"///", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)}, + {"///test", "file:///test", true, url_parse::Component(), url_parse::Component(7, 5)}, + {"file://test", "file://test/", true, url_parse::Component(7, 4), url_parse::Component(11, 1)}, + {"file://localhost", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)}, + {"file://localhost/", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)}, + {"file://localhost/test", "file://localhost/test", true, url_parse::Component(7, 9), url_parse::Component(16, 5)}, +#endif // _WIN32 + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParseFileURL(cases[i].input, url_len, &parsed); + + url_parse::Parsed out_parsed; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizeFileURL(cases[i].input, url_len, + parsed, NULL, &output, + &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified, the file canonicalizer has + // different code for writing the spec. 
+ EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(4, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin); + EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + } +} + +TEST(URLCanonTest, CanonicalizePathURL) { + // Path URLs should get canonicalized schemes but nothing else. + struct PathCase { + const char* input; + const char* expected; + } path_cases[] = { + {"javascript:", "javascript:"}, + {"JavaScript:Foo", "javascript:Foo"}, + {":\":This /is interesting;?#", ":\":This /is interesting;?#"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) { + int url_len = static_cast<int>(strlen(path_cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed); + + url_parse::Parsed out_parsed; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizePathURL(path_cases[i].input, url_len, + parsed, &output, + &out_parsed); + output.Complete(); + + EXPECT_TRUE(success); + EXPECT_EQ(path_cases[i].expected, out_str); + + EXPECT_EQ(0, out_parsed.host.begin); + EXPECT_EQ(-1, out_parsed.host.len); + + // When we end with a colon at the end, there should be no path. + if (path_cases[i].input[url_len - 1] == ':') { + EXPECT_EQ(0, out_parsed.path.begin); + EXPECT_EQ(-1, out_parsed.path.len); + } + } +} + +TEST(URLCanonTest, CanonicalizeMailtoURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + url_parse::Component expected_path; + url_parse::Component expected_query; + } cases[] = { + {"mailto:addr1", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()}, + {"mailto:addr1@foo.com", "mailto:addr1@foo.com", true, url_parse::Component(7, 13), url_parse::Component()}, + // Trailing whitespace is stripped. 
+ {"MaIlTo:addr1 \t ", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()}, + {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, url_parse::Component(7, 5), url_parse::Component(13,6)}, + {"mailto:addr1,addr2", "mailto:addr1,addr2", true, url_parse::Component(7, 11), url_parse::Component()}, + {"mailto:addr1, addr2", "mailto:addr1, addr2", true, url_parse::Component(7, 12), url_parse::Component()}, + {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, url_parse::Component(7, 13), url_parse::Component()}, + {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true, url_parse::Component(7, 12), url_parse::Component()}, + // Null character should be escaped to %00 + {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true, url_parse::Component(7, 13), url_parse::Component(21, 3)}, + // Invalid -- UTF-8 encoded surrogate value. + {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD", false, url_parse::Component(7, 9), url_parse::Component()}, + {"mailto:addr1?", "mailto:addr1?", true, url_parse::Component(7, 5), url_parse::Component(13, 0)}, + }; + + // Define outside of loop to catch bugs where components aren't reset + url_parse::Parsed parsed; + url_parse::Parsed out_parsed; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + if (i == 8) { + // The 9th test case purposely has a '\0' in it -- don't count it + // as the string terminator. 
+ url_len = 22; + } + url_parse::ParseMailtoURL(cases[i].input, url_len, &parsed); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizeMailtoURL(cases[i].input, url_len, + parsed, &output, + &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(6, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + + EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin); + EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len); + } +} + +#ifndef WIN32 + +TEST(URLCanonTest, _itoa_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. 
+ char buf[6]; + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(12, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + // Test the edge cases - exactly the buffer size and one over + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("1234", buf); + EXPECT_EQ('\xFF', buf[5]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(EINVAL, url_canon::_itoa_s(12345, buf, sizeof(buf) - 1, 10)); + EXPECT_EQ('\xFF', buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(12, buf, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(12345, buf, 10)); + EXPECT_STREQ("12345", buf); + + EXPECT_EQ(EINVAL, url_canon::_itoa_s(123456, buf, 10)); + + // Test that radix 16 is supported. + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 16)); + EXPECT_STREQ("4d2", buf); + EXPECT_EQ('\xFF', buf[5]); +} + +TEST(URLCanonTest, _itow_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. 
+ char16 buf[6]; + const char fill_mem = 0xff; + const char16 fill_char = 0xffff; + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itow_s(12, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(WStringToUTF16(L"12"), string16(buf)); + EXPECT_EQ(fill_char, buf[3]); + + // Test the edge cases - exactly the buffer size and one over + EXPECT_EQ(0, url_canon::_itow_s(1234, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(WStringToUTF16(L"1234"), string16(buf)); + EXPECT_EQ(fill_char, buf[5]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(EINVAL, url_canon::_itow_s(12345, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(fill_char, buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itow_s(12, buf, 10)); + EXPECT_EQ(WStringToUTF16(L"12"), string16(buf)); + EXPECT_EQ(fill_char, buf[3]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itow_s(12345, buf, 10)); + EXPECT_EQ(WStringToUTF16(L"12345"), string16(buf)); + + EXPECT_EQ(EINVAL, url_canon::_itow_s(123456, buf, 10)); +} + +#endif // !WIN32 + +// Returns true if the given two structures are the same. 
+static bool ParsedIsEqual(const url_parse::Parsed& a, + const url_parse::Parsed& b) { + return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len && + a.username.begin == b.username.begin && a.username.len == b.username.len && + a.password.begin == b.password.begin && a.password.len == b.password.len && + a.host.begin == b.host.begin && a.host.len == b.host.len && + a.port.begin == b.port.begin && a.port.len == b.port.len && + a.path.begin == b.path.begin && a.path.len == b.path.len && + a.query.begin == b.query.begin && a.query.len == b.query.len && + a.ref.begin == b.ref.begin && a.ref.len == b.ref.len; +} + +TEST(URLCanonTest, ResolveRelativeURL) { + struct RelativeCase { + const char* base; // Input base URL: MUST BE CANONICAL + bool is_base_hier; // Is the base URL hierarchical + bool is_base_file; // Tells us if the base is a file URL. + const char* test; // Input URL to test against. + bool succeed_relative; // Whether we expect IsRelativeURL to succeed + bool is_rel; // Whether we expect |test| to be relative or not. + bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed. + const char* resolved; // What we expect in the result when resolving. + } rel_cases[] = { + // Basic absolute input. + {"http://host/a", true, false, "http://another/", true, false, false, NULL}, + {"http://host/a", true, false, "http:////another/", true, false, false, NULL}, + // Empty relative URLs shouldn't change the input. + {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"}, + // Spaces at the ends of the relative path should be ignored. + {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"}, + {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"}, + {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"}, + // Matching schemes without two slashes are treated as relative. 
+ {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"}, + {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"}, + {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"}, + {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"}, + // Nonmatching schemes are absolute. + {"http://host/a", true, false, "https:host2", true, false, false, NULL}, + {"http://host/a", true, false, "htto:/host2", true, false, false, NULL}, + // Absolute path input + {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"}, + {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"}, + // Relative path input + {"http://host/a", true, false, "b", true, true, true, "http://host/b"}, + {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"}, + {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"}, + {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"}, + {"http://host/a/", true, false, "..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "./..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "../.", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"}, + {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"}, + // Query input + {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"}, + {"http://host/a?x=y#z", true, false, "?", true, 
true, true, "http://host/a?"}, + {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"}, + // Ref input + {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"}, + {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"}, + {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"}, + // Non-hierarchical base: no relative handling. Relative input should + // error, and if a scheme is present, it should be treated as absolute. + {"data:foobar", false, false, "baz.html", false, false, false, NULL}, + {"data:foobar", false, false, "data:baz", true, false, false, NULL}, + {"data:foobar", false, false, "data:/base", true, false, false, NULL}, + // Non-hierarchical base: absolute input should succeed. + {"data:foobar", false, false, "http://host/", true, false, false, NULL}, + {"data:foobar", false, false, "http:host", true, false, false, NULL}, + // Invalid schemes should be treated as relative. + {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"}, + {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"}, + {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"}, + {"data:asdf", false, false, ":foo", false, false, false, NULL}, + // We should treat semicolons like any other character in URL resolving + {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"}, + // Relative URLs can also be written as "//foo/bar" which is relative to + // the scheme. In this case, it would take the old scheme, so for http + // the example would resolve to "http://foo/bar". 
+ {"http://host/a", true, false, "//another", true, true, true, "http://another/"}, + {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"}, + {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//", true, true, false, "http:"}, + // IE will also allow one or the other to be a backslash to get the same + // behavior. + {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"}, +#ifdef WIN32 + // Resolving against Windows file base URLs. + {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL}, + {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"}, + {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"}, + {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"}, + // But two backslashes on Windows should be UNC so should be treated + // as absolute. + {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL}, + // IE doesn't support drive specs starting with two slashes. It fails + // immediately and doesn't even try to load. We fix it up to either + // an absolute path or UNC depending on what it looks like. + {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"}, + {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"}, + // Windows drive specs should be allowed and treated as absolute. 
+ {"file:///C:/foo", true, true, "c:", true, false, false, NULL}, + {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL}, + {"http://host/a", true, false, "c:\\foo", true, false, false, NULL}, + // Relative paths with drive letters should be allowed when the base is + // also a file. + {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"}, + // Treat absolute paths as being off of the drive. + {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"}, + {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"}, + {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"}, + // On Windows, two slashes without a drive letter when the base is a file + // means that the path is UNC. + {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"}, + {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"}, +#else + // On Unix we fall back to relative behavior since there's nothing else + // reasonable to do. + {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"}, +#endif + // Even on Windows, we don't allow relative drive specs when the base + // is not file. + {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"}, + {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(rel_cases); i++) { + const RelativeCase& cur_case = rel_cases[i]; + + url_parse::Parsed parsed; + int base_len = static_cast<int>(strlen(cur_case.base)); + if (cur_case.is_base_file) + url_parse::ParseFileURL(cur_case.base, base_len, &parsed); + else if (cur_case.is_base_hier) + url_parse::ParseStandardURL(cur_case.base, base_len, &parsed); + else + url_parse::ParsePathURL(cur_case.base, base_len, &parsed); + + // First see if it is relative. 
+ int test_len = static_cast<int>(strlen(cur_case.test)); + bool is_relative; + url_parse::Component relative_component; + bool succeed_is_rel = url_canon::IsRelativeURL( + cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier, + &is_relative, &relative_component); + + EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) << + "succeed is rel failure on " << cur_case.test; + EXPECT_EQ(cur_case.is_rel, is_relative) << + "is rel failure on " << cur_case.test; + // Now resolve it. + if (succeed_is_rel && is_relative && cur_case.is_rel) { + std::string resolved; + url_canon::StdStringCanonOutput output(&resolved); + url_parse::Parsed resolved_parsed; + + bool succeed_resolve = url_canon::ResolveRelativeURL( + cur_case.base, parsed, cur_case.is_base_file, + cur_case.test, relative_component, NULL, &output, &resolved_parsed); + output.Complete(); + + EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve); + EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test; + + // Verify that the output parsed structure is the same as parsing a + // the URL freshly. + url_parse::Parsed ref_parsed; + int resolved_len = static_cast<int>(resolved.size()); + if (cur_case.is_base_file) + url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed); + else if (cur_case.is_base_hier) + url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed); + else + url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed); + EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed)); + } + } +} + +// It used to be when we did a replacement with a long buffer of UTF-16 +// characters, we would get invalid data in the URL. This is because the buffer +// it used to hold the UTF-8 data was resized, while some pointers were still +// kept to the old buffer that was removed. 
+TEST(URLCanonTest, ReplacementOverflow) { + const char src[] = "file:///C:/foo/bar"; + int src_len = static_cast<int>(strlen(src)); + url_parse::Parsed parsed; + url_parse::ParseFileURL(src, src_len, &parsed); + + // Override two components, the path with something short, and the query with + // something long enough to trigger the bug. + url_canon::Replacements<char16> repl; + string16 new_query; + for (int i = 0; i < 4800; i++) + new_query.push_back('a'); + + string16 new_path(WStringToUTF16(L"/foo")); + repl.SetPath(new_path.c_str(), url_parse::Component(0, 4)); + repl.SetQuery(new_query.c_str(), + url_parse::Component(0, static_cast<int>(new_query.length()))); + + // Call ReplaceComponents on the string. It doesn't matter if we call it for + // standard URLs, file URLs, etc, since they will go to the same replacement + // function that was buggy. + url_parse::Parsed repl_parsed; + std::string repl_str; + url_canon::StdStringCanonOutput repl_output(&repl_str); + url_canon::ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed); + repl_output.Complete(); + + // Generate the expected string and check. + std::string expected("file:///foo?"); + for (size_t i = 0; i < new_query.length(); i++) + expected.push_back('a'); + EXPECT_TRUE(expected == repl_str); +} diff --git a/googleurl/src/url_file.h b/googleurl/src/url_file.h new file mode 100644 index 0000000..c1b8ac9 --- /dev/null +++ b/googleurl/src/url_file.h @@ -0,0 +1,108 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Provides shared functions used by the internals of the parser and +// canonicalizer for file URLs. Do not use outside of these modules. + +#ifndef GOOGLEURL_SRC_URL_FILE_H__ +#define GOOGLEURL_SRC_URL_FILE_H__ + +#include "googleurl/src/url_parse_internal.h" + +namespace url_parse { + +#ifdef WIN32 + +// We allow both "c:" and "c|" as drive identifiers. +inline bool IsWindowsDriveSeparator(char16 ch) { + return ch == ':' || ch == '|'; +} +inline bool IsWindowsDriveLetter(char16 ch) { + return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +#endif // WIN32 + +// Returns the index of the next slash in the input after the given index, or +// spec_len if the end of the input is reached. 
+template<typename CHAR> +inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) { + int idx = begin_index; + while (idx < spec_len && !IsURLSlash(spec[idx])) + idx++; + return idx; +} + +#ifdef WIN32 + +// Returns true if the start_offset in the given spec looks like it begins a +// drive spec, for example "c:". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// If this returns true, the spec is guaranteed to have a valid drive letter +// plus a drive separator (':' or '|') starting at |start_offset|. +template<typename CHAR> +inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + int remaining_len = spec_len - start_offset; + if (remaining_len < 2) + return false; // Not enough room. + if (!IsWindowsDriveLetter(spec[start_offset])) + return false; // Doesn't start with a valid drive letter. + if (!IsWindowsDriveSeparator(spec[start_offset + 1])) + return false; // Isn't followed with a drive separator. + return true; +} + +// Returns true if the start_offset in the given text looks like it begins a +// UNC path, for example "\\". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// When strict_slashes is set, this function will only accept backslashes as is +// standard for Windows. Otherwise, it will accept forward slashes as well +// which we use for a lot of URL handling.
+template<typename CHAR> +inline bool DoesBeginUNCPath(const CHAR* text, + int start_offset, + int len, + bool strict_slashes) { + int remaining_len = len - start_offset; + if (remaining_len < 2) + return false; + + if (strict_slashes) + return text[start_offset] == '\\' && text[start_offset + 1] == '\\'; + return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]); +} + +#endif // WIN32 + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_FILE_H__ diff --git a/googleurl/src/url_parse.cc b/googleurl/src/url_parse.cc new file mode 100644 index 0000000..7c37f13 --- /dev/null +++ b/googleurl/src/url_parse.cc @@ -0,0 +1,757 @@ +/* Based on nsURLParsers.cc from Mozilla + * ------------------------------------- + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Darin Fisher (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "googleurl/src/url_parse.h" + +#include <stdlib.h> + +#include "base/logging.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_parse { + +namespace { + +// Returns true if the given character is a valid digit to use in a port. +inline bool IsPortDigit(char16 ch) { + return ch >= '0' && ch <= '9'; +} + +// Returns the offset of the next authority terminator in the input starting +// from start_offset. If no terminator is found, the return value will be equal +// to spec_len. +template<typename CHAR> +int FindNextAuthorityTerminator(const CHAR* spec, + int start_offset, + int spec_len) { + for (int i = start_offset; i < spec_len; i++) { + if (IsAuthorityTerminator(spec[i])) + return i; + } + return spec_len; // Not found. +} + +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatibility data points.
I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. +template <typename CHAR> +void DoParseAfterScheme(const CHAR* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//<some data>", looks like an authority section. Treat everything + // from there to the next slash (or end of spec) to be the authority. Note + // that we ignore the number of slashes and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. + full_path = Component(); + else // Everything starting from the slash to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. 
+ DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + +// Splits the user info range |user| at its first colon: the text before the +// colon becomes |*username| and the text after it |*password|. When there is +// no colon, the whole range is the username and |*password| is unspecified. +template<typename CHAR> +void ParseUserInfo(const CHAR* spec, + const Component& user, + Component* username, + Component* password) { + // Find the first colon in the user section, which separates the username and + // password. + int colon_offset = 0; + while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') + colon_offset++; + + if (colon_offset < user.len) { + // Found separator: <username>:<password> + *username = Component(user.begin, colon_offset); + *password = MakeRange(user.begin + colon_offset + 1, + user.begin + user.len); + } else { + // No separator, treat everything as the username + *username = user; + *password = Component(); + } +} + +// Splits |serverinfo| into |*hostname| and |*port_num|. The last colon only +// counts as the port separator when it comes after the IPv6 terminator (the +// last ']', or the end of the range for a host that opens with '[' but has no +// ']'); with no brackets at all, any colon counts. Both outputs are reset when +// |serverinfo| has zero length. +template<typename CHAR> +void ParseServerInfo(const CHAR* spec, + const Component& serverinfo, + Component* hostname, + Component* port_num) { + if (serverinfo.len == 0) { + // No server info, host name is empty. + hostname->reset(); + port_num->reset(); + return; + } + + // If the host starts with a left-bracket, assume the entire host is an + // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. + // This assumption will be overridden if we find a right-bracket. + // + // Our IPv6 address canonicalization code requires both brackets to exist, + // but the ability to locate an incomplete address can still be useful. + int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; + int colon = -1; + + // Find the last right-bracket, and the last colon.
+ for (int i = serverinfo.begin; i < serverinfo.end(); i++) { + switch (spec[i]) { + case ']': + ipv6_terminator = i; + break; + case ':': + colon = i; + break; + } + } + + if (colon > ipv6_terminator) { + // Found a port number: <hostname>:<port> + *hostname = MakeRange(serverinfo.begin, colon); + if (hostname->len == 0) + hostname->reset(); + *port_num = MakeRange(colon + 1, serverinfo.end()); + } else { + // No port: <hostname> + *hostname = serverinfo; + port_num->reset(); + } +} + +// Given an already-identified auth section, breaks it into its constituent +// parts. The port is only located here, not converted: |*port_num| receives +// the range of the port text, or is reset when there is no port. All four +// outputs are reset when the authority has zero length. +template<typename CHAR> +void DoParseAuthority(const CHAR* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DCHECK(auth.is_valid()) << "We should always get an authority"; + if (auth.len == 0) { + username->reset(); + password->reset(); + hostname->reset(); + port_num->reset(); + return; + } + + // Search backwards for @, which is the separator between the user info and + // the server info. + int i = auth.begin + auth.len - 1; + while (i > auth.begin && spec[i] != '@') + i--; + + if (spec[i] == '@') { + // Found user info: <user-info>@<server-info> + ParseUserInfo(spec, Component(auth.begin, i - auth.begin), + username, password); + ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), + hostname, port_num); + } else { + // No user info, everything is server info. + username->reset(); + password->reset(); + ParseServerInfo(spec, auth, hostname, port_num); + } +} + +template<typename CHAR> +void ParsePath(const CHAR* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> + + // Special case when there is no path.
+ if (path.len == -1) { + filepath->reset(); + query->reset(); + ref->reset(); + return; + } + DCHECK(path.len > 0) << "We should never have 0 length paths"; + + // Search for first occurrence of either ? or #. + int path_end = path.begin + path.len; + + int query_separator = -1; // Index of the '?' + int ref_separator = -1; // Index of the '#' + for (int i = path.begin; i < path_end; i++) { + switch (spec[i]) { + case '?': + // Only match the query string if it precedes the reference fragment + // and when we haven't found one already. + if (ref_separator < 0 && query_separator < 0) + query_separator = i; + break; + case '#': + // Record the first # sign only. + if (ref_separator < 0) + ref_separator = i; + break; + } + } + + // Markers pointing to the character after each of these corresponding + // components. The code below works from the end back to the beginning, + // and will update these indices as it finds components that exist. + int file_end, query_end; + + // Ref fragment: from the # to the end of the path. + if (ref_separator >= 0) { + file_end = query_end = ref_separator; + *ref = MakeRange(ref_separator + 1, path_end); + } else { + file_end = query_end = path_end; + ref->reset(); + } + + // Query fragment: everything from the ? to the next boundary (either the end + // of the path or the ref fragment). + if (query_separator >= 0) { + file_end = query_separator; + *query = MakeRange(query_separator + 1, query_end); + } else { + query->reset(); + } + + // File path: treat an empty file path as no file path. + if (file_end != path.begin) + *filepath = MakeRange(path.begin, file_end); + else + filepath->reset(); +} + +template<typename CHAR> +bool DoExtractScheme(const CHAR* url, + int url_len, + Component* scheme) { + // Skip leading whitespace and control characters. + int begin = 0; + while (begin < url_len && ShouldTrimFromURL(url[begin])) + begin++; + if (begin == url_len) + return false; // Input is empty or all whitespace.
+ + // Find the first colon character. + for (int i = begin; i < url_len; i++) { + if (url[i] == ':') { + *scheme = MakeRange(begin, i); + return true; + } + } + return false; // No colon found: no scheme +} + +// The main parsing function for standard URLs. Standard URLs have a scheme, +// host, path, etc. +template<typename CHAR> +void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + int after_scheme; + if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { + after_scheme = parsed->scheme.end() + 1; // Skip past the colon. + } else { + // Say there's no scheme when there is no colon. We could also say that + // everything is the scheme. Both would produce an invalid URL, but this way + // seems less wrong in more cases. + parsed->scheme.reset(); + after_scheme = begin; + } + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +// Initializes a path URL which is merely a scheme followed by a path. Examples +// include "about:foo" and "javascript:alert('bar');" +template<typename CHAR> +void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) { + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme.
+ if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + // For compatibility with the standard URL parser, we treat no path as + // -1, rather than having a length of 0 (we normally wouldn't care so + // much for these non-standard URLs). + if (parsed->scheme.end() == spec_len - 1) + parsed->path.reset(); + else + parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len); + } else { + // No scheme found, just path. + parsed->scheme.reset(); + parsed->path = MakeRange(begin, spec_len); + } +} + +// Parses a mailto-style URL: only the scheme, the path and (after the first +// '?') the query are filled in; every other component is reset. +template<typename CHAR> +void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->ref.reset(); + parsed->query.reset(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin = -1; + int path_end = -1; + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() != spec_len - 1) { + path_begin = parsed->scheme.end() + 1; + path_end = spec_len; + } + } else { + // No scheme found, just path. + parsed->scheme.reset(); + path_begin = begin; + path_end = spec_len; + } + + // Split [path_begin, path_end) into a path + query.
+ for (int i = path_begin; i < path_end; ++i) { + if (spec[i] == '?') { + parsed->query = MakeRange(i + 1, path_end); + path_end = i; + break; + } + } + + // For compatibility with the standard URL parser, treat no path as + // -1, rather than having a length of 0 + if (path_begin == path_end) { + parsed->path.reset(); + } else { + parsed->path = MakeRange(path_begin, path_end); + } +} + +// Converts a port number in a string to an integer. We'd like to just call +// sscanf but our input is not NULL-terminated, which sscanf requires. Instead, +// we copy the digits to a small stack buffer (since we know the maximum number +// of digits in a valid port number) that we can NULL terminate. +template<typename CHAR> +int DoParsePort(const CHAR* spec, const Component& component) { + // Easy success case when there is no port. + const int kMaxDigits = 5; + if (!component.is_nonempty()) + return PORT_UNSPECIFIED; + + // Skip over any leading 0s. + Component digits_comp(component.end(), 0); + for (int i = 0; i < component.len; i++) { + if (spec[component.begin + i] != '0') { + digits_comp = MakeRange(component.begin + i, component.end()); + break; + } + } + if (digits_comp.len == 0) + return 0; // All digits were 0. + + // Verify we don't have too many digits (we'll be copying to our buffer so + // we need to double-check). + if (digits_comp.len > kMaxDigits) + return PORT_INVALID; + + // Copy valid digits to the buffer. + char digits[kMaxDigits + 1]; // +1 for null terminator + for (int i = 0; i < digits_comp.len; i++) { + CHAR ch = spec[digits_comp.begin + i]; + if (!IsPortDigit(ch)) { + // Invalid port digit, fail. + return PORT_INVALID; + } + digits[i] = static_cast<char>(ch); + } + + // Null-terminate the string and convert to integer. Since we guarantee + // only digits, atoi's lack of error handling is OK. + digits[digits_comp.len] = 0; + int port = atoi(digits); + if (port > 65535) + return PORT_INVALID; // Out of range.
+ return port; +} + +// Sets |*file_name| to the last segment of |path|: the text between the last +// slash and the file name end, where the file name end is the last ';' in the +// path (the start of the parameter) or the end of the path when there is none. +// An empty or unspecified path yields an unspecified file name. +template<typename CHAR> +void DoExtractFileName(const CHAR* spec, + const Component& path, + Component* file_name) { + // Handle empty paths: they have no file names. + if (!path.is_nonempty()) { + file_name->reset(); + return; + } + + // Search backwards for a parameter, which is a normally unused field in a + // URL delimited by a semicolon. We parse the parameter as part of the + // path, but here, we don't want to count it. The last semicolon is the + // parameter. The path should start with a slash, so we don't need to check + // the first one. + int file_end = path.end(); + for (int i = path.end() - 1; i > path.begin; i--) { + if (spec[i] == ';') { + file_end = i; + break; + } + } + + // Now search backwards from the filename end to the previous slash + // to find the beginning of the filename. + for (int i = file_end - 1; i >= path.begin; i--) { + if (IsURLSlash(spec[i])) { + // File name is everything following this character to the end + *file_name = MakeRange(i + 1, file_end); + return; + } + } + + // No slash found, this means the input was degenerate (generally paths + // will start with a slash). Let's call everything the file name. + *file_name = MakeRange(path.begin, file_end); + return; +} + +// Extracts the next key/value pair from |*query|. Returns false when the query +// is empty or unspecified. Otherwise fills |*key| and |*value| (either may +// have zero length), advances |*query| past the pair and its trailing '&' so +// that the call can be repeated for the next pair, and returns true. +template<typename CHAR> +bool DoExtractQueryKeyValue(const CHAR* spec, + Component* query, + Component* key, + Component* value) { + if (!query->is_nonempty()) + return false; + + int start = query->begin; + int cur = start; + int end = query->end(); + + // We assume the beginning of the input is the beginning of the "key" and we + // skip to the end of it. + key->begin = cur; + while (cur < end && spec[cur] != '&' && spec[cur] != '=') + cur++; + key->len = cur - key->begin; + + // Skip the separator after the key (if any). + if (cur < end && spec[cur] == '=') + cur++; + + // Find the value part.
+ value->begin = cur; + while (cur < end && spec[cur] != '&') + cur++; + value->len = cur - value->begin; + + // Finally skip the next separator if any + if (cur < end && spec[cur] == '&') + cur++; + + // Save the new query + *query = url_parse::MakeRange(cur, end); + return true; +} + +} // namespace + +int Parsed::Length() const { + if (ref.is_valid()) + return ref.end(); + return CountCharactersBefore(REF, false); +} + +int Parsed::CountCharactersBefore(ComponentType type, + bool include_delimiter) const { + if (type == SCHEME) + return scheme.begin; + + // There will be some characters after the scheme like "://" and we don't + // know how many. Search forwards for the next thing until we find one. + int cur = 0; + if (scheme.is_valid()) + cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. + + if (username.is_valid()) { + if (type <= USERNAME) + return username.begin; + cur = username.end() + 1; // Advance over the '@' or ':' at the end. + } + + if (password.is_valid()) { + if (type <= PASSWORD) + return password.begin; + cur = password.end() + 1; // Advance over the '@' at the end. + } + + if (host.is_valid()) { + if (type <= HOST) + return host.begin; + cur = host.end(); + } + + if (port.is_valid()) { + if (type < PORT || (type == PORT && include_delimiter)) + return port.begin - 1; // Back over delimiter. + if (type == PORT) + return port.begin; // Don't want delimiter counted. + cur = port.end(); + } + + if (path.is_valid()) { + if (type <= PATH) + return path.begin; + cur = path.end(); + } + + if (query.is_valid()) { + if (type < QUERY || (type == QUERY && include_delimiter)) + return query.begin - 1; // Back over delimiter. + if (type == QUERY) + return query.begin; // Don't want delimiter counted. + cur = query.end(); + } + + if (ref.is_valid()) { + if (type == REF && !include_delimiter) + return ref.begin; // Don't want delimiter counted.
+ + // When there is a ref and we get here, the component we wanted was before + // this and not found, so we always know the beginning of the ref is right. + return ref.begin - 1; // Back over delimiter. + } + + return cur; +} + +bool ExtractScheme(const char* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +bool ExtractScheme(const char16* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +// This handles everything that may be an authority terminator, including +// backslash. For special backslash handling see DoParseAfterScheme. +bool IsAuthorityTerminator(char16 ch) { + return IsURLSlash(ch) || ch == '?' || ch == '#' || ch == ';'; +} + +void ExtractFileName(const char* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +void ExtractFileName(const char16* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +bool ExtractQueryKeyValue(const char16* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +// NOTE(review): |spec| is a non-const char16* here, unlike the other char16 +// overloads visible around it -- confirm this matches the header declaration. +void ParseAuthority(char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +int ParsePort(const char* url, const Component& port) { + return DoParsePort(url, port); +} + +int ParsePort(const char16* url, const Component& port) { + return
DoParsePort(url, port); +} + +void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParsePathURL(const char* url, int url_len, Parsed* parsed) { + DoParsePathURL(url, url_len, parsed); +} + +void ParsePathURL(const char16* url, int url_len, Parsed* parsed) { + DoParsePathURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParsePathInternal(const char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +void ParseAfterScheme(const char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +} // namespace url_parse diff --git a/googleurl/src/url_parse.h b/googleurl/src/url_parse.h new file mode 100644 index 0000000..bea2766 --- /dev/null +++ b/googleurl/src/url_parse.h @@ -0,0 +1,334 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_PARSE_H__ +#define GOOGLEURL_SRC_URL_PARSE_H__ + +#include <string> + +#include "base/basictypes.h" +#include "base/string16.h" + +namespace url_parse { + +// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and +// KURLGoogle.cpp still rely on this type. +typedef char16 UTF16Char; + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + // Returns begin + len, i.e. the offset just past the component. + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. 
Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { + return (len != -1); + } + + // Returns true if the given component is specified and non-empty. If false, + // the component is either empty or invalid. + bool is_nonempty() const { + return (len > 0); + } + + // Restores the default-constructed state: begin 0, len -1 (unspecified). + void reset() { + begin = 0; + len = -1; + } + + // Two components are equal when both begin and len match. + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points. The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately. +// +// Typical usage would be: +// +// url_parse::Parsed parsed; +// url_parse::Component scheme; +// if (!url_parse::ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// url_parse::ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// url_parse::ParseFileURL(url, url_len, &parsed); +// else +// url_parse::ParsePathURL(url, url_len, &parsed); +// +struct Parsed { + // Identifies different components. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components. + Parsed() {} + + // Returns the length of the URL (the end of the last component). 
+ // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flag values for two of + // these delimited components that are present (the port and query) and one + // that isn't (the reference). The components that this flag affects are + // marked with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password|. + Component username; + + // Password. 
The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name. Length will be -1 if + // unspecified. This includes the preceding slash, so the path on + // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to + // have a 0 length path, it will be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). If there are multiple hash signs, we'll use the last one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions cannot fail; they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. +// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct. +// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". 
This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +void ParseStandardURL(const char* url, int url_len, Parsed* parsed); +void ParseStandardURL(const char16* url, int url_len, Parsed* parsed); + +// PathURL is for when the scheme is known not to have an authority (host) +// section but the URL isn't a file URL either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:". +void ParsePathURL(const char* url, int url_len, Parsed* parsed); +void ParsePathURL(const char16* url, int url_len, Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these. +void ParseFileURL(const char* url, int url_len, Parsed* parsed); +void ParseFileURL(const char16* url, int url_len, Parsed* parsed); + +// MailtoURL is for mailto: URLs. They are made up of a scheme, path, and query. +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Parse* +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. +// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme. 
The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding. +bool ExtractScheme(const char* url, int url_len, Component* scheme); +bool ExtractScheme(const char16* url, int url_len, Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL. +bool IsAuthorityTerminator(char16 ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid. +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +// NOTE(review): |spec| is not const here, unlike the other 16-bit overloads in +// this header. Presumably an oversight; confirm before changing. +void ParseAuthority(char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the Parse* functions that +// fill in |Parsed| for the given input url. +// +// The return value will be a non-negative integer between 0 and 64K, or one of +// the two special values below. +enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +int ParsePort(const char* url, const Component& port); +int ParsePort(const char16* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. 
+// +// The 8-bit version requires UTF-8 encoding. +void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +void ExtractFileName(const char16* url, + const Component& path, + Component* file_name); + +// Extracts the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair. This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. +// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" or "foo.com?a&" will not be returned; this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value is found, |*key| and |*value| will be unchanged and it will +// return false. +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +bool ExtractQueryKeyValue(const char16* url, + Component* query, + Component* key, + Component* value); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_H__ diff --git a/googleurl/src/url_parse_file.cc b/googleurl/src/url_parse_file.cc new file mode 100644 index 0000000..2e8429f --- /dev/null +++ b/googleurl/src/url_parse_file.cc @@ -0,0 +1,243 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/logging.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse.h" +#include "googleurl/src/url_parse_internal.h" + +// Interesting IE file:isms... +// +// INPUT OUTPUT +// ========================= ============================== +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// When on a file: URL source page, this link will work. 
When over HTTP, +// the file: URL will appear in the status bar but the link will not work +// (security restriction for all file URLs). +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) +// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid, seems to be a file) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" doesn't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace url_parse { + +namespace { + +// A subcomponent of DoParseFileURL, the input of this function should be a UNC +// path name, with the index of the first character after the slashes following +// the scheme given in |after_slashes|. This will initialize the host, path, +// query, and ref, and leave the other output components untouched +// (DoParseFileURL handles these for us). +template<typename CHAR> +void DoParseUNC(const CHAR* spec, + int after_slashes, + int spec_len, + Parsed* parsed) { + int next_slash = FindNextSlash(spec, after_slashes, spec_len); + if (next_slash == spec_len) { + // No additional slash found, as in "file://foo", treat the text as the + // host with no path (this will end up being UNC to server "foo"). + int host_len = spec_len - after_slashes; + if (host_len) + parsed->host = Component(after_slashes, host_len); + else + parsed->host.reset(); + parsed->path.reset(); + return; + } + +#ifdef WIN32 + // See if we have something that looks like a path following the first + // component. As in "file://localhost/c:/", we get "c:/" out. We want to + // treat this as having no host but the path given. Works on Windows only. 
+ if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) { + parsed->host.reset(); + ParsePathInternal(spec, MakeRange(next_slash, spec_len), + &parsed->path, &parsed->query, &parsed->ref); + return; + } +#endif + + // Otherwise, everything up until that first slash we found is the host name, + // which will end up being the UNC host. For example "file://foo/bar.txt" + // will get a server name of "foo" and a path of "/bar.txt". Later, on + // Windows, this should be treated as the filename "\\foo\bar.txt" in proper + // UNC notation. + int host_len = next_slash - after_slashes; + if (host_len) + parsed->host = MakeRange(after_slashes, next_slash); + else + parsed->host.reset(); + // NOTE(review): the next_slash == spec_len case already returned above, so + // this condition looks always true and the else branch unreachable. Confirm + // that FindNextSlash never returns a value past spec_len. + if (next_slash < spec_len) { + ParsePathInternal(spec, MakeRange(next_slash, spec_len), + &parsed->path, &parsed->query, &parsed->ref); + } else { + parsed->path.reset(); + } +} + +// A subcomponent of DoParseFileURL, the input should be a local file, with the +// beginning of the path indicated by the index in |path_begin|. This will +// initialize the host, path, query, and ref, and leave the other output +// components untouched (DoParseFileURL handles these for us). +template<typename CHAR> +void DoParseLocalFile(const CHAR* spec, + int path_begin, + int spec_len, + Parsed* parsed) { + parsed->host.reset(); + ParsePathInternal(spec, MakeRange(path_begin, spec_len), + &parsed->path, &parsed->query, &parsed->ref); +} + +// Backend for the external functions that operates on either char type. +// We are handed the character after the "file:" at the beginning of the spec. +// Usually this is a slash, but needn't be; we allow paths like "file:c:\foo". +// NOTE(review): the body below trims |spec| and extracts the scheme from it +// itself, so the two sentences above look stale. Confirm. +template<typename CHAR> +void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the parts we never use for file URLs out of the way. 
+ parsed->username.reset(); + parsed->password.reset(); + parsed->port.reset(); + + // Many of the code paths don't set these, so it's convenient to just clear + // them. We'll write them in those cases we need them. + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Find the scheme. + int num_slashes; + int after_scheme; + int after_slashes; +#ifdef WIN32 + // See how many slashes there are. We want to handle cases like UNC but also + // "/c:/foo". This is when there is no scheme, so we can allow pages to do + // links like "c:/foo/bar" or "//foo/bar". This is also called by the + // relative URL resolver when it determines there is an absolute URL, which + // may give us input like "/c:/foo". + num_slashes = CountConsecutiveSlashes(spec, begin, spec_len); + after_slashes = begin + num_slashes; + if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) { + // Windows path, don't try to extract the scheme (for example, "c:\foo"). + parsed->scheme.reset(); + after_scheme = after_slashes; + } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) { + // Windows UNC path: don't try to extract the scheme, but keep the slashes. + parsed->scheme.reset(); + after_scheme = begin; + } else +#endif + { + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + after_scheme = parsed->scheme.end() + 1; + } else { + // No scheme found, remember that. + parsed->scheme.reset(); + after_scheme = begin; + } + } + + // Handle empty specs, ones that contain only whitespace or control chars, + // or that are just the scheme (for example "file:"). 
+ if (after_scheme == spec_len) { + parsed->host.reset(); + parsed->path.reset(); + return; + } + + num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + + after_slashes = after_scheme + num_slashes; +#ifdef WIN32 + // Check whether the input is a drive again. We checked above for windows + // drive specs, but that's only at the very beginning to see if we have a + // scheme at all. This test will be duplicated in that case, but will + // additionally handle all cases with a real scheme such as "file:///C:/". + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) && + num_slashes != 3) { + // Anything not beginning with a drive spec ("c:\") on Windows is treated + // as UNC, with the exception of three slashes which always means a file. + // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails. + DoParseUNC(spec, after_slashes, spec_len, parsed); + return; + } +#else + // file: URL with exactly 2 slashes is considered to have a host component. + if (num_slashes == 2) { + DoParseUNC(spec, after_slashes, spec_len, parsed); + return; + } +#endif // WIN32 + + // Easy and common case, the full path immediately follows the scheme + // (modulo slashes), as in "file://c:/foo". Just treat everything from + // there to the end as the path. Empty hosts have 0 length instead of -1. + // We include the last slash as part of the path if there is one. + // NOTE(review): DoParseLocalFile() calls host.reset(), which leaves len at + // -1, so the "0 length" sentence above looks stale. Confirm. + DoParseLocalFile(spec, + num_slashes > 0 ? 
after_scheme + num_slashes - 1 : after_scheme, + spec_len, parsed); +} + +} // namespace + +// Public 8-bit and 16-bit entry points; both forward to DoParseFileURL. +void ParseFileURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileURL(url, url_len, parsed); +} + +void ParseFileURL(const char16* url, int url_len, Parsed* parsed) { + DoParseFileURL(url, url_len, parsed); +} + +} // namespace url_parse diff --git a/googleurl/src/url_parse_internal.h b/googleurl/src/url_parse_internal.h new file mode 100644 index 0000000..61bd068 --- /dev/null +++ b/googleurl/src/url_parse_internal.h @@ -0,0 +1,112 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Contains common inline helper functions used by the URL parsing routines. + +#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ + +#include "googleurl/src/url_parse.h" + +namespace url_parse { + +// We treat slashes and backslashes the same for IE compatibility. +inline bool IsURLSlash(char16 ch) { + return ch == '/' || ch == '\\'; +} + +// Returns true if we should trim this character from the URL because it is a +// space or a control character. +inline bool ShouldTrimFromURL(char16 ch) { + return ch <= ' '; +} + +// Given an already-initialized begin index and length, this shrinks the range +// to eliminate "should-be-trimmed" characters. Note that the length does *not* +// indicate the length of untrimmed data from |*begin|, but rather the position +// in the input string (so the string starts at character |*begin| in the spec, +// and goes until |*len|). +template<typename CHAR> +inline void TrimURL(const CHAR* spec, int* begin, int* len) { + // Strip leading whitespace and control characters. + while (*begin < *len && ShouldTrimFromURL(spec[*begin])) + (*begin)++; + + // Strip trailing whitespace and control characters. We need the + // |*len > *begin| test for when the input string is all blanks; we don't + // want to back up past the input. 
+ while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) + (*len)--; +} + +// Counts the number of consecutive slashes starting at the given offset +// in the given string of the given length. +template<typename CHAR> +inline int CountConsecutiveSlashes(const CHAR *str, + int begin_offset, int str_len) { + int count = 0; + while (begin_offset + count < str_len && + IsURLSlash(str[begin_offset + count])) + ++count; + return count; +} + +// Internal functions in url_parse.cc that parse the path, that is, everything +// following the authority section. The input is the range of everything +// following the authority section, and the output is the identified ranges. +// +// This is designed for the file URL parser or other consumers who may do +// special stuff at the beginning, but want regular path parsing; it just +// maps to the internal parsing function for paths. +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); +void ParsePathInternal(const char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); + + +// Given a spec and the index of the character after the colon following the +// scheme, this parses it and fills in the structure. Every item in the parsed +// structure is filled EXCEPT for the scheme, which is untouched. +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed); +void ParseAfterScheme(const char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ diff --git a/googleurl/src/url_parse_unittest.cc b/googleurl/src/url_parse_unittest.cc new file mode 100644 index 0000000..219d5a0 --- /dev/null +++ b/googleurl/src/url_parse_unittest.cc @@ -0,0 +1,583 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/basictypes.h" +#include "googleurl/src/url_parse.h" +#include "testing/gtest/include/gtest/gtest.h" + +// Some implementations of base/basictypes.h may define ARRAYSIZE. +// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro +// which is in our version of basictypes.h. +#ifndef ARRAYSIZE +#define ARRAYSIZE ARRAYSIZE_UNSAFE +#endif + +// Interesting IE file:isms... +// +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? 
This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// / file:///C:/ +// /foo file:///C:/foo +// Interestingly, IE treats "/" as an alias for "c:\", which makes sense, +// but is weird to think about on Windows. +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) +// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" doesn't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace { + +// Used for regular URL parse cases. +struct URLParseCase { + const char* input; + + const char* scheme; + const char* username; + const char* password; + const char* host; + int port; + const char* path; + const char* query; + const char* ref; +}; + +// Simpler version of URLParseCase for testing path URLs. +struct PathURLParseCase { + const char* input; + + const char* scheme; + const char* path; +}; + +// Simpler version of URLParseCase for testing mailto URLs. +struct MailtoURLParseCase { + const char* input; + + const char* scheme; + const char* path; + const char* query; +}; + + +bool ComponentMatches(const char* input, + const char* reference, + const url_parse::Component& component) { + // The length must be either non-negative or exactly -1 (nonexistent). + EXPECT_TRUE(component.len >= 0 || component.len == -1); + + // Begin should be valid. + EXPECT_LE(0, component.begin); + + // A NULL reference means the component should be nonexistent. 
+ if (!reference) + return component.len == -1; + if (component.len < 0) + return false; // Reference is not NULL but we don't have anything + + if (strlen(reference) != static_cast<size_t>(component.len)) + return false; // Lengths don't match + + // Now check the actual characters. + return strncmp(reference, &input[component.begin], component.len) == 0; +} + +// Expects |component| to be in the reset state: begin 0, len -1. +void ExpectInvalidComponent(const url_parse::Component& component) { + EXPECT_EQ(0, component.begin); + EXPECT_EQ(-1, component.len); +} + +} // namespace + +// Parsed ---------------------------------------------------------------------- + +TEST(URLParser, Length) { + const char* length_cases[] = { + // One with everything in it. + "http://user:pass@host:99/foo?bar#baz", + // One with nothing in it. + "", + // Working backwards, let's start taking off stuff from the full one. + "http://user:pass@host:99/foo?bar#", + "http://user:pass@host:99/foo?bar", + "http://user:pass@host:99/foo?", + "http://user:pass@host:99/foo", + "http://user:pass@host:99/", + "http://user:pass@host:99", + "http://user:pass@host:", + "http://user:pass@host", + "http://host", + "http://user@", + "http:", + }; + // NOTE(review): this loop uses arraysize() while CountCharactersBefore uses + // the ARRAYSIZE macro. Presumably both are available here; confirm and unify. + for (size_t i = 0; i < arraysize(length_cases); i++) { + int true_length = static_cast<int>(strlen(length_cases[i])); + + url_parse::Parsed parsed; + url_parse::ParseStandardURL(length_cases[i], true_length, &parsed); + + EXPECT_EQ(true_length, parsed.Length()); + } +} + +TEST(URLParser, CountCharactersBefore) { + using namespace url_parse; + struct CountCase { + const char* url; + Parsed::ComponentType component; + bool include_delimiter; + int expected_count; + } count_cases[] = { + // Test each possibility in the case where all components are present. 
+// 0 1 2 +// 0123456789012345678901 + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0}, + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17}, + {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18}, + {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19}, + // Now test when the requested component is missing. + {"http://u:p@h:8/p?", Parsed::REF, true, 17}, + {"http://u:p@h:8/p?q", Parsed::REF, true, 18}, + {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8#r", Parsed::PATH, true, 14}, + {"http://u:p@h/", Parsed::PORT, true, 12}, + {"http://u:p@/", Parsed::HOST, true, 11}, + // This case is a little weird. It will report that the password would + // start where the host begins. This is arguably correct, although you + // could also argue that it should start at the '@' sign. Doing it + // starting with the '@' sign is actually harder, so we don't bother. + {"http://u@h/", Parsed::PASSWORD, true, 9}, + {"http://h/", Parsed::USERNAME, true, 7}, + {"http:", Parsed::USERNAME, true, 5}, + {"", Parsed::SCHEME, true, 0}, + // Make sure a random component still works when there's nothing there. + {"", Parsed::REF, true, 0}, + // File URLs are special with no host, so we test those. 
+ {"file:///c:/foo", Parsed::USERNAME, true, 7}, + {"file:///c:/foo", Parsed::PASSWORD, true, 7}, + {"file:///c:/foo", Parsed::HOST, true, 7}, + {"file:///c:/foo", Parsed::PATH, true, 7}, + }; + for (size_t i = 0; i < ARRAYSIZE(count_cases); i++) { + int length = static_cast<int>(strlen(count_cases[i].url)); + + // Simple test to distinguish file and standard URLs. + url_parse::Parsed parsed; + if (length > 0 && count_cases[i].url[0] == 'f') + url_parse::ParseFileURL(count_cases[i].url, length, &parsed); + else + url_parse::ParseStandardURL(count_cases[i].url, length, &parsed); + + int chars_before = parsed.CountCharactersBefore( + count_cases[i].component, count_cases[i].include_delimiter); + EXPECT_EQ(count_cases[i].expected_count, chars_before); + } +} + +// Standard -------------------------------------------------------------------- + +// Input Scheme Usrname Passwd Host Port Path Query Ref +// ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ ----- +static URLParseCase cases[] = { + // Regular URL with all the parts +{"http://user:pass@foo:21/bar;par?b#c", "http", "user", "pass", "foo", 21, "/bar;par","b", "c"}, + + // Known schemes should lean towards authority identification +{"http:foo.com", "http", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, + + // Spaces! +{"\t :foo.com \n", "", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{" foo.com ", NULL, NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{"a:\t foo.com", "a", NULL, NULL, "\t foo.com", -1, NULL, NULL, NULL}, +{"http://f:21/ b ? d # e ", "http", NULL, NULL, "f", 21, "/ b ", " d ", " e"}, + + // Invalid port numbers should be identified and turned into -2, empty port + // numbers should be -1. 
Spaces aren't allowed in port numbers +{"http://f:/c", "http", NULL, NULL, "f", -1, "/c", NULL, NULL}, +{"http://f:0/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000000000080/c", "http", NULL, NULL, "f", 80, "/c", NULL, NULL}, +{"http://f:b/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: /c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:\n/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:fifty-two/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:999999/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: 21 / b ? d # e ", "http", NULL, NULL, "f", -2, "/ b ", " d ", " e"}, + + // Creative URLs missing key elements +{"", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{" \t", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":foo.com/", "", NULL, NULL, "foo.com", -1, "/", NULL, NULL}, +{":foo.com\\", "", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{":", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":a", "", NULL, NULL, "a", -1, NULL, NULL, NULL}, +{":/", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":\\", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":#", "", NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#", NULL, NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#/", NULL, NULL, NULL, NULL, -1, NULL, NULL, "/"}, +{"#\\", NULL, NULL, NULL, NULL, -1, NULL, NULL, "\\"}, +{"#;?", NULL, NULL, NULL, NULL, -1, NULL, NULL, ";?"}, +{"?", NULL, NULL, NULL, NULL, -1, NULL, "", NULL}, +{"/", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":23", "", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"/:23", "/", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"//", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::23", "", NULL, NULL, NULL, 23, NULL, NULL, NULL}, +{"foo://", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + + // Username/passwords and things 
that look like them +{"http://a:b@c:29/d", "http", "a", "b", "c", 29, "/d", NULL, NULL}, +{"http::@c:29", "http", "", "", "c", 29, NULL, NULL, NULL}, + // ... "]" in the password field isn't allowed, but we tolerate it here... +{"http://&a:foo(b]c@d:2/", "http", "&a", "foo(b]c", "d", 2, "/", NULL, NULL}, +{"http://::@c@d:2", "http", "", ":@c", "d", 2, NULL, NULL, NULL}, +{"http://foo.com:b@d/", "http", "foo.com", "b", "d", -1, "/", NULL, NULL}, + +{"http://foo.com/\\@", "http", NULL, NULL, "foo.com", -1, "/\\@", NULL, NULL}, +{"http:\\\\foo.com\\", "http", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{"http:\\\\a\\b:c\\d@foo.com\\", "http", NULL, NULL, "a", -1, "\\b:c\\d@foo.com\\", NULL, NULL}, + + // Tolerate different numbers of slashes. +{"foo:/", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo:/bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo://///////", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo://///////bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo:////://///", "foo", NULL, NULL, NULL, -1, "/////", NULL, NULL}, + + // Raw file paths on Windows aren't handled by the parser. +{"c:/foo", "c", NULL, NULL, "foo", -1, NULL, NULL, NULL}, +{"//foo/bar", NULL, NULL, NULL, "foo", -1, "/bar", NULL, NULL}, + + // Use the first question mark for the query and the ref. +{"http://foo/path;a??e#f#g", "http", NULL, NULL, "foo", -1, "/path;a", "?e", "f#g"}, +{"http://foo/abcd?efgh?ijkl", "http", NULL, NULL, "foo", -1, "/abcd", "efgh?ijkl", NULL}, +{"http://foo/abcd#foo?bar", "http", NULL, NULL, "foo", -1, "/abcd", NULL, "foo?bar"}, + + // IPv6, check also interesting uses of colons. 
+{"[61:24:74]:98", "[61", NULL, NULL, "24:74]", 98, NULL, NULL, NULL}, +{"http://[61:27]:98", "http", NULL, NULL, "[61:27]", 98, NULL, NULL, NULL}, +{"http:[61:27]/:foo", "http", NULL, NULL, "[61:27]", -1, "/:foo", NULL, NULL}, +{"http://[1::2]:3:4", "http", NULL, NULL, "[1::2]:3", 4, NULL, NULL, NULL}, + + // Partially-complete IPv6 literals, and related cases. +{"http://2001::1", "http", NULL, NULL, "2001:", 1, NULL, NULL, NULL}, +{"http://[2001::1", "http", NULL, NULL, "[2001::1", -1, NULL, NULL, NULL}, +{"http://2001::1]", "http", NULL, NULL, "2001::1]", -1, NULL, NULL, NULL}, +{"http://2001::1]:80", "http", NULL, NULL, "2001::1]", 80, NULL, NULL, NULL}, +{"http://[2001::1]", "http", NULL, NULL, "[2001::1]", -1, NULL, NULL, NULL}, +{"http://[2001::1]:80", "http", NULL, NULL, "[2001::1]", 80, NULL, NULL, NULL}, +{"http://[[::]]", "http", NULL, NULL, "[[::]]", -1, NULL, NULL, NULL}, + +}; + +TEST(URLParser, Standard) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. + url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(cases); i++) { + const char* url = cases[i].input; + url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + int port = url_parse::ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, cases[i].username, parsed.username)); + EXPECT_TRUE(ComponentMatches(url, cases[i].password, parsed.password)); + EXPECT_TRUE(ComponentMatches(url, cases[i].host, parsed.host)); + EXPECT_EQ(cases[i].port, port); + EXPECT_TRUE(ComponentMatches(url, cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, cases[i].query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, cases[i].ref, parsed.ref)); + } +} + +// PathURL -------------------------------------------------------------------- + +// Various incarnations of path URLs. 
+static PathURLParseCase path_cases[] = { +{"", NULL, NULL}, +{":", "", NULL}, +{":/", "", "/"}, +{"/", NULL, "/"}, +{" This is \\interesting// \t", NULL, "This is \\interesting//"}, +{"about:", "about", NULL}, +{"about:blank", "about", "blank"}, +{" about: blank ", "about", " blank"}, +{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\");"}, +}; + +TEST(URLParser, PathURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. + url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(path_cases); i++) { + const char* url = path_cases[i].input; + url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed); + + EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.path)); + + // The remaining components are never used for path urls. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.host); + ExpectInvalidComponent(parsed.port); + ExpectInvalidComponent(parsed.query); + ExpectInvalidComponent(parsed.ref); + } +} + +#ifdef WIN32 + +// WindowsFile ---------------------------------------------------------------- + +// Various incarnations of file URLs. These are for Windows only. 
+static URLParseCase file_cases[] = { +{"file:server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL}, +{" file: server \t", "file", NULL, NULL, " server",-1, NULL, NULL, NULL}, +{"FiLe:c|", "FiLe", NULL, NULL, NULL, -1, "c|", NULL, NULL}, +{"FILE:/\\\\/server/file", "FILE", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL}, +{"file://localhost/c:/", "file", NULL, NULL, NULL, -1, "/c:/", NULL, NULL}, +{"file://127.0.0.1/c|\\", "file", NULL, NULL, NULL, -1, "/c|\\", NULL, NULL}, +{"file:/", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + // If there is a Windows drive letter, treat any number of slashes as the + // path part. +{"file:c:\\fo\\b", "file", NULL, NULL, NULL, -1, "c:\\fo\\b", NULL, NULL}, +{"file:/c:\\foo/bar", "file", NULL, NULL, NULL, -1, "/c:\\foo/bar",NULL, NULL}, +{"file://c:/f\\b", "file", NULL, NULL, NULL, -1, "/c:/f\\b", NULL, NULL}, +{"file:///C:/foo", "file", NULL, NULL, NULL, -1, "/C:/foo", NULL, NULL}, +{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL, -1, "/c:\\f\\b", NULL, NULL}, + // If there is not a drive letter, we should treat is as UNC EXCEPT for + // three slashes, which we treat as a Unix style path. +{"file:server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:/server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:///server/file", "file", NULL, NULL, NULL, -1, "/server/file",NULL, NULL}, +{"file://\\server/file", "file", NULL, NULL, NULL, -1, "\\server/file",NULL, NULL}, +{"file:////server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, + // Queries and refs are valid for file URLs as well. 
+{"file:///C:/foo.html?#", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "", ""}, +{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"}, +}; + +TEST(URLParser, WindowsFile) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. + url_parse::Parsed parsed; + for (int i = 0; i < arraysize(file_cases); i++) { + const char* url = file_cases[i].input; + url_parse::ParseFileURL(url, static_cast<int>(strlen(url)), &parsed); + int port = url_parse::ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].username, parsed.username)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].password, parsed.password)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].host, parsed.host)); + EXPECT_EQ(file_cases[i].port, port); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].ref, parsed.ref)); + } +} + +#endif // WIN32 + +TEST(URLParser, ExtractFileName) { + struct FileCase { + const char* input; + const char* expected; + } file_cases[] = { + {"http://www.google.com", NULL}, + {"http://www.google.com/", ""}, + {"http://www.google.com/search", "search"}, + {"http://www.google.com/search/", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "bar.html"}, + {"http://www.google.com/foo/bar.html#ref", "bar.html"}, + {"http://www.google.com/search/;param", ""}, + {"http://www.google.com/foo/bar.html;param#ref", "bar.html"}, + {"http://www.google.com/foo/bar.html;foo;param#ref", "bar.html;foo"}, + {"http://www.google.com/foo/bar.html?query#ref", "bar.html"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(file_cases); i++) { + const char* url = file_cases[i].input; + int len = static_cast<int>(strlen(url)); + + 
url_parse::Parsed parsed; + url_parse::ParseStandardURL(url, len, &parsed); + + url_parse::Component file_name; + url_parse::ExtractFileName(url, parsed.path, &file_name); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].expected, file_name)); + } +} + +// Returns true if the parameter with index |parameter| in the given URL's +// query string. The expected key can be NULL to indicate no such key index +// should exist. The parameter number is 1-based. +static bool NthParameterIs(const char* url, + int parameter, + const char* expected_key, + const char* expected_value) { + url_parse::Parsed parsed; + url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + + url_parse::Component query = parsed.query; + + for (int i = 1; i <= parameter; i++) { + url_parse::Component key, value; + if (!url_parse::ExtractQueryKeyValue(url, &query, &key, &value)) { + if (parameter >= i && !expected_key) + return true; // Expected nonexistant key, got one. + return false; // Not enough keys. + } + + if (i == parameter) { + if (!expected_key) + return false; + + if (strncmp(&url[key.begin], expected_key, key.len) != 0) + return false; + if (strncmp(&url[value.begin], expected_value, value.len) != 0) + return false; + return true; + } + } + return expected_key == NULL; // We didn't find that many parameters. +} + +TEST(URLParser, ExtractQueryKeyValue) { + EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, NULL, NULL)); + + // Basic case. + char a[] = "http://www.google.com?arg1=1&arg2=2&bar"; + EXPECT_TRUE(NthParameterIs(a, 1, "arg1", "1")); + EXPECT_TRUE(NthParameterIs(a, 2, "arg2", "2")); + EXPECT_TRUE(NthParameterIs(a, 3, "bar", "")); + EXPECT_TRUE(NthParameterIs(a, 4, NULL, NULL)); + + // Empty param at the end. + char b[] = "http://www.google.com?foo=bar&"; + EXPECT_TRUE(NthParameterIs(b, 1, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(b, 2, NULL, NULL)); + + // Empty param at the beginning. 
+ char c[] = "http://www.google.com?&foo=bar"; + EXPECT_TRUE(NthParameterIs(c, 1, "", "")); + EXPECT_TRUE(NthParameterIs(c, 2, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(c, 3, NULL, NULL)); + + // Empty key with value. + char d[] = "http://www.google.com?=foo"; + EXPECT_TRUE(NthParameterIs(d, 1, "", "foo")); + EXPECT_TRUE(NthParameterIs(d, 2, NULL, NULL)); + + // Empty value with key. + char e[] = "http://www.google.com?foo="; + EXPECT_TRUE(NthParameterIs(e, 1, "foo", "")); + EXPECT_TRUE(NthParameterIs(e, 2, NULL, NULL)); + + // Empty key and values. + char f[] = "http://www.google.com?&&==&="; + EXPECT_TRUE(NthParameterIs(f, 1, "", "")); + EXPECT_TRUE(NthParameterIs(f, 2, "", "")); + EXPECT_TRUE(NthParameterIs(f, 3, "", "=")); + EXPECT_TRUE(NthParameterIs(f, 4, "", "")); + EXPECT_TRUE(NthParameterIs(f, 5, NULL, NULL)); +} + +// MailtoURL -------------------------------------------------------------------- + +static MailtoURLParseCase mailto_cases[] = { +//|input |scheme |path |query +{"mailto:foo@gmail.com", "mailto", "foo@gmail.com", NULL}, +{" mailto: to \t", "mailto", " to", NULL}, +{"mailto:addr1%2C%20addr2 ", "mailto", "addr1%2C%20addr2", NULL}, +{"Mailto:addr1, addr2 ", "Mailto", "addr1, addr2", NULL}, +{"mailto:addr1:addr2 ", "mailto", "addr1:addr2", NULL}, +{"mailto:?to=addr1,addr2", "mailto", NULL, "to=addr1,addr2"}, +{"mailto:?to=addr1%2C%20addr2", "mailto", NULL, "to=addr1%2C%20addr2"}, +{"mailto:addr1?to=addr2", "mailto", "addr1", "to=addr2"}, +{"mailto:?body=#foobar#", "mailto", NULL, "body=#foobar#",}, +{"mailto:#?body=#foobar#", "mailto", "#", "body=#foobar#"}, +}; + +TEST(URLParser, MailtoUrl) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. 
+ url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(mailto_cases); ++i) { + const char* url = mailto_cases[i].input; + url_parse::ParseMailtoURL(url, static_cast<int>(strlen(url)), &parsed); + int port = url_parse::ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query)); + EXPECT_EQ(url_parse::PORT_UNSPECIFIED, port); + + // The remaining components are never used for mailto urls. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.port); + ExpectInvalidComponent(parsed.ref); + } +} diff --git a/googleurl/src/url_test_utils.h b/googleurl/src/url_test_utils.h new file mode 100644 index 0000000..5294202 --- /dev/null +++ b/googleurl/src/url_test_utils.h @@ -0,0 +1,85 @@ +// Copyright 2007 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Convenience functions for string conversions. +// These are mostly intended for use in unit tests. + +#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__ +#define GOOGLEURL_SRC_URL_TEST_UTILS_H__ + +#include <string> + +#include "base/string16.h" +#include "googleurl/src/url_canon_internal.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace url_test_utils { + +// Converts a UTF-16 string from native wchar_t format to char16, by +// truncating the high 32 bits. This is not meant to handle true UTF-32 +// encoded strings. 
+inline string16 WStringToUTF16(const wchar_t* src) { + string16 str; + int length = static_cast<int>(wcslen(src)); + for (int i = 0; i < length; ++i) { + str.push_back(static_cast<char16>(src[i])); + } + return str; +} + +// Converts a string from UTF-8 to UTF-16 +inline string16 ConvertUTF8ToUTF16(const std::string& src) { + int length = static_cast<int>(src.length()); + EXPECT_LT(length, 1024); + url_canon::RawCanonOutputW<1024> output; + EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output)); + return string16(output.data(), output.length()); +} + +// Converts a string from UTF-16 to UTF-8 +inline std::string ConvertUTF16ToUTF8(const string16& src) { + std::string str; + url_canon::StdStringCanonOutput output(&str); + EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(), + static_cast<int>(src.length()), + &output)); + output.Complete(); + return str; +} + +} // namespace url_test_utils + +// This operator allows EXPECT_EQ(astring16, anotherstring16); to work. +inline std::ostream& operator<<(std::ostream& os, + const string16& str) { + // Convert to UTF-8 and print the string + return os << url_test_utils::ConvertUTF16ToUTF8(str); +} + +#endif // GOOGLEURL_SRC_URL_TEST_UTILS_H__ diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc new file mode 100644 index 0000000..d623b45 --- /dev/null +++ b/googleurl/src/url_util.cc @@ -0,0 +1,453 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <string.h> +#include <vector> + +#include "googleurl/src/url_util.h" + +#include "base/logging.h" +#include "googleurl/src/url_file.h" + +namespace url_util { + +namespace { + +// ASCII-specific tolower. The standard library's tolower is locale sensitive, +// so we don't want to use it here. +template <class Char> inline Char ToLowerASCII(Char c) { + return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; +} + +// Backend for LowerCaseEqualsASCII. +template<typename Iter> +inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) { + for (Iter it = a_begin; it != a_end; ++it, ++b) { + if (!*b || ToLowerASCII(*it) != *b) + return false; + } + return *b == 0; +} + +const char kFileScheme[] = "file"; // Used in a number of places. +const char kMailtoScheme[] = "mailto"; + +const int kNumStandardURLSchemes = 5; +const char* kStandardURLSchemes[kNumStandardURLSchemes] = { + "http", + "https", + kFileScheme, // Yes, file urls can have a hostname! 
+ "ftp", + "gopher", +}; + +// List of the currently installed standard schemes. This list is lazily +// initialized by InitStandardSchemes and is leaked on shutdown to prevent +// any destructors from being called that will slow us down or cause problems. +std::vector<const char*>* standard_schemes = NULL; + +// Ensures that the standard_schemes list is initialized, does nothing if it +// already has values. +void InitStandardSchemes() { + if (standard_schemes) + return; + standard_schemes = new std::vector<const char*>; + for (int i = 0; i < kNumStandardURLSchemes; i++) + standard_schemes->push_back(kStandardURLSchemes[i]); +} + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. +template<typename CHAR> +inline bool CompareSchemeComponent(const CHAR* spec, + const url_parse::Component& component, + const char* compare_to) { + if (!component.is_nonempty()) + return compare_to[0] == 0; // When component is empty, match empty scheme. + return LowerCaseEqualsASCII(&spec[component.begin], + &spec[component.end()], + compare_to); +} + +// Returns true if the given scheme identified by |scheme| within |spec| is one +// of the registered "standard" schemes. Note that this does not check for +// "://", use IsStandard for that. +template<typename CHAR> +bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) { + if (!scheme.is_nonempty()) + return false; // Empty or invalid schemes are non-standard. + + InitStandardSchemes(); + for (size_t i = 0; i < standard_schemes->size(); i++) { + if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()], + standard_schemes->at(i))) + return true; + } + return false; +} + +// Returns true if the stuff following the scheme in the given spec indicates +// a "standard" URL. The presence of "://" after the scheme indicates that +// there is a hostname, etc. which we call a standard URL. 
+template<typename CHAR> +bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len, + const url_parse::Component& scheme) { + int after_scheme = scheme.end(); + if (spec_len < after_scheme + 3) + return false; + return spec[after_scheme] == ':' && + spec[after_scheme + 1] == '/' && + spec[after_scheme + 2] == '/'; +} + +template<typename CHAR> +bool DoIsStandard(const CHAR* spec, int spec_len, + const url_parse::Component& scheme) { + return HasStandardSchemeSeparator(spec, spec_len, scheme) || + IsStandardScheme(spec, scheme); +} + +template<typename CHAR> +bool DoFindAndCompareScheme(const CHAR* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme) { + url_parse::Component our_scheme; + if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) { + // No scheme. + if (found_scheme) + *found_scheme = url_parse::Component(); + return false; + } + if (found_scheme) + *found_scheme = our_scheme; + return CompareSchemeComponent(str, our_scheme, compare); +} + +template<typename CHAR> +bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { + // Remove any whitespace from the middle of the relative URL, possibly + // copying to the new buffer. + url_canon::RawCanonOutputT<CHAR> whitespace_buffer; + int spec_len; + const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len, + &whitespace_buffer, &spec_len); + + url_parse::Parsed parsed_input; +#ifdef WIN32 + // For Windows, we allow things that look like absolute Windows paths to be + // fixed up magically to file URLs. This is done for IE compatability. For + // example, this will change "c:/foo" into a file URL rather than treating + // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt"). 
+ // There is similar logic in url_canon_relative.cc for + // + // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which + // has no meaning as an absolute path name. This is because browsers on Mac + // & Unix don't generally do this, so there is no compatibility reason for + // doing so. + if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) || + url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) { + url_parse::ParseFileURL(spec, spec_len, &parsed_input); + return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input, + charset_converter, + output, output_parsed); + } +#endif + + url_parse::Component scheme; + if(!url_parse::ExtractScheme(spec, spec_len, &scheme)) + return false; + + // This is the parsed version of the input URL, we have to canonicalize it + // before storing it in our object. + bool success; + if (CompareSchemeComponent(spec, scheme, kFileScheme)) { + // File URLs are special. + url_parse::ParseFileURL(spec, spec_len, &parsed_input); + success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input, + charset_converter, + output, output_parsed); + + } else if (IsStandard(spec, spec_len, scheme)) { + // All "normal" URLs. 
// NOTE(review): this span starts part-way through a function body (presumably
// DoCanonicalize; its opening lines are above this span). The tail is kept as-is.
    url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
    success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
                                                 charset_converter,
                                                 output, output_parsed);

  } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {
    // Mailto are treated like a standard url with only a scheme, path, query
    url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
    success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
                                               output, output_parsed);

  } else {
    // "Weird" URLs like data: and javascript:
    url_parse::ParsePathURL(spec, spec_len, &parsed_input);
    success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
                                             output, output_parsed);
  }
  return success;
}

// Resolves |in_relative| against the already-parsed base URL.
//
// The input first has whitespace removed by RemoveURLWhitespace().
// url_canon::IsRelativeURL() then decides whether it is relative; if that
// check reports an error this function returns false. A relative input is
// handed to url_canon::ResolveRelativeURL(); anything else is canonicalized
// on its own through DoCanonicalize(). The return value is whatever the
// chosen callee returns.
template<typename CHAR>
bool DoResolveRelative(const char* base_spec,
                       int base_spec_len,
                       const url_parse::Parsed& base_parsed,
                       const CHAR* in_relative,
                       int in_relative_length,
                       url_canon::CharsetConverter* charset_converter,
                       url_canon::CanonOutput* output,
                       url_parse::Parsed* output_parsed) {
  // Remove any whitespace from the middle of the relative URL, possibly
  // copying to the new buffer.
  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
  int relative_length;
  const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
                                             &whitespace_buffer,
                                             &relative_length);

  // See if our base URL should be treated as "standard".
  bool standard_base_scheme =
      base_parsed.scheme.is_nonempty() &&
      IsStandard(base_spec, base_spec_len, base_parsed.scheme);

  bool is_relative;
  url_parse::Component relative_component;
  if (!url_canon::IsRelativeURL(base_spec, base_parsed,
                                relative, relative_length,
                                standard_base_scheme,
                                &is_relative,
                                &relative_component)) {
    // Error resolving.
    return false;
  }

  if (is_relative) {
    // Relative, resolve and canonicalize.
    // The resolver is told whether the base uses the file scheme.
    bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
        CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
    return url_canon::ResolveRelativeURL(base_spec, base_parsed,
                                         file_base_scheme, relative,
                                         relative_component, charset_converter,
                                         output, output_parsed);
  }

  // Not relative, canonicalize the input.
  return DoCanonicalize(relative, relative_length, charset_converter,
                        output, output_parsed);
}

// Applies |replacements| to the URL in |spec|/|parsed| by forwarding to the
// url_canon::Replace*URL() function that matches the scheme of the resulting
// URL. The checks run in order: file, standard, mailto; anything else goes to
// ReplacePathURL(). The return value is that of the chosen callee.
template<typename CHAR>
bool DoReplaceComponents(const char* spec,
                         int spec_len,
                         const url_parse::Parsed& parsed,
                         const url_canon::Replacements<CHAR>& replacements,
                         url_canon::CharsetConverter* charset_converter,
                         url_canon::CanonOutput* output,
                         url_parse::Parsed* out_parsed) {
  // Note that we dispatch to the parser according to the scheme type of
  // the OUTPUT URL. Normally, this is the same as our scheme, but if the
  // scheme is being overridden, we need to test that.

  if (// Either the scheme is not replaced and the old one is a file,
      (!replacements.IsSchemeOverridden() &&
       CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||
      // ...or it is being replaced and the new one is a file.
      (replacements.IsSchemeOverridden() &&
       CompareSchemeComponent(replacements.sources().scheme,
                              replacements.components().scheme,
                              kFileScheme))) {
    return url_canon::ReplaceFileURL(spec, parsed, replacements,
                                     charset_converter, output, out_parsed);
  }

  if (// Either the scheme is not replaced and the old one is standard,
      (!replacements.IsSchemeOverridden() &&
       IsStandard(spec, spec_len, parsed.scheme)) ||
      // ...or it is being replaced and the new one is standard.
      (replacements.IsSchemeOverridden() &&
       IsStandardScheme(replacements.sources().scheme,
                        replacements.components().scheme))) {
    // Standard URL with all parts.
    return url_canon::ReplaceStandardURL(spec, parsed, replacements,
                                         charset_converter, output, out_parsed);
  }

  if (// Either the scheme is not replaced and the old one is mailto,
      (!replacements.IsSchemeOverridden() &&
       CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||
      // ...or it is being replaced and the new one is a mailto.
      (replacements.IsSchemeOverridden() &&
       CompareSchemeComponent(replacements.sources().scheme,
                              replacements.components().scheme,
                              kMailtoScheme))) {
    return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
                                       output, out_parsed);
  }

  // Neither file, standard nor mailto: treat the result as a path URL.
  return url_canon::ReplacePathURL(spec, parsed, replacements,
                                   output, out_parsed);
}

}  // namespace

// Appends a heap-allocated copy of |new_scheme| to the standard scheme list.
// An empty string is ignored.
void AddStandardScheme(const char* new_scheme) {
  size_t scheme_len = strlen(new_scheme);
  if (scheme_len == 0)
    return;

  // Duplicate the scheme into a new buffer and add it to the list of standard
  // schemes. This pointer will be leaked on shutdown.
  char* dup_scheme = new char[scheme_len + 1];
  // Copy scheme_len + 1 bytes so the terminating NUL comes along.
  memcpy(dup_scheme, new_scheme, scheme_len + 1);

  InitStandardSchemes();
  standard_schemes->push_back(dup_scheme);
}

// Front-ends for DoIsStandard (char and char16 input).
bool IsStandard(const char* spec, int spec_len,
                const url_parse::Component& scheme) {
  return DoIsStandard(spec, spec_len, scheme);
}

bool IsStandard(const char16* spec, int spec_len,
                const url_parse::Component& scheme) {
  return DoIsStandard(spec, spec_len, scheme);
}

// Front-ends for DoFindAndCompareScheme (char and char16 input).
bool FindAndCompareScheme(const char* str,
                          int str_len,
                          const char* compare,
                          url_parse::Component* found_scheme) {
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
}

bool FindAndCompareScheme(const char16* str,
                          int str_len,
                          const char* compare,
                          url_parse::Component* found_scheme) {
  return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
}

// Front-ends for DoCanonicalize (char and char16 input).
bool Canonicalize(const char* spec,
                  int spec_len,
                  url_canon::CharsetConverter* charset_converter,
                  url_canon::CanonOutput* output,
                  url_parse::Parsed* output_parsed) {
  return DoCanonicalize(spec, spec_len, charset_converter,
                        output, output_parsed);
}

bool Canonicalize(const char16* spec,
                  int spec_len,
                  url_canon::CharsetConverter* charset_converter,
                  url_canon::CanonOutput* output,
                  url_parse::Parsed* output_parsed) {
  return DoCanonicalize(spec, spec_len, charset_converter,
                        output, output_parsed);
}

// Front-ends for DoResolveRelative. The base is always char; only the
// relative input varies between char and char16.
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const url_parse::Parsed& base_parsed,
                     const char* relative,
                     int relative_length,
                     url_canon::CharsetConverter* charset_converter,
                     url_canon::CanonOutput* output,
                     url_parse::Parsed* output_parsed) {
  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
                           relative, relative_length,
                           charset_converter, output, output_parsed);
}

bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const url_parse::Parsed& base_parsed,
                     const char16* relative,
                     int relative_length,
                     url_canon::CharsetConverter* charset_converter,
                     url_canon::CanonOutput* output,
                     url_parse::Parsed* output_parsed) {
  return DoResolveRelative(base_spec, base_spec_len, base_parsed,
                           relative, relative_length,
                           charset_converter, output, output_parsed);
}

// Front-ends for DoReplaceComponents. The spec is always char; only the
// replacement sources vary between char and char16.
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const url_parse::Parsed& parsed,
                       const url_canon::Replacements<char>& replacements,
                       url_canon::CharsetConverter* charset_converter,
                       url_canon::CanonOutput* output,
                       url_parse::Parsed* out_parsed) {
  return DoReplaceComponents(spec, spec_len, parsed, replacements,
                             charset_converter, output, out_parsed);
}

bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const url_parse::Parsed& parsed,
                       const url_canon::Replacements<char16>& replacements,
                       url_canon::CharsetConverter* charset_converter,
                       url_canon::CanonOutput* output,
                       url_parse::Parsed* out_parsed) {
  return DoReplaceComponents(spec, spec_len, parsed, replacements,
                             charset_converter, output, out_parsed);
}

// Front-ends for LowerCaseEqualsASCII.
+bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b_begin, + const char* b_end) { + while (a_begin != a_end && b_begin != b_end && + ToLowerASCII(*a_begin) == *b_begin) { + a_begin++; + b_begin++; + } + return a_begin == a_end && b_begin == b_end; +} + +bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +} // namespace url_util diff --git a/googleurl/src/url_util.h b/googleurl/src/url_util.h new file mode 100644 index 0000000..62813a6 --- /dev/null +++ b/googleurl/src/url_util.h @@ -0,0 +1,170 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef GOOGLEURL_SRC_URL_UTIL_H__
#define GOOGLEURL_SRC_URL_UTIL_H__

#include <string>

#include "base/string16.h"
#include "googleurl/src/url_parse.h"
#include "googleurl/src/url_canon.h"

namespace url_util {

// Schemes --------------------------------------------------------------------

// Adds an application-defined scheme to the internal list of "standard" URL
// schemes.
void AddStandardScheme(const char* new_scheme);

// Locates the scheme in the given string and places it into |found_scheme|,
// which may be NULL to indicate the caller does not care about the range.
// Returns whether the given |compare| scheme matches the scheme found in the
// input (if any).
bool FindAndCompareScheme(const char* str,
                          int str_len,
                          const char* compare,
                          url_parse::Component* found_scheme);
bool FindAndCompareScheme(const char16* str,
                          int str_len,
                          const char* compare,
                          url_parse::Component* found_scheme);
// Convenience overloads for std::string and string16. They forward the
// string's data() and size() (cast to int) to the pointer/length versions.
inline bool FindAndCompareScheme(const std::string& str,
                                 const char* compare,
                                 url_parse::Component* found_scheme) {
  return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
                              compare, found_scheme);
}
inline bool FindAndCompareScheme(const string16& str,
                                 const char* compare,
                                 url_parse::Component* found_scheme) {
  return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
                              compare, found_scheme);
}

// Returns true if the given string represents a standard URL.
// This means that
// either the scheme is in the list of known standard schemes, or there is a
// "://" following the scheme.
bool IsStandard(const char* spec, int spec_len,
                const url_parse::Component& scheme);
bool IsStandard(const char16* spec, int spec_len,
                const url_parse::Component& scheme);

// URL library wrappers -------------------------------------------------------

// Parses the given spec according to the extracted scheme type. Normal users
// should use the URL object, although this may be useful if performance is
// critical and you don't want to do the heap allocation for the std::string.
//
// As with the url_canon::Canonicalize* functions, the charset converter can
// be NULL to use UTF-8 (it will be faster in this case).
//
// Returns true if a valid URL was produced, false if not. On failure, the
// output and parsed structures will still be filled and will be consistent,
// but they will not represent a loadable URL.
bool Canonicalize(const char* spec,
                  int spec_len,
                  url_canon::CharsetConverter* charset_converter,
                  url_canon::CanonOutput* output,
                  url_parse::Parsed* output_parsed);
bool Canonicalize(const char16* spec,
                  int spec_len,
                  url_canon::CharsetConverter* charset_converter,
                  url_canon::CanonOutput* output,
                  url_parse::Parsed* output_parsed);

// Resolves a potentially relative URL relative to the given parsed base URL.
// The base MUST be valid. The resulting canonical URL and parsed information
// will be placed into the given out variables.
//
// The relative need not be relative. If we discover that it's absolute, this
// will produce a canonical version of that URL. See Canonicalize() for more
// about the charset_converter.
//
// Returns true if the output is valid, false if the input could not produce
// a valid URL.
// Two overloads: the base spec is always char, while the relative input may
// be char or char16.
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const url_parse::Parsed& base_parsed,
                     const char* relative,
                     int relative_length,
                     url_canon::CharsetConverter* charset_converter,
                     url_canon::CanonOutput* output,
                     url_parse::Parsed* output_parsed);
bool ResolveRelative(const char* base_spec,
                     int base_spec_len,
                     const url_parse::Parsed& base_parsed,
                     const char16* relative,
                     int relative_length,
                     url_canon::CharsetConverter* charset_converter,
                     url_canon::CanonOutput* output,
                     url_parse::Parsed* output_parsed);

// Replaces components in the given VALID input url. The new canonical URL info
// is written to output and out_parsed.
//
// Returns true if the resulting URL is valid.
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const url_parse::Parsed& parsed,
                       const url_canon::Replacements<char>& replacements,
                       url_canon::CharsetConverter* charset_converter,
                       url_canon::CanonOutput* output,
                       url_parse::Parsed* out_parsed);
bool ReplaceComponents(const char* spec,
                       int spec_len,
                       const url_parse::Parsed& parsed,
                       const url_canon::Replacements<char16>& replacements,
                       url_canon::CharsetConverter* charset_converter,
                       url_canon::CanonOutput* output,
                       url_parse::Parsed* out_parsed);

// String helper functions ----------------------------------------------------

// Compare the lower-case form of the given string against the given ASCII
// string. This is useful for checking whether an input string matches some
// token, and it is optimized to avoid intermediate string copies.
//
// The versions of this function that don't take a b_end assume that the b
// string is NULL terminated.
// 8-bit input, |b| without an end pointer.
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b);
// 8-bit input, |b| given as the explicit range [b_begin, b_end).
bool LowerCaseEqualsASCII(const char* a_begin,
                          const char* a_end,
                          const char* b_begin,
                          const char* b_end);
// 16-bit input, |b| without an end pointer.
bool LowerCaseEqualsASCII(const char16* a_begin,
                          const char16* a_end,
                          const char* b);

}  // namespace url_util

#endif  // GOOGLEURL_SRC_URL_UTIL_H__
diff --git a/googleurl/src/url_util_unittest.cc b/googleurl/src/url_util_unittest.cc new file mode 100644 index 0000000..12e5254 --- /dev/null +++ b/googleurl/src/url_util_unittest.cc @@ -0,0 +1,98 @@
// Copyright 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// NOTE(review): strlen() is used below but no <string.h>/<cstring> include is
// visible here; presumably it arrives transitively -- confirm.
#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_stdstring.h"
#include "googleurl/src/url_parse.h"
#include "googleurl/src/url_util.h"
#include "testing/gtest/include/gtest/gtest.h"

// Exercises the char overload of FindAndCompareScheme: both its boolean result
// and the component it writes to |found_scheme|. |found_scheme| is reused
// across the calls, so every call is followed by a check of its new value.
TEST(URLUtilTest, FindAndCompareScheme) {
  url_parse::Component found_scheme;

  // Simple case where the scheme is found and matches.
  const char kStr1[] = "http://www.com/";
  // First with a NULL out-parameter, then with a real one.
  EXPECT_TRUE(url_util::FindAndCompareScheme(
      kStr1, static_cast<int>(strlen(kStr1)), "http", NULL));
  EXPECT_TRUE(url_util::FindAndCompareScheme(
      kStr1, static_cast<int>(strlen(kStr1)), "http", &found_scheme));
  EXPECT_TRUE(found_scheme == url_parse::Component(0, 4));

  // A case where the scheme is found and doesn't match.
  // The located component is still expected to be reported.
  EXPECT_FALSE(url_util::FindAndCompareScheme(
      kStr1, static_cast<int>(strlen(kStr1)), "https", &found_scheme));
  EXPECT_TRUE(found_scheme == url_parse::Component(0, 4));

  // A case where there is no scheme.
  const char kStr2[] = "httpfoobar";
  EXPECT_FALSE(url_util::FindAndCompareScheme(
      kStr2, static_cast<int>(strlen(kStr2)), "http", &found_scheme));
  EXPECT_TRUE(found_scheme == url_parse::Component());

  // When there is an empty scheme, it should match the empty scheme.
  const char kStr3[] = ":foo.com/";
  EXPECT_TRUE(url_util::FindAndCompareScheme(
      kStr3, static_cast<int>(strlen(kStr3)), "", &found_scheme));
  EXPECT_TRUE(found_scheme == url_parse::Component(0, 0));

  // But when there is no scheme, it should fail.
  EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &found_scheme));
  EXPECT_TRUE(found_scheme == url_parse::Component());
}

// Smoke test: calls ReplaceComponents with a NULL spec and with an empty spec,
// using a default-constructed |parsed|. Return values are not checked; the
// test passes as long as nothing crashes.
TEST(URLUtilTest, ReplaceComponents) {
  url_parse::Parsed parsed;
  url_canon::RawCanonOutputT<char> output;
  url_parse::Parsed new_parsed;

  // Check that the following calls do not cause crash
  // Ref replacement only.
  url_canon::Replacements<char> replacements;
  replacements.SetRef("test", url_parse::Component(0, 4));
  url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
                              &new_parsed);
  url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
                              &new_parsed);
  // Host replacement only.
  replacements.ClearRef();
  replacements.SetHost("test", url_parse::Component(0, 4));
  url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
                              &new_parsed);
  url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
                              &new_parsed);

  // Neither ref nor host set. The NULL/"" pair of calls is issued twice with
  // identical arguments.
  replacements.ClearHost();
  url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
                              &new_parsed);
  url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
                              &new_parsed);
  url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output,
                              &new_parsed);
  url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output,
                              &new_parsed);
}

|