summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbrettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-04-10 20:10:52 +0000
committerbrettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-04-10 20:10:52 +0000
commite7bba5f84f6ef996d0d16621bacc4b84adbc51e0 (patch)
tree3d5abf63b86c9c08369d7410aeb22a391719e171
parent495a448b3b3104301ebf3e63fd0079284126f6d8 (diff)
downloadchromium_src-e7bba5f84f6ef996d0d16621bacc4b84adbc51e0.zip
chromium_src-e7bba5f84f6ef996d0d16621bacc4b84adbc51e0.tar.gz
chromium_src-e7bba5f84f6ef996d0d16621bacc4b84adbc51e0.tar.bz2
Move googleurl into the Chrome repo.
Original location: https://code.google.com/p/google-url/ This includes changes up to r184. These files are unchanged from the Google Code repo and do not yet build. Updating includes, etc. will be done in a separate pass. Review URL: https://codereview.chromium.org/13821004 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@193439 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--url/OWNERS1
-rw-r--r--url/googleurl.gyp109
-rw-r--r--url/gurl.cc529
-rw-r--r--url/gurl.h392
-rw-r--r--url/gurl_test_main.cc102
-rw-r--r--url/gurl_unittest.cc488
-rw-r--r--url/url_canon.h912
-rw-r--r--url/url_canon_etc.cc392
-rw-r--r--url/url_canon_filesystemurl.cc158
-rw-r--r--url/url_canon_fileurl.cc215
-rw-r--r--url/url_canon_host.cc401
-rw-r--r--url/url_canon_icu.cc210
-rw-r--r--url/url_canon_icu.h63
-rw-r--r--url/url_canon_internal.cc427
-rw-r--r--url/url_canon_internal.h461
-rw-r--r--url/url_canon_internal_file.h157
-rw-r--r--url/url_canon_ip.cc730
-rw-r--r--url/url_canon_ip.h109
-rw-r--r--url/url_canon_mailtourl.cc137
-rw-r--r--url/url_canon_path.cc378
-rw-r--r--url/url_canon_pathurl.cc128
-rw-r--r--url/url_canon_query.cc189
-rw-r--r--url/url_canon_relative.cc579
-rw-r--r--url/url_canon_stdstring.h134
-rw-r--r--url/url_canon_stdurl.cc211
-rw-r--r--url/url_canon_unittest.cc2133
-rw-r--r--url/url_common.h54
-rw-r--r--url/url_file.h108
-rw-r--r--url/url_parse.cc923
-rw-r--r--url/url_parse.h373
-rw-r--r--url/url_parse_file.cc243
-rw-r--r--url/url_parse_internal.h112
-rw-r--r--url/url_parse_unittest.cc649
-rw-r--r--url/url_test_utils.h78
-rw-r--r--url/url_util.cc618
-rw-r--r--url/url_util.h228
-rw-r--r--url/url_util_internal.h56
-rw-r--r--url/url_util_unittest.cc310
38 files changed, 13497 insertions, 0 deletions
diff --git a/url/OWNERS b/url/OWNERS
new file mode 100644
index 0000000..06fefbf
--- /dev/null
+++ b/url/OWNERS
@@ -0,0 +1 @@
+brettw@chromium.org
diff --git a/url/googleurl.gyp b/url/googleurl.gyp
new file mode 100644
index 0000000..2f6f89a
--- /dev/null
+++ b/url/googleurl.gyp
@@ -0,0 +1,109 @@
+# Copyright (c) 2012 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# TODO(mark): Upstream this file to googleurl.
+{
+ 'variables': {
+ 'chromium_code': 1,
+ },
+ 'targets': [
+ {
+ 'target_name': 'googleurl',
+ 'type': '<(component)',
+ 'dependencies': [
+ '../../base/base.gyp:base',
+ '../../third_party/icu/icu.gyp:icudata',
+ '../../third_party/icu/icu.gyp:icui18n',
+ '../../third_party/icu/icu.gyp:icuuc',
+ ],
+ 'sources': [
+ '../../googleurl/src/gurl.cc',
+ '../../googleurl/src/gurl.h',
+ '../../googleurl/src/url_canon.h',
+ '../../googleurl/src/url_canon_etc.cc',
+ '../../googleurl/src/url_canon_fileurl.cc',
+ '../../googleurl/src/url_canon_filesystemurl.cc',
+ '../../googleurl/src/url_canon_host.cc',
+ '../../googleurl/src/url_canon_icu.cc',
+ '../../googleurl/src/url_canon_icu.h',
+ '../../googleurl/src/url_canon_internal.cc',
+ '../../googleurl/src/url_canon_internal.h',
+ '../../googleurl/src/url_canon_internal_file.h',
+ '../../googleurl/src/url_canon_ip.cc',
+ '../../googleurl/src/url_canon_ip.h',
+ '../../googleurl/src/url_canon_mailtourl.cc',
+ '../../googleurl/src/url_canon_path.cc',
+ '../../googleurl/src/url_canon_pathurl.cc',
+ '../../googleurl/src/url_canon_query.cc',
+ '../../googleurl/src/url_canon_relative.cc',
+ '../../googleurl/src/url_canon_stdstring.h',
+ '../../googleurl/src/url_canon_stdurl.cc',
+ '../../googleurl/src/url_file.h',
+ '../../googleurl/src/url_parse.cc',
+ '../../googleurl/src/url_parse.h',
+ '../../googleurl/src/url_parse_file.cc',
+ '../../googleurl/src/url_parse_internal.h',
+ '../../googleurl/src/url_util.cc',
+ '../../googleurl/src/url_util.h',
+ ],
+ 'direct_dependent_settings': {
+ 'include_dirs': [
+ '../..',
+ ],
+ },
+ 'defines': [
+ 'FULL_FILESYSTEM_URL_SUPPORT=1',
+ ],
+ 'conditions': [
+ ['component=="shared_library"', {
+ 'defines': [
+ 'GURL_DLL',
+ 'GURL_IMPLEMENTATION=1',
+ ],
+ 'direct_dependent_settings': {
+ 'defines': [
+ 'GURL_DLL',
+ ],
+ },
+ }],
+ ],
+ # TODO(jschuh): crbug.com/167187 fix size_t to int truncations.
+ 'msvs_disabled_warnings': [4267, ],
+ },
+ {
+ 'target_name': 'googleurl_unittests',
+ 'type': 'executable',
+ 'dependencies': [
+ 'googleurl',
+ '../../base/base.gyp:base_i18n',
+ '../../base/base.gyp:run_all_unittests',
+ '../../testing/gtest.gyp:gtest',
+ '../../third_party/icu/icu.gyp:icuuc',
+ ],
+ 'sources': [
+ '../../googleurl/src/gurl_unittest.cc',
+ '../../googleurl/src/url_canon_unittest.cc',
+ '../../googleurl/src/url_parse_unittest.cc',
+ '../../googleurl/src/url_test_utils.h',
+ '../../googleurl/src/url_util_unittest.cc',
+ ],
+ 'defines': [
+ 'FULL_FILESYSTEM_URL_SUPPORT=1',
+ ],
+ 'conditions': [
+ ['os_posix==1 and OS!="mac" and OS!="ios"', {
+ 'conditions': [
+ ['linux_use_tcmalloc==1', {
+ 'dependencies': [
+ '../../base/allocator/allocator.gyp:allocator',
+ ],
+ }],
+ ],
+ }],
+ ],
+ # TODO(jschuh): crbug.com/167187 fix size_t to int truncations.
+ 'msvs_disabled_warnings': [4267, ],
+ },
+ ],
+}
diff --git a/url/gurl.cc b/url/gurl.cc
new file mode 100644
index 0000000..05f65da
--- /dev/null
+++ b/url/gurl.cc
@@ -0,0 +1,529 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <pthread.h>
+#endif
+
+#include <algorithm>
+#include <ostream>
+
+#include "googleurl/src/gurl.h"
+
+#include "base/logging.h"
+#include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_util.h"
+
+namespace {
+
+// Shared initialization helper, templated so it works for both narrow and
+// wide input strings. Canonicalizes |input_spec| into |*canonical| and records
+// the parse of the canonical spec in |*parsed|. Returns the success value
+// reported by url_util::Canonicalize().
+template<typename STR>
+bool InitCanonical(const STR& input_spec,
+                   std::string* canonical,
+                   url_parse::Parsed* parsed) {
+  const int input_len = static_cast<int>(input_spec.length());
+
+  // Pre-size the destination to the input length plus a little slack, so a
+  // few escaped characters do not force a reallocation.
+  canonical->reserve(input_spec.size() + 32);
+
+  url_canon::StdStringCanonOutput canon_output(canonical);
+  const bool canonicalized_ok = url_util::Canonicalize(
+      input_spec.data(), input_len, NULL, &canon_output, parsed);
+
+  // The string must not be used until the output has been completed.
+  canon_output.Complete();
+  return canonicalized_ok;
+}
+
+static std::string* empty_string = NULL;
+static GURL* empty_gurl = NULL;
+
+#ifdef WIN32
+
+// Returns a static reference to an empty string for returning a reference
+// when there is no underlying string.
+//
+// The string is allocated on first use and this file never frees it. If
+// several threads reach the first call together, each allocates a candidate
+// and the compare-exchange below decides which one is published.
+const std::string& EmptyStringForGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ if (!empty_string) {
+ // Create the string. Be careful that we don't break in the case that this
+ // is being called from multiple threads. Statics are not threadsafe.
+ std::string* new_empty_string = new std::string;
+ if (InterlockedCompareExchangePointer(
+ reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
+ // The old value was non-NULL, so no replacement was done. Another
+ // thread did the initialization out from under us.
+ // Our candidate was not published, so free it here.
+ delete new_empty_string;
+ }
+ }
+ return *empty_string;
+}
+
+#else
+
+static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
+static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
+
+// Initialization routine passed to pthread_once() by EmptyStringForGURL().
+// Allocates the shared empty string; this file never frees it.
+void EmptyStringForGURLOnce(void) {
+ empty_string = new std::string;
+}
+
+// Returns a reference to a shared empty string, for returning a reference
+// when there is no underlying string. The string is created on first use by
+// running EmptyStringForGURLOnce() through pthread_once().
+const std::string& EmptyStringForGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ pthread_once(&empty_string_once, EmptyStringForGURLOnce);
+ return *empty_string;
+}
+
+#endif // WIN32
+
+} // namespace
+
+// Creates an empty, invalid URL with no inner URL.
+GURL::GURL() : is_valid_(false), inner_url_(NULL) {
+}
+
+// Copy constructor. Duplicates the spec, the validity flag and the parsed
+// components of |other| without re-parsing. When |other| has an inner URL it
+// is deep-copied, so each GURL owns its own inner instance.
+GURL::GURL(const GURL& other)
+    : spec_(other.spec_),
+      is_valid_(other.is_valid_),
+      parsed_(other.parsed_),
+      inner_url_(other.inner_url_ ? new GURL(*other.inner_url_) : NULL) {
+  // Valid filesystem urls should always have an inner_url_.
+  DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
+}
+
+// Builds a GURL by canonicalizing the narrow string |url_string|; is_valid_
+// records whether canonicalization succeeded. A valid filesystem: URL also
+// gets an inner GURL.
+GURL::GURL(const std::string& url_string) : inner_url_(NULL) {
+ is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
+ if (is_valid_ && SchemeIsFileSystem()) {
+ // The inner URL is handed the outer canonical spec together with the
+ // inner parsed structure, and is marked valid.
+ inner_url_ =
+ new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
+ }
+}
+
+// Builds a GURL by canonicalizing the wide string |url_string|; is_valid_
+// records whether canonicalization succeeded. A valid filesystem: URL also
+// gets an inner GURL.
+GURL::GURL(const string16& url_string) : inner_url_(NULL) {
+ is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
+ if (is_valid_ && SchemeIsFileSystem()) {
+ // The inner URL is handed the outer canonical spec together with the
+ // inner parsed structure, and is marked valid.
+ inner_url_ =
+ new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
+ }
+}
+
+// Constructor for URLs that have already been parsed and canonicalized. The
+// spec, parsed structure and validity flag are stored as given, without
+// re-canonicalizing. A valid filesystem: URL also gets an inner GURL. In
+// builds without NDEBUG the supplied data is cross-checked against a fresh
+// canonicalization of the same spec.
+GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
+ const url_parse::Parsed& parsed, bool is_valid)
+ : spec_(canonical_spec, canonical_spec_len),
+ is_valid_(is_valid),
+ parsed_(parsed),
+ inner_url_(NULL) {
+ if (is_valid_ && SchemeIsFileSystem()) {
+ inner_url_ =
+ new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true);
+ }
+
+#ifndef NDEBUG
+ // For testing purposes, check that the parsed canonical URL is identical to
+ // what we would have produced. Skip the check for invalid URLs: they have no
+ // meaning and we can't always canonicalize them reproducibly.
+ if (is_valid_) {
+ url_parse::Component scheme;
+ // Run the check when the spec is not a filesystem: URL at all, or when the
+ // scheme found in the spec starts where |parsed| says the scheme starts.
+ if (!url_util::FindAndCompareScheme(canonical_spec, canonical_spec_len,
+ "filesystem", &scheme) ||
+ scheme.begin == parsed.scheme.begin) {
+ // We can't do this check on the inner_url of a filesystem URL, as
+ // canonical_spec actually points to the start of the outer URL, so we'd
+ // end up with infinite recursion in this constructor.
+ GURL test_url(spec_);
+
+ DCHECK(test_url.is_valid_ == is_valid_);
+ DCHECK(test_url.spec_ == spec_);
+
+ DCHECK(test_url.parsed_.scheme == parsed_.scheme);
+ DCHECK(test_url.parsed_.username == parsed_.username);
+ DCHECK(test_url.parsed_.password == parsed_.password);
+ DCHECK(test_url.parsed_.host == parsed_.host);
+ DCHECK(test_url.parsed_.port == parsed_.port);
+ DCHECK(test_url.parsed_.path == parsed_.path);
+ DCHECK(test_url.parsed_.query == parsed_.query);
+ DCHECK(test_url.parsed_.ref == parsed_.ref);
+ }
+ }
+#endif
+}
+
+// Frees the inner URL owned by this object (a no-op when it is NULL).
+GURL::~GURL() {
+ delete inner_url_;
+}
+
+// Copy assignment. Replaces this URL's spec, validity flag and parsed
+// components with those of |other| and deep-copies |other|'s inner URL.
+//
+// The replacement inner URL is built before the current one is deleted.
+// Previously the old inner URL was deleted first, so self-assignment dropped
+// the inner URL of a filesystem: URL, and assigning from this object's own
+// inner URL read |other| after it had been destroyed.
+GURL& GURL::operator=(const GURL& other) {
+  if (this == &other)
+    return *this;
+
+  // Copy everything needed from |other| while it is guaranteed to be alive.
+  GURL* new_inner_url = other.inner_url_ ? new GURL(*other.inner_url_) : NULL;
+  spec_ = other.spec_;
+  is_valid_ = other.is_valid_;
+  parsed_ = other.parsed_;
+
+  // |other| may be owned by the old inner URL, so it must not be touched
+  // after this delete.
+  delete inner_url_;
+  inner_url_ = new_inner_url;
+
+  // Valid filesystem urls should always have an inner_url_.
+  DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
+  return *this;
+}
+
+// Returns the canonical spec for valid URLs, and the (empty) spec for empty
+// URLs. Asking for the spec of a non-empty invalid URL asserts in debug
+// builds and yields the shared empty string.
+const std::string& GURL::spec() const {
+  const bool invalid_and_nonempty = !is_valid_ && !spec_.empty();
+  if (!invalid_and_nonempty)
+    return spec_;
+
+  DCHECK(false) << "Trying to get the spec of an invalid URL!";
+  return EmptyStringForGURL();
+}
+
+// Resolves the narrow string |relative| against this URL. Forwards to
+// ResolveWithCharsetConverter() with a NULL charset converter.
+GURL GURL::Resolve(const std::string& relative) const {
+ return ResolveWithCharsetConverter(relative, NULL);
+}
+// Resolves the wide string |relative| against this URL. Forwards to
+// ResolveWithCharsetConverter() with a NULL charset converter.
+GURL GURL::Resolve(const string16& relative) const {
+ return ResolveWithCharsetConverter(relative, NULL);
+}
+
+// Note: the string16 overload below duplicates this logic (it's inconvenient
+// to use a template here).
+//
+// Resolves the narrow string |relative| against this URL, passing
+// |charset_converter| through to url_util::ResolveRelative(). Returns an
+// empty GURL when this URL is invalid or when resolution fails.
+GURL GURL::ResolveWithCharsetConverter(
+    const std::string& relative,
+    url_canon::CharsetConverter* charset_converter) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL resolved;
+
+  // Give the output the size of the base spec plus some slack, so escaping a
+  // few characters does not force a reallocation.
+  resolved.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput canon_output(&resolved.spec_);
+
+  const int base_len = static_cast<int>(spec_.length());
+  const int relative_len = static_cast<int>(relative.length());
+  const bool resolved_ok = url_util::ResolveRelative(
+      spec_.data(), base_len, parsed_,
+      relative.data(), relative_len,
+      charset_converter, &canon_output, &resolved.parsed_);
+  if (!resolved_ok) {
+    // Error resolving, return an empty URL.
+    return GURL();
+  }
+
+  canon_output.Complete();
+  resolved.is_valid_ = true;
+  if (resolved.SchemeIsFileSystem()) {
+    resolved.inner_url_ =
+        new GURL(resolved.spec_.data(), resolved.parsed_.Length(),
+                 *resolved.parsed_.inner_parsed(), true);
+  }
+  return resolved;
+}
+
+// Note: the std::string overload above duplicates this logic (it's
+// inconvenient to use a template here).
+//
+// Resolves the wide string |relative| against this URL, passing
+// |charset_converter| through to url_util::ResolveRelative(). Returns an
+// empty GURL when this URL is invalid or when resolution fails.
+GURL GURL::ResolveWithCharsetConverter(
+    const string16& relative,
+    url_canon::CharsetConverter* charset_converter) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL resolved;
+
+  // Give the output the size of the base spec plus some slack, so escaping a
+  // few characters does not force a reallocation.
+  resolved.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput canon_output(&resolved.spec_);
+
+  const int base_len = static_cast<int>(spec_.length());
+  const int relative_len = static_cast<int>(relative.length());
+  const bool resolved_ok = url_util::ResolveRelative(
+      spec_.data(), base_len, parsed_,
+      relative.data(), relative_len,
+      charset_converter, &canon_output, &resolved.parsed_);
+  if (!resolved_ok) {
+    // Error resolving, return an empty URL.
+    return GURL();
+  }
+
+  canon_output.Complete();
+  resolved.is_valid_ = true;
+  if (resolved.SchemeIsFileSystem()) {
+    resolved.inner_url_ =
+        new GURL(resolved.spec_.data(), resolved.parsed_.Length(),
+                 *resolved.parsed_.inner_parsed(), true);
+  }
+  return resolved;
+}
+
+// Note: code duplicated below (it's inconvenient to use a template here).
+//
+// Returns a copy of this URL with the components named in |replacements|
+// (8-bit version) replaced and the result re-canonicalized. Returns an empty
+// GURL when this URL is invalid. The result's validity is whatever
+// url_util::ReplaceComponents() reports.
+GURL GURL::ReplaceComponents(
+    const url_canon::Replacements<char>& replacements) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  result.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput output(&result.spec_);
+
+  result.is_valid_ = url_util::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
+
+  output.Complete();
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    // The inner URL must be built from the result's spec, which is the one
+    // result.parsed_ indexes into. Using this object's pre-replacement spec
+    // here would pair the new lengths with the old text.
+    result.inner_url_ =
+        new GURL(result.spec_.data(), result.parsed_.Length(),
+                 *result.parsed_.inner_parsed(), true);
+  }
+  return result;
+}
+
+// Note: code duplicated above (it's inconvenient to use a template here).
+//
+// Returns a copy of this URL with the components named in |replacements|
+// (16-bit version) replaced and the result re-canonicalized. Returns an empty
+// GURL when this URL is invalid. The result's validity is whatever
+// url_util::ReplaceComponents() reports.
+GURL GURL::ReplaceComponents(
+    const url_canon::Replacements<char16>& replacements) const {
+  // Not allowed for invalid URLs.
+  if (!is_valid_)
+    return GURL();
+
+  GURL result;
+
+  // Reserve enough room in the output for the input, plus some extra so that
+  // we have room if we have to escape a few things without reallocating.
+  result.spec_.reserve(spec_.size() + 32);
+  url_canon::StdStringCanonOutput output(&result.spec_);
+
+  result.is_valid_ = url_util::ReplaceComponents(
+      spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
+      NULL, &output, &result.parsed_);
+
+  output.Complete();
+  if (result.is_valid_ && result.SchemeIsFileSystem()) {
+    // The inner URL must be built from the result's spec, which is the one
+    // result.parsed_ indexes into. Using this object's pre-replacement spec
+    // here would pair the new lengths with the old text.
+    result.inner_url_ =
+        new GURL(result.spec_.data(), result.parsed_.Length(),
+                 *result.parsed_.inner_parsed(), true);
+  }
+  return result;
+}
+
+// Returns a GURL built from this one with the username, password, path, query
+// and ref cleared via ReplaceComponents(). For filesystem: URLs the origin of
+// the inner URL is returned instead. Invalid or nonstandard URLs yield the
+// empty URL.
+GURL GURL::GetOrigin() const {
+ // This doesn't make sense for invalid or nonstandard URLs, so return
+ // the empty URL
+ if (!is_valid_ || !IsStandard())
+ return GURL();
+
+ // NOTE(review): inner_url_ is dereferenced without a NULL check. This relies
+ // on valid filesystem URLs always having an inner URL, which the copy
+ // constructor and operator= only DCHECK.
+ if (SchemeIsFileSystem())
+ return inner_url_->GetOrigin();
+
+ url_canon::Replacements<char> replacements;
+ replacements.ClearUsername();
+ replacements.ClearPassword();
+ replacements.ClearPath();
+ replacements.ClearQuery();
+ replacements.ClearRef();
+
+ return ReplaceComponents(replacements);
+}
+
+// Returns a copy of this URL whose path is a single slash and whose query and
+// ref are removed. The copy is edited in place rather than re-canonicalized.
+// Invalid or nonstandard URLs yield the empty URL.
+GURL GURL::GetWithEmptyPath() const {
+ // This doesn't make sense for invalid or nonstandard URLs, so return
+ // the empty URL.
+ if (!is_valid_ || !IsStandard())
+ return GURL();
+
+ // We could optimize this since we know that the URL is canonical, and we are
+ // appending a canonical path, so avoiding re-parsing.
+ GURL other(*this);
+ if (parsed_.path.len == 0)
+ return other;
+
+ // Clear everything after the path.
+ other.parsed_.query.reset();
+ other.parsed_.ref.reset();
+
+ // Set the path. The path length is known to be non-zero here (checked
+ // above), so we can just set its first character and truncate the spec
+ // right after it.
+ other.spec_[other.parsed_.path.begin] = '/';
+ other.parsed_.path.len = 1;
+ other.spec_.resize(other.parsed_.path.begin + 1);
+ return other;
+}
+
+// Returns the result of url_util::IsStandard() for this URL's scheme
+// component.
+bool GURL::IsStandard() const {
+ return url_util::IsStandard(spec_.data(), parsed_.scheme);
+}
+
+// Compares this URL's scheme against |lower_ascii_scheme| in place, without
+// copying the scheme out of the spec. When the URL has no scheme, the result
+// is true only for a NULL |lower_ascii_scheme|.
+bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
+  const url_parse::Component& scheme = parsed_.scheme;
+  if (scheme.len <= 0)
+    return lower_ascii_scheme == NULL;
+
+  const char* scheme_begin = spec_.data() + scheme.begin;
+  const char* scheme_end = spec_.data() + scheme.end();
+  return url_util::LowerCaseEqualsASCII(scheme_begin, scheme_end,
+                                        lower_ascii_scheme);
+}
+
+// Returns the port parsed from the spec, or url_parse::PORT_UNSPECIFIED when
+// the URL has no non-empty port component.
+int GURL::IntPort() const {
+  if (!parsed_.port.is_nonempty())
+    return url_parse::PORT_UNSPECIFIED;
+  return url_parse::ParsePort(spec_.data(), parsed_.port);
+}
+
+// Returns the explicit port when there is one. Otherwise, for standard URLs,
+// returns the default port that url_canon reports for the scheme. A
+// nonstandard URL without a port yields url_parse::PORT_UNSPECIFIED.
+int GURL::EffectiveIntPort() const {
+  const int explicit_port = IntPort();
+  if (explicit_port != url_parse::PORT_UNSPECIFIED || !IsStandard())
+    return explicit_port;
+
+  const char* scheme_start = spec_.data() + parsed_.scheme.begin;
+  return url_canon::DefaultPortForScheme(scheme_start, parsed_.scheme.len);
+}
+
+// Returns, as a string, the component that url_parse::ExtractFileName()
+// locates within this URL's path.
+std::string GURL::ExtractFileName() const {
+ url_parse::Component file_component;
+ url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
+ return ComponentString(file_component);
+}
+
+// Returns the part of the spec that starts at the path and stops before the
+// '#' of the ref. When there is no ref, this is the path plus the query if
+// one is present.
+std::string GURL::PathForRequest() const {
+ DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
+ if (parsed_.ref.len >= 0) {
+ // Clip off the reference when it exists. The reference starts after the #
+ // sign, so we have to subtract one to also remove it.
+ return std::string(spec_, parsed_.path.begin,
+ parsed_.ref.begin - parsed_.path.begin - 1);
+ }
+ // Compute the actual path length, rather than depending on the spec's
+ // terminator. If we're an inner_url, our spec continues on into our outer
+ // url's path/query/ref.
+ int path_len = parsed_.path.len;
+ // With a query present, extend the length to the end of the query.
+ if (parsed_.query.is_valid())
+ path_len = parsed_.query.end() - parsed_.path.begin;
+
+ return std::string(spec_, parsed_.path.begin, path_len);
+}
+
+// Returns the host, with the enclosing square brackets removed when the host
+// looks like an IPv6 literal ("[...]").
+std::string GURL::HostNoBrackets() const {
+  url_parse::Component host(parsed_.host);
+
+  const bool is_bracketed = host.len >= 2 &&
+                            spec_[host.begin] == '[' &&
+                            spec_[host.end() - 1] == ']';
+  if (is_bracketed) {
+    // Drop one character from each end.
+    ++host.begin;
+    host.len -= 2;
+  }
+  return ComponentString(host);
+}
+
+// Returns whether url_canon::CanonicalizeIPAddress() classifies this URL's
+// host as an IP address. Invalid or empty URLs return false.
+bool GURL::HostIsIPAddress() const {
+ if (!is_valid_ || spec_.empty())
+ return false;
+
+ // The canonicalized text is written to a scratch buffer and discarded; only
+ // host_info is consulted.
+ url_canon::RawCanonOutputT<char, 128> ignored_output;
+ url_canon::CanonHostInfo host_info;
+ url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
+ &ignored_output, &host_info);
+ return host_info.IsIPAddress();
+}
+
+#ifdef WIN32
+
+// Returns a reference to a shared, default-constructed GURL. The object is
+// allocated on first use and this file never frees it. If several threads
+// reach the first call together, each allocates a candidate and the
+// compare-exchange below decides which one is published.
+const GURL& GURL::EmptyGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ if (!empty_gurl) {
+ // Create the GURL. Be careful that we don't break in the case that this
+ // is being called from multiple threads.
+ GURL* new_empty_gurl = new GURL;
+ if (InterlockedCompareExchangePointer(
+ reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
+ // The old value was non-NULL, so no replacement was done. Another
+ // thread did the initialization out from under us.
+ // Our candidate was not published, so free it here.
+ delete new_empty_gurl;
+ }
+ }
+ return *empty_gurl;
+}
+
+#else
+
+// Initialization routine passed to pthread_once() by GURL::EmptyGURL().
+// Allocates the shared empty GURL; this file never frees it.
+void EmptyGURLOnce(void) {
+ empty_gurl = new GURL;
+}
+
+// Returns a reference to a shared, default-constructed GURL. The object is
+// created on first use by running EmptyGURLOnce() through pthread_once().
+const GURL& GURL::EmptyGURL() {
+ // Avoid static object construction/destruction on startup/shutdown.
+ pthread_once(&empty_gurl_once, EmptyGURLOnce);
+ return *empty_gurl;
+}
+
+#endif // WIN32
+
+// Returns true when this URL's host is |lower_ascii_domain| or a subdomain of
+// it. |lower_ascii_domain| is a lower-case ASCII domain of |domain_len|
+// characters; it may start and/or end with a dot. filesystem: URLs are
+// answered by their inner URL. Returns false for invalid URLs, for URLs
+// without a host, and for a NULL or empty domain.
+bool GURL::DomainIs(const char* lower_ascii_domain,
+                    int domain_len) const {
+  // Return false if this URL is not valid or the domain is missing or empty.
+  // A non-positive length must be rejected here because the code below reads
+  // lower_ascii_domain[domain_len - 1].
+  if (!is_valid_ || !lower_ascii_domain || domain_len <= 0)
+    return false;
+
+  // FileSystem URLs have empty parsed_.host, so check this first.
+  if (SchemeIsFileSystem() && inner_url_)
+    return inner_url_->DomainIs(lower_ascii_domain, domain_len);
+
+  if (!parsed_.host.is_nonempty())
+    return false;
+
+  // Check whether the host name ends with a dot. If so, treat it the same as
+  // a host without the trailing dot, unless the domain being compared also
+  // ends with a dot.
+  const char* last_pos = spec_.data() + parsed_.host.end() - 1;
+  int host_len = parsed_.host.len;
+  if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
+    last_pos--;
+    host_len--;
+  }
+
+  // Return false if the host is shorter than the domain.
+  if (host_len < domain_len)
+    return false;
+
+  // Compare the tail of the host, |domain_len| characters long, with the
+  // domain.
+  const char* start_pos = spec_.data() + parsed_.host.begin +
+                          host_len - domain_len;
+
+  if (!url_util::LowerCaseEqualsASCII(start_pos,
+                                      last_pos + 1,
+                                      lower_ascii_domain,
+                                      lower_ascii_domain + domain_len))
+    return false;
+
+  // Make sure the match starts on a label boundary: when the domain does not
+  // begin with a dot and the host is longer than the domain, the character
+  // just before the match must be a dot. For example www.google.com has
+  // domain "google.com" but www.iamnotgoogle.com does not.
+  if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
+      '.' != *(start_pos - 1))
+    return false;
+
+  return true;
+}
+
+// Exchanges the complete state of this URL with |*other|: the inner URL
+// pointer, the parsed components, the validity flag and the spec.
+void GURL::Swap(GURL* other) {
+  std::swap(inner_url_, other->inner_url_);
+  std::swap(parsed_, other->parsed_);
+  std::swap(is_valid_, other->is_valid_);
+  std::swap(spec_, other->spec_);
+}
+
+// Stream insertion for logging and debugging. Writes the spec even when the
+// URL is invalid.
+std::ostream& operator<<(std::ostream& out, const GURL& url) {
+  out << url.possibly_invalid_spec();
+  return out;
+}
diff --git a/url/gurl.h b/url/gurl.h
new file mode 100644
index 0000000..76c595d
--- /dev/null
+++ b/url/gurl.h
@@ -0,0 +1,392 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_GURL_H__
+#define GOOGLEURL_SRC_GURL_H__
+
+#include <iosfwd>
+#include <string>
+
+#include "base/string16.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_common.h"
+#include "googleurl/src/url_parse.h"
+
+class GURL {
+ public:
+ typedef url_canon::StdStringReplacements<std::string> Replacements;
+ typedef url_canon::StdStringReplacements<string16> ReplacementsW;
+
+ // Creates an empty, invalid URL.
+ GURL_API GURL();
+
+ // Copy construction is relatively inexpensive, with most of the time going
+ // to reallocating the string. It does not re-parse.
+ GURL_API GURL(const GURL& other);
+
+ // The narrow version requires the input be UTF-8. Invalid UTF-8 input will
+ // result in an invalid URL.
+ //
+ // The wide version should also take an encoding parameter so we know how to
+ // encode the query parameters. It is probably sufficient for the narrow
+ // version to assume the query parameter encoding should be the same as the
+ // input encoding.
+ GURL_API explicit GURL(const std::string& url_string
+ /*, output_param_encoding*/);
+ GURL_API explicit GURL(const string16& url_string
+ /*, output_param_encoding*/);
+
+ // Constructor for URLs that have already been parsed and canonicalized. This
+ // is used for conversions from KURL, for example. The caller must supply all
+ // information associated with the URL, which must be correct and consistent.
+ GURL_API GURL(const char* canonical_spec, size_t canonical_spec_len,
+ const url_parse::Parsed& parsed, bool is_valid);
+
+ GURL_API ~GURL();
+
+ GURL_API GURL& operator=(const GURL& other);
+
+ // Returns true when this object represents a valid parsed URL. When not
+ // valid, other functions will still succeed, but you will not get canonical
+ // data out in the format you may be expecting. Instead, we keep something
+ // "reasonable looking" so that the user can see how it's busted if
+ // displayed to them.
+ bool is_valid() const {
+ return is_valid_;
+ }
+
+ // Returns true if the URL is zero-length. Note that empty URLs are also
+ // invalid, and is_valid() will return false for them. This is provided
+ // because some users may want to treat the empty case differently.
+ bool is_empty() const {
+ return spec_.empty();
+ }
+
+ // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8,
+ // if the URL is valid. If the URL is not valid, this will assert and return
+ // the empty string (for safety in release builds, to keep them from being
+ // misused which might be a security problem).
+ //
+ // The URL will be ASCII except the reference fragment, which may be UTF-8.
+ // It is guaranteed to be valid UTF-8.
+ //
+ // The exception is for empty() URLs (which are !is_valid()) but this will
+ // return the empty string without asserting.
+ //
+ // Use possibly_invalid_spec() below to get the unusable spec of an invalid
+ // URL. This separation is designed to prevent errors that may cause security
+ // problems that could result from the mistaken use of an invalid URL.
+ GURL_API const std::string& spec() const;
+
+ // Returns the potentially invalid spec for the URL. This spec MUST NOT be
+ // modified or sent over the network. It is designed to be displayed in error
+ // messages to the user, as the appearance of the spec may explain the error.
+ // If the spec is valid, the valid spec will be returned.
+ //
+ // The returned string is guaranteed to be valid UTF-8.
+ const std::string& possibly_invalid_spec() const {
+ return spec_;
+ }
+
+ // Getter for the raw parsed structure. This allows callers to locate parts
+ // of the URL within the spec themselves. Most callers should consider using
+ // the individual component getters below.
+ //
+ // The returned parsed structure will reference into the raw spec, which may
+ // or may not be valid. If you are using this to index into the spec, BE
+ // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you
+ // don't do anything "important" with invalid specs.
+ const url_parse::Parsed& parsed_for_possibly_invalid_spec() const {
+ return parsed_;
+ }
+
+ // Defiant equality operator!
+ bool operator==(const GURL& other) const {
+ return spec_ == other.spec_;
+ }
+ bool operator!=(const GURL& other) const {
+ return spec_ != other.spec_;
+ }
+
+ // Allows GURL to be used as a key in STL containers (for example, a
+ // std::set or std::map).
+ bool operator<(const GURL& other) const {
+ return spec_ < other.spec_;
+ }
+
+ // Resolves a URL that's possibly relative to this object's URL, and returns
+ // it. Absolute URLs are also handled according to the rules of URLs on web
+ // pages.
+ //
+ // It may be impossible to resolve the URLs properly. If the input is not
+ // "standard" (IsStandard() == false) and the input looks relative, we
+ // can't resolve it. In these cases, the result will be an empty, invalid
+ // GURL.
+ //
+ // The result may also be a nonempty, invalid URL if the input has some kind
+ // of encoding error. In these cases, we will try to construct a "good" URL
+ // that may have meaning to the user, but it will be marked invalid.
+ //
+ // It is an error to resolve a URL relative to an invalid URL. The result
+ // will be the empty URL.
+ GURL_API GURL Resolve(const std::string& relative) const;
+ GURL_API GURL Resolve(const string16& relative) const;
+
+ // Like Resolve() above but takes a character set encoder which will be used
+ // for any query text specified in the input. The charset converter parameter
+ // may be NULL, in which case it will be treated as UTF-8.
+ //
+ // TODO(brettw): These should be replaced with versions that take something
+ // more friendly than a raw CharsetConverter (maybe like an ICU character set
+ // name).
+ GURL_API GURL ResolveWithCharsetConverter(
+ const std::string& relative,
+ url_canon::CharsetConverter* charset_converter) const;
+ GURL_API GURL ResolveWithCharsetConverter(
+ const string16& relative,
+ url_canon::CharsetConverter* charset_converter) const;
+
+ // Creates a new GURL by replacing the current URL's components with the
+ // supplied versions. See the Replacements class in url_canon.h for more.
+ //
+ // These are not particularly quick, so avoid doing mutations when possible.
+ // Prefer the 8-bit version when possible.
+ //
+ // It is an error to replace components of an invalid URL. The result will
+ // be the empty URL.
+ //
+ // Note that we use the more general url_canon::Replacements type to give
+ // callers extra flexibility rather than our override.
+ GURL_API GURL ReplaceComponents(
+ const url_canon::Replacements<char>& replacements) const;
+ GURL_API GURL ReplaceComponents(
+ const url_canon::Replacements<char16>& replacements) const;
+
+ // A helper function that is equivalent to replacing the path with a slash
+ // and clearing out everything after that. We sometimes need to know just the
+ // scheme and the authority. If this URL is not a standard URL (it doesn't
+ // have the regular authority and path sections), then the result will be
+ // an empty, invalid GURL. Note that this *does* work for file: URLs, which
+ // some callers may want to filter out before calling this.
+ //
+ // It is an error to get an empty path on an invalid URL. The result
+ // will be the empty URL.
+ GURL_API GURL GetWithEmptyPath() const;
+
+ // A helper function to return a GURL containing just the scheme, host,
+ // and port from a URL. Equivalent to clearing any username and password,
+ // replacing the path with a slash, and clearing everything after that. If
+ // this URL is not a standard URL, then the result will be an empty,
+ // invalid GURL. If the URL has neither username nor password, this
+ // degenerates to GetWithEmptyPath().
+ //
+ // It is an error to get the origin of an invalid URL. The result
+ // will be the empty URL.
+ GURL_API GURL GetOrigin() const;
+
+ // Returns true if the scheme for the current URL is a known "standard"
+ // scheme. Standard schemes have an authority and a path section. This
+ // includes file: and filesystem:, which some callers may want to filter out
+ // explicitly by calling SchemeIsFile[System].
+ GURL_API bool IsStandard() const;
+
+ // Returns true if the given parameter (should be lower-case ASCII to match
+ // the canonicalized scheme) is the scheme for this URL. This call is more
+ // efficient than getting the scheme and comparing it because no copies or
+ // object constructions are done.
+ GURL_API bool SchemeIs(const char* lower_ascii_scheme) const;
+
+ // We often need to know if this is a file URL. File URLs are "standard", but
+ // are often treated separately by some programs.
+ //
+ // Returns the result of SchemeIs("file").
+ bool SchemeIsFile() const {
+ return SchemeIs("file");
+ }
+
+ // FileSystem URLs need to be treated differently in some cases.
+ //
+ // Returns the result of SchemeIs("filesystem").
+ bool SchemeIsFileSystem() const {
+ return SchemeIs("filesystem");
+ }
+
+ // Returns true if the scheme indicates a secure connection: "https", "wss",
+ // or a filesystem: URL whose inner URL is itself secure.
+ bool SchemeIsSecure() const {
+   if (SchemeIs("https") || SchemeIs("wss"))
+     return true;
+   if (!SchemeIsFileSystem())
+     return false;
+   // Only a nested (inner) URL can make a filesystem: URL secure.
+   const GURL* nested = inner_url();
+   return nested && nested->SchemeIsSecure();
+ }
+
+ // Returns true if the hostname is an IP address. Note: this function isn't
+ // as cheap as a simple getter because it re-parses the hostname to verify.
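+ // Bracketed IPv6 literals such as "[2001:db8::1]" are also expected to be
+ // reported as IP addresses (see GURLTest.IPAddress); an earlier note here
+ // said that only IPv4 addresses were identified (bug 822685).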
+ GURL_API bool HostIsIPAddress() const;
+
+ // Getters for various components of the URL. The returned string will be
+ // empty if the component is empty or is not present. Each getter returns a
+ // new std::string built from the matching range of the spec (see
+ // ComponentString), so use the has_*() predicates to tell "empty" apart from
+ // "absent".
+ std::string scheme() const { // Not including the colon. See also SchemeIs.
+ return ComponentString(parsed_.scheme);
+ }
+ std::string username() const {
+ return ComponentString(parsed_.username);
+ }
+ std::string password() const {
+ return ComponentString(parsed_.password);
+ }
+ // Note that this may be a hostname, an IPv4 address, or an IPv6 literal
+ // surrounded by square brackets, like "[2001:db8::1]". To exclude these
+ // brackets, use HostNoBrackets() below.
+ std::string host() const {
+ return ComponentString(parsed_.host);
+ }
+ std::string port() const { // Port text; empty if none. See also IntPort.
+ return ComponentString(parsed_.port);
+ }
+ std::string path() const { // Including first slash following host
+ return ComponentString(parsed_.path);
+ }
+ std::string query() const { // Stuff following '?'
+ return ComponentString(parsed_.query);
+ }
+ std::string ref() const { // Stuff following '#'
+ return ComponentString(parsed_.ref);
+ }
+
+ // Existence querying. These functions will return true if the corresponding
+ // URL component exists in this URL. Note that existence is different than
+ // being nonempty. http://www.google.com/? has a query that just happens to
+ // be empty, and has_query() will return true.
+ //
+ // A component counts as present when its parsed length is >= 0; the host is
+ // the one exception and requires a length > 0.
+ bool has_scheme() const {
+ return parsed_.scheme.len >= 0;
+ }
+ bool has_username() const {
+ return parsed_.username.len >= 0;
+ }
+ bool has_password() const {
+ return parsed_.password.len >= 0;
+ }
+ bool has_host() const {
+ // Note that hosts are special, absence of host means length 0.
+ return parsed_.host.len > 0;
+ }
+ bool has_port() const {
+ return parsed_.port.len >= 0;
+ }
+ bool has_path() const {
+ // Note that "http://www.google.com/" has a path, the path is "/". This can
+ // return false only for invalid or nonstandard URLs.
+ return parsed_.path.len >= 0;
+ }
+ bool has_query() const {
+ return parsed_.query.len >= 0;
+ }
+ bool has_ref() const {
+ return parsed_.ref.len >= 0;
+ }
+
+ // Returns a parsed version of the port. Can also be any of the special
+ // values defined in Parsed for ExtractPort.
+ GURL_API int IntPort() const;
+
+ // Returns the port number of the url, or the default port number.
+ // If the scheme has no concept of port (or unknown default) returns
+ // PORT_UNSPECIFIED.
+ GURL_API int EffectiveIntPort() const;
+
+ // Extracts the filename portion of the path and returns it. The filename
+ // is everything after the last slash in the path. This may be empty.
+ GURL_API std::string ExtractFileName() const;
+
+ // Returns the path that should be sent to the server. This is the path,
+ // parameter, and query portions of the URL. It is guaranteed to be ASCII.
+ GURL_API std::string PathForRequest() const;
+
+ // Returns the host, excluding the square brackets surrounding IPv6 address
+ // literals. This can be useful for passing to getaddrinfo().
+ GURL_API std::string HostNoBrackets() const;
+
+ // Returns true if this URL's host matches or is in the same domain as
+ // the given input string. For example if this URL was "www.google.com",
+ // this would match "com", "google.com", and "www.google.com"
+ // (the input domain should be lower-case ASCII to match the canonicalized
+ // host). This call is more efficient than getting the host and checking
+ // whether the host has the specific domain or not because no copies or
+ // object constructions are done.
+ //
+ // This overload takes an explicit domain_len, so lower_ascii_domain is not
+ // guaranteed (or required) to be terminated with a NULL character.
+ GURL_API bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+
+ // Convenience overload for a NULL-terminated domain string: the length is
+ // measured with strlen() and the call is forwarded to the two-argument
+ // DomainIs().
+ bool DomainIs(const char* lower_ascii_domain) const {
+   const size_t domain_len = strlen(lower_ascii_domain);
+   return DomainIs(lower_ascii_domain, static_cast<int>(domain_len));
+ }
+
+ // Swaps the contents of this GURL object with the argument without doing
+ // any memory allocations.
+ GURL_API void Swap(GURL* other);
+
+ // Returns a reference to a singleton empty GURL. This object is for callers
+ // who return references but don't have anything to return in some cases.
+ // This function may be called from any thread.
+ GURL_API static const GURL& EmptyGURL();
+
+ // Returns the inner URL of a nested URL [currently only non-null for
+ // filesystem: URLs]. This hands back the inner_url_ member as-is, so callers
+ // must check for NULL before dereferencing.
+ const GURL* inner_url() const {
+ return inner_url_;
+ }
+
+ private:
+ // Copies the range of spec_ described by |comp| into a new string. A
+ // component whose length is zero or negative (empty or absent) produces an
+ // empty string.
+ std::string ComponentString(const url_parse::Component& comp) const {
+   return comp.len > 0 ? std::string(spec_, comp.begin, comp.len)
+                       : std::string();
+ }
+
+ // The actual text of the URL, in canonical ASCII form.
+ std::string spec_;
+
+ // Set when the given URL is valid. Otherwise, we may still have a spec and
+ // components, but they may not identify valid resources (for example, an
+ // invalid port number, invalid characters in the scheme, etc.).
+ bool is_valid_;
+
+ // Identified components of the canonical spec.
+ url_parse::Parsed parsed_;
+
+ // Used for nested schemes [currently only filesystem:].
+ GURL* inner_url_;
+
+ // TODO bug 684583: Add encoding for query params.
+};
+
+// Stream operator so GURL can be used in assertion statements.
+GURL_API std::ostream& operator<<(std::ostream& out, const GURL& url);
+
+#endif // GOOGLEURL_SRC_GURL_H__
diff --git a/url/gurl_test_main.cc b/url/gurl_test_main.cc
new file mode 100644
index 0000000..43f19df
--- /dev/null
+++ b/url/gurl_test_main.cc
@@ -0,0 +1,102 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "build/build_config.h"
+
+#if defined(OS_WIN)
+#include <windows.h>
+#endif
+
+#include <string>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "unicode/putil.h"
+#include "unicode/udata.h"
+
+ // Ways the ICU data can be supplied. ICU_UTIL_DATA_IMPL selects one of them
+ // and is compared against these values by InitializeICU().
+ //
+ // ICU_UTIL_DATA_FILE is defined explicitly as 0. It was previously left
+ // undefined and only worked because the preprocessor evaluates unknown
+ // identifiers in #if expressions as 0; spelling the value out keeps the
+ // same selection while making the intent visible.
+ #define ICU_UTIL_DATA_FILE 0
+ #define ICU_UTIL_DATA_SHARED 1
+ #define ICU_UTIL_DATA_STATIC 2
+
+ // Pick a per-platform default unless the build already chose one.
+ #ifndef ICU_UTIL_DATA_IMPL
+
+ #if defined(OS_WIN)
+ #define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_SHARED
+ #elif defined(OS_MACOSX)
+ #define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_STATIC
+ #elif defined(OS_LINUX)
+ #define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_FILE
+ #endif
+
+ #endif // ICU_UTIL_DATA_IMPL
+
+#if defined(OS_WIN)
+#define ICU_UTIL_DATA_SYMBOL "icudt" U_ICU_VERSION_SHORT "_dat"
+#define ICU_UTIL_DATA_SHARED_MODULE_NAME "icudt" U_ICU_VERSION_SHORT ".dll"
+#endif
+
+ // Makes the ICU data available using the strategy selected at compile time
+ // by ICU_UTIL_DATA_IMPL. Returns true on success and false if the data
+ // module/symbol could not be found or ICU reported anything other than
+ // U_ZERO_ERROR.
+ //
+ // NOTE(review): there is no #else branch, so if ICU_UTIL_DATA_IMPL matches
+ // none of the three cases the function body contains no return statement.
+ bool InitializeICU() {
+ #if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_SHARED)
+ // We expect to find the ICU data module alongside the current module.
+ // Because the module name is ASCII-only, "A" API should be safe.
+ // Chrome's copy of ICU dropped a version number XX from icudt dll,
+ // but 3rd-party embedders may need it. So, we try both.
+ HMODULE module = LoadLibraryA("icudt.dll");
+ if (!module) {
+ module = LoadLibraryA(ICU_UTIL_DATA_SHARED_MODULE_NAME);
+ if (!module)
+ return false;
+ }
+
+ // Look up the exported data symbol and hand its address to ICU.
+ FARPROC addr = GetProcAddress(module, ICU_UTIL_DATA_SYMBOL);
+ if (!addr)
+ return false;
+
+ UErrorCode err = U_ZERO_ERROR;
+ udata_setCommonData(reinterpret_cast<void*>(addr), &err);
+ return err == U_ZERO_ERROR;
+ #elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC)
+ // Mac bundles the ICU data in.
+ return true;
+ #elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE)
+ // We expect to find the ICU data module alongside the current module.
+ u_setDataDirectory(".");
+ // Only look for the packaged data file;
+ // the default behavior is to look for individual files.
+ UErrorCode err = U_ZERO_ERROR;
+ udata_setFileAccess(UDATA_ONLY_PACKAGES, &err);
+ return err == U_ZERO_ERROR;
+ #endif
+ }
+
+ // Test runner entry point: lets gtest consume its command-line flags, sets
+ // up the ICU data, then runs every registered test and returns gtest's
+ // result as the process exit code.
+ int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+
+ // NOTE(review): the result is ignored, so the tests still run when the ICU
+ // data could not be set up.
+ InitializeICU();
+
+ return RUN_ALL_TESTS();
+ }
diff --git a/url/gurl_unittest.cc b/url/gurl_unittest.cc
new file mode 100644
index 0000000..670d2df
--- /dev/null
+++ b/url/gurl_unittest.cc
@@ -0,0 +1,488 @@
+// Copyright 2007 Google Inc. All Rights Reserved.
+// Author: brettw@google.com (Brett Wilson)
+
+#include "googleurl/src/gurl.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_test_utils.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// Some implementations of base/basictypes.h may define ARRAYSIZE.
+// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
+// which is in our version of basictypes.h.
+#ifndef ARRAYSIZE
+#define ARRAYSIZE ARRAYSIZE_UNSAFE
+#endif
+
+using url_test_utils::WStringToUTF16;
+using url_test_utils::ConvertUTF8ToUTF16;
+
+namespace {
+
+ // Applies one Replacements setter (|func|) to |replacements| using |str|.
+ // A NULL |str| leaves the component untouched. An empty |str| is passed
+ // with a default-constructed Component, and a non-empty |str| with a
+ // Component whose length is strlen(str).
+ template<typename CHAR>
+ void SetupReplacement(void (url_canon::Replacements<CHAR>::*func)(const CHAR*,
+                       const url_parse::Component&),
+                       url_canon::Replacements<CHAR>* replacements,
+                       const CHAR* str) {
+   if (!str)
+     return;
+
+   url_parse::Component range;
+   if (str[0])
+     range.len = static_cast<int>(strlen(str));
+   (replacements->*func)(str, range);
+ }
+
+ // Helper for the GURLTest.Types test: builds a GURL from |src| and returns
+ // its spec, whether or not the URL turned out to be valid.
+ std::string TypesTestCase(const char* src) {
+   return GURL(src).possibly_invalid_spec();
+ }
+
+} // namespace
+
+ // Different types of URLs should be handled differently by url_util, and
+ // handed off to different canonicalizers.
+ TEST(GURLTest, Types) {
+ // URLs with unknown schemes should be treated as path URLs, even when they
+ // have things like "://". Note that the host-looking part keeps its case.
+ EXPECT_EQ("something:///HOSTNAME.com/",
+ TypesTestCase("something:///HOSTNAME.com/"));
+
+ // In the reverse, known schemes should always trigger standard URL handling.
+ // Whatever number of slashes follows the colon, the result has "//", a
+ // lower-cased host and a "/" path.
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
+ EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
+
+ #ifdef WIN32
+ // URLs that look like absolute Windows drive specs.
+ EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
+ EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
+ EXPECT_EQ("file://server/foo.txt", TypesTestCase("//server/foo.txt"));
+ #endif
+ }
+
+ // Test the basic creation and querying of components in a GURL. We assume
+ // the parser is already tested and works, so we are mostly interested if the
+ // object does the right thing with the results.
+ TEST(GURLTest, Components) {
+ GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
+ EXPECT_TRUE(url.is_valid());
+ EXPECT_TRUE(url.SchemeIs("http"));
+ EXPECT_FALSE(url.SchemeIsFile());
+
+ // This is the narrow version of the URL, which should match the wide input.
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec());
+
+ // Each accessor returns its piece of the spec without the surrounding
+ // delimiters (':', '@', '?', '#'); the path keeps its leading slash.
+ EXPECT_EQ("http", url.scheme());
+ EXPECT_EQ("user", url.username());
+ EXPECT_EQ("pass", url.password());
+ EXPECT_EQ("google.com", url.host());
+ EXPECT_EQ("99", url.port());
+ EXPECT_EQ(99, url.IntPort());
+ EXPECT_EQ("/foo;bar", url.path());
+ EXPECT_EQ("q=a", url.query());
+ EXPECT_EQ("ref", url.ref());
+ }
+
+ // A default-constructed GURL is invalid, has an empty spec, and every
+ // component accessor returns an empty string (PORT_UNSPECIFIED for IntPort).
+ TEST(GURLTest, Empty) {
+ GURL url;
+ EXPECT_FALSE(url.is_valid());
+ EXPECT_EQ("", url.spec());
+
+ EXPECT_EQ("", url.scheme());
+ EXPECT_EQ("", url.username());
+ EXPECT_EQ("", url.password());
+ EXPECT_EQ("", url.host());
+ EXPECT_EQ("", url.port());
+ EXPECT_EQ(url_parse::PORT_UNSPECIFIED, url.IntPort());
+ EXPECT_EQ("", url.path());
+ EXPECT_EQ("", url.query());
+ EXPECT_EQ("", url.ref());
+ }
+
+ // Copy-constructing a GURL must carry over validity, the spec and all parsed
+ // components, for both a valid URL and a default-constructed (invalid) one.
+ TEST(GURLTest, Copy) {
+ GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref"));
+
+ GURL url2(url);
+ EXPECT_TRUE(url2.is_valid());
+
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("http", url2.scheme());
+ EXPECT_EQ("user", url2.username());
+ EXPECT_EQ("pass", url2.password());
+ EXPECT_EQ("google.com", url2.host());
+ EXPECT_EQ("99", url2.port());
+ EXPECT_EQ(99, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ // Copying of invalid URL should be invalid
+ GURL invalid;
+ GURL invalid2(invalid);
+ EXPECT_FALSE(invalid2.is_valid());
+ EXPECT_EQ("", invalid2.spec());
+ EXPECT_EQ("", invalid2.scheme());
+ EXPECT_EQ("", invalid2.username());
+ EXPECT_EQ("", invalid2.password());
+ EXPECT_EQ("", invalid2.host());
+ EXPECT_EQ("", invalid2.port());
+ EXPECT_EQ(url_parse::PORT_UNSPECIFIED, invalid2.IntPort());
+ EXPECT_EQ("", invalid2.path());
+ EXPECT_EQ("", invalid2.query());
+ EXPECT_EQ("", invalid2.ref());
+ }
+
+ // Copying a filesystem: URL must also copy its nested inner URL.
+ TEST(GURLTest, CopyFileSystem) {
+ GURL url(WStringToUTF16(L"filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref"));
+
+ GURL url2(url);
+ EXPECT_TRUE(url2.is_valid());
+
+ // The outer URL reports no username/password/host/port of its own; its path
+ // is what follows the inner URL's "/t" segment.
+ EXPECT_EQ("filesystem:https://user:pass@google.com:99/t/foo;bar?q=a#ref", url2.spec());
+ EXPECT_EQ("filesystem", url2.scheme());
+ EXPECT_EQ("", url2.username());
+ EXPECT_EQ("", url2.password());
+ EXPECT_EQ("", url2.host());
+ EXPECT_EQ("", url2.port());
+ EXPECT_EQ(url_parse::PORT_UNSPECIFIED, url2.IntPort());
+ EXPECT_EQ("/foo;bar", url2.path());
+ EXPECT_EQ("q=a", url2.query());
+ EXPECT_EQ("ref", url2.ref());
+
+ // The inner URL carries the authority and the "/t" path, but no query or
+ // ref. ASSERT (not EXPECT) so a NULL inner URL is never dereferenced.
+ const GURL* inner = url2.inner_url();
+ ASSERT_TRUE(inner);
+ EXPECT_EQ("https", inner->scheme());
+ EXPECT_EQ("user", inner->username());
+ EXPECT_EQ("pass", inner->password());
+ EXPECT_EQ("google.com", inner->host());
+ EXPECT_EQ("99", inner->port());
+ EXPECT_EQ(99, inner->IntPort());
+ EXPECT_EQ("/t", inner->path());
+ EXPECT_EQ("", inner->query());
+ EXPECT_EQ("", inner->ref());
+ }
+
+ // Given an invalid URL, we should still get most of the components.
+ TEST(GURLTest, Invalid) {
+ // The non-numeric port "foo" is what makes this URL invalid.
+ GURL url("http:google.com:foo");
+ EXPECT_FALSE(url.is_valid());
+ EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec());
+
+ EXPECT_EQ("http", url.scheme());
+ EXPECT_EQ("", url.username());
+ EXPECT_EQ("", url.password());
+ EXPECT_EQ("google.com", url.host());
+ // The port text is still returned, but it parses to PORT_INVALID.
+ EXPECT_EQ("foo", url.port());
+ EXPECT_EQ(url_parse::PORT_INVALID, url.IntPort());
+ EXPECT_EQ("/", url.path());
+ EXPECT_EQ("", url.query());
+ EXPECT_EQ("", url.ref());
+ }
+
+ // Checks that GURL::Resolve() is wired up correctly for both the 8-bit and
+ // the 16-bit overloads, including that an inner URL exists exactly when the
+ // result is a filesystem: URL.
+ TEST(GURLTest, Resolve) {
+   // The tricky cases for relative URL resolving are tested in the
+   // canonicalizer unit test. Here, we just test that the GURL integration
+   // works properly.
+   struct ResolveCase {
+     const char* base;
+     const char* relative;
+     bool expected_valid;
+     const char* expected;
+   } resolve_cases[] = {
+     {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"},
+     {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"},
+     {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"},
+     {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"},
+     {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"},
+     // A non-standard base can be replaced with a standard absolute URL.
+     {"data:blahblah", "http://google.com/", true, "http://google.com/"},
+     {"data:blahblah", "http:google.com", true, "http://google.com/"},
+     // Filesystem URLs have different paths to test.
+     {"filesystem:http://www.google.com/type/", "foo.html", true, "filesystem:http://www.google.com/type/foo.html"},
+     {"filesystem:http://www.google.com/type/", "../foo.html", true, "filesystem:http://www.google.com/type/foo.html"},
+   };
+
+   for (size_t i = 0; i < ARRAYSIZE(resolve_cases); i++) {
+     // 8-bit code path.
+     GURL input(resolve_cases[i].base);
+     GURL output = input.Resolve(resolve_cases[i].relative);
+     EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()) << i;
+     EXPECT_EQ(resolve_cases[i].expected, output.spec()) << i;
+     EXPECT_EQ(output.SchemeIsFileSystem(), output.inner_url() != NULL) << i;
+
+     // Wide code path. Resolve against the UTF-16-constructed base (inputw)
+     // so that both the wide constructor and the wide Resolve() overload are
+     // exercised; the original resolved against the 8-bit |input| and left
+     // |inputw| unused.
+     GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base));
+     GURL outputw =
+         inputw.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative));
+     EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()) << i;
+     EXPECT_EQ(resolve_cases[i].expected, outputw.spec()) << i;
+     EXPECT_EQ(outputw.SchemeIsFileSystem(), outputw.inner_url() != NULL) << i;
+   }
+ }
+
+ // GetOrigin() keeps only scheme, host and port. Non-standard URLs give an
+ // empty result, and filesystem: URLs give the origin of their inner URL.
+ TEST(GURLTest, GetOrigin) {
+   struct TestCase {
+     const char* input;
+     const char* expected;
+   } origin_cases[] = {
+     {"http://www.google.com", "http://www.google.com/"},
+     {"javascript:window.alert(\"hello,world\");", ""},
+     {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/"},
+     {"http://user@www.google.com", "http://www.google.com/"},
+     {"http://:pass@www.google.com", "http://www.google.com/"},
+     {"http://:@www.google.com", "http://www.google.com/"},
+     {"filesystem:http://www.google.com/temp/foo?q#b", "http://www.google.com/"},
+     {"filesystem:http://user:pass@google.com:21/blah#baz", "http://google.com:21/"},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(origin_cases); idx++) {
+     const TestCase& cur = origin_cases[idx];
+     GURL source(cur.input);
+     GURL result = source.GetOrigin();
+     EXPECT_EQ(cur.expected, result.spec());
+   }
+ }
+
+ // GetWithEmptyPath() drops everything after the authority (after the inner
+ // URL for filesystem: URLs) and yields an empty URL for non-standard input.
+ TEST(GURLTest, GetWithEmptyPath) {
+   struct TestCase {
+     const char* input;
+     const char* expected;
+   } path_cases[] = {
+     {"http://www.google.com", "http://www.google.com/"},
+     {"javascript:window.alert(\"hello, world\");", ""},
+     {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"},
+     {"filesystem:http://www.google.com/temporary/bar.html?baz=22", "filesystem:http://www.google.com/temporary/"},
+     {"filesystem:file:///temporary/bar.html?baz=22", "filesystem:file:///temporary/"},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(path_cases); idx++) {
+     const TestCase& cur = path_cases[idx];
+     GURL source(cur.input);
+     GURL stripped = source.GetWithEmptyPath();
+     EXPECT_EQ(cur.expected, stripped.spec());
+   }
+ }
+
+ // Verifies that ReplaceComponents() hands off to the canonicalizer matching
+ // the scheme. In the table, NULL means "leave the component alone" and ""
+ // is passed on as an empty replacement.
+ TEST(GURLTest, Replacements) {
+   // The url canonicalizer replacement test will handle most of these case.
+   // The most important thing to do here is to check that the proper
+   // canonicalizer gets called based on the scheme of the input.
+   struct ReplaceCase {
+     const char* base;
+     const char* scheme;
+     const char* username;
+     const char* password;
+     const char* host;
+     const char* port;
+     const char* path;
+     const char* query;
+     const char* ref;
+     const char* expected;
+   } replace_cases[] = {
+     {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"},
+     {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"},
+     {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"},
+ #ifdef WIN32
+     {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"},
+ #endif
+     {"filesystem:http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "filesystem:http://www.google.com/foo/"},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(replace_cases); idx++) {
+     const ReplaceCase& test_case = replace_cases[idx];
+     GURL base_url(test_case.base);
+
+     // Queue one replacement per component, in scheme-to-ref order.
+     GURL::Replacements replacements;
+     SetupReplacement(&GURL::Replacements::SetScheme, &replacements,
+                      test_case.scheme);
+     SetupReplacement(&GURL::Replacements::SetUsername, &replacements,
+                      test_case.username);
+     SetupReplacement(&GURL::Replacements::SetPassword, &replacements,
+                      test_case.password);
+     SetupReplacement(&GURL::Replacements::SetHost, &replacements,
+                      test_case.host);
+     SetupReplacement(&GURL::Replacements::SetPort, &replacements,
+                      test_case.port);
+     SetupReplacement(&GURL::Replacements::SetPath, &replacements,
+                      test_case.path);
+     SetupReplacement(&GURL::Replacements::SetQuery, &replacements,
+                      test_case.query);
+     SetupReplacement(&GURL::Replacements::SetRef, &replacements,
+                      test_case.ref);
+     GURL replaced = base_url.ReplaceComponents(replacements);
+
+     EXPECT_EQ(test_case.expected, replaced.spec());
+     EXPECT_EQ(replaced.SchemeIsFileSystem(), replaced.inner_url() != NULL);
+   }
+ }
+
+ // PathForRequest() returns the path plus query, never the ref. For
+ // filesystem: URLs the inner URL's request path is checked as well; a NULL
+ // inner_expected means no inner URL is expected.
+ TEST(GURLTest, PathForRequest) {
+   struct TestCase {
+     const char* input;
+     const char* expected;
+     const char* inner_expected;
+   } request_cases[] = {
+     {"http://www.google.com", "/", NULL},
+     {"http://www.google.com/", "/", NULL},
+     {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22", NULL},
+     {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html", NULL},
+     {"http://www.google.com/foo/bar.html?query#ref", "/foo/bar.html?query", NULL},
+     {"filesystem:http://www.google.com/temporary/foo/bar.html?query#ref", "/foo/bar.html?query", "/temporary"},
+     {"filesystem:http://www.google.com/temporary/foo/bar.html?query", "/foo/bar.html?query", "/temporary"},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(request_cases); idx++) {
+     const TestCase& cur = request_cases[idx];
+     GURL url(cur.input);
+     EXPECT_EQ(cur.expected, url.PathForRequest());
+
+     const GURL* inner = url.inner_url();
+     EXPECT_EQ(cur.inner_expected == NULL, inner == NULL);
+     if (inner && cur.inner_expected)
+       EXPECT_EQ(cur.inner_expected, inner->PathForRequest());
+   }
+ }
+
+ // EffectiveIntPort() returns the explicit port when there is one, the
+ // scheme's default otherwise, and PORT_UNSPECIFIED for schemes that have no
+ // port (file, data, filesystem).
+ TEST(GURLTest, EffectiveIntPort) {
+   struct PortTest {
+     const char* spec;
+     int expected_int_port;
+   } port_tests[] = {
+     // http
+     {"http://www.google.com/", 80},
+     {"http://www.google.com:80/", 80},
+     {"http://www.google.com:443/", 443},
+
+     // https
+     {"https://www.google.com/", 443},
+     {"https://www.google.com:443/", 443},
+     {"https://www.google.com:80/", 80},
+
+     // ftp
+     {"ftp://www.google.com/", 21},
+     {"ftp://www.google.com:21/", 21},
+     {"ftp://www.google.com:80/", 80},
+
+     // gopher
+     {"gopher://www.google.com/", 70},
+     {"gopher://www.google.com:70/", 70},
+     {"gopher://www.google.com:80/", 80},
+
+     // file - no port
+     {"file://www.google.com/", url_parse::PORT_UNSPECIFIED},
+     {"file://www.google.com:443/", url_parse::PORT_UNSPECIFIED},
+
+     // data - no port
+     {"data:www.google.com:90", url_parse::PORT_UNSPECIFIED},
+     {"data:www.google.com", url_parse::PORT_UNSPECIFIED},
+
+     // filesystem - no port
+     {"filesystem:http://www.google.com:90/t/foo", url_parse::PORT_UNSPECIFIED},
+     {"filesystem:file:///t/foo", url_parse::PORT_UNSPECIFIED},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(port_tests); idx++) {
+     const PortTest& cur = port_tests[idx];
+     GURL url(cur.spec);
+     EXPECT_EQ(cur.expected_int_port, url.EffectiveIntPort());
+   }
+ }
+
+ // HostIsIPAddress() should accept dotted IPv4 hosts and bracketed IPv6
+ // literals, and reject hostnames, malformed addresses, unbracketed IPv6 and
+ // non-URL input.
+ TEST(GURLTest, IPAddress) {
+   struct IPTest {
+     const char* spec;
+     bool expected_ip;
+   } ip_tests[] = {
+     {"http://www.google.com/", false},
+     {"http://192.168.9.1/", true},
+     {"http://192.168.9.1.2/", false},
+     {"http://192.168.m.1/", false},
+     {"http://2001:db8::1/", false},
+     {"http://[2001:db8::1]/", true},
+     {"", false},
+     {"some random input!", false},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(ip_tests); idx++) {
+     const IPTest& cur = ip_tests[idx];
+     GURL url(cur.spec);
+     EXPECT_EQ(cur.expected_ip, url.HostIsIPAddress());
+   }
+ }
+
+ // host() keeps the square brackets of an IPv6 literal; HostNoBrackets()
+ // removes them only when both the opening and closing bracket are present.
+ TEST(GURLTest, HostNoBrackets) {
+   struct TestCase {
+     const char* input;
+     const char* expected_host;
+     const char* expected_plainhost;
+   } host_cases[] = {
+     {"http://www.google.com", "www.google.com", "www.google.com"},
+     {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"},
+     {"http://[::]/", "[::]", "::"},
+
+     // Don't require a valid URL, but don't crash either.
+     {"http://[]/", "[]", ""},
+     {"http://[x]/", "[x]", "x"},
+     {"http://[x/", "[x", "[x"},
+     {"http://x]/", "x]", "x]"},
+     {"http://[/", "[", "["},
+     {"http://]/", "]", "]"},
+     {"", "", ""},
+   };
+
+   for (size_t idx = 0; idx < ARRAYSIZE(host_cases); idx++) {
+     const TestCase& cur = host_cases[idx];
+     GURL url(cur.input);
+     EXPECT_EQ(cur.expected_host, url.host());
+     EXPECT_EQ(cur.expected_plainhost, url.HostNoBrackets());
+   }
+ }
+
+ // Exercises GURL::DomainIs() with exact hosts, subdomains, trailing dots,
+ // near-miss hosts and filesystem: URLs.
+ TEST(GURLTest, DomainIs) {
+ const char google_domain[] = "google.com";
+
+ // A subdomain and the exact host both match.
+ GURL url_1("http://www.google.com:99/foo");
+ EXPECT_TRUE(url_1.DomainIs(google_domain));
+
+ GURL url_2("http://google.com:99/foo");
+ EXPECT_TRUE(url_2.DomainIs(google_domain));
+
+ // A trailing dot on the host is accepted when the domain has none...
+ GURL url_3("http://google.com./foo");
+ EXPECT_TRUE(url_3.DomainIs(google_domain));
+
+ // ...but a trailing dot on the domain requires one on the host.
+ GURL url_4("http://google.com/foo");
+ EXPECT_FALSE(url_4.DomainIs("google.com."));
+
+ GURL url_5("http://google.com./foo");
+ EXPECT_TRUE(url_5.DomainIs("google.com."));
+
+ GURL url_6("http://www.google.com./foo");
+ EXPECT_TRUE(url_6.DomainIs(".com."));
+
+ // Unrelated hosts, hosts that merely contain the domain, and hosts that
+ // only share a textual suffix with it must not match.
+ GURL url_7("http://www.balabala.com/foo");
+ EXPECT_FALSE(url_7.DomainIs(google_domain));
+
+ GURL url_8("http://www.google.com.cn/foo");
+ EXPECT_FALSE(url_8.DomainIs(google_domain));
+
+ GURL url_9("http://www.iamnotgoogle.com/foo");
+ EXPECT_FALSE(url_9.DomainIs(google_domain));
+
+ GURL url_10("http://www.iamnotgoogle.com../foo");
+ EXPECT_FALSE(url_10.DomainIs(".com"));
+
+ // filesystem: URLs - presumably the inner URL's host is what is compared,
+ // since the outer URL has no host of its own.
+ GURL url_11("filesystem:http://www.google.com:99/foo/");
+ EXPECT_TRUE(url_11.DomainIs(google_domain));
+
+ GURL url_12("filesystem:http://www.iamnotgoogle.com/foo/");
+ EXPECT_FALSE(url_12.DomainIs(google_domain));
+ }
+
+ // Newlines should be stripped from inputs.
+ TEST(GURLTest, Newlines) {
+ // Constructor. Leading/trailing whitespace is trimmed and the embedded tab,
+ // CR and LF characters are removed from the middle of the URL.
+ GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n ");
+ EXPECT_EQ("http://www.google.com/asdf", url_1.spec());
+
+ // Relative path resolver. The same cleanup applies to the relative input.
+ GURL url_2 = url_1.Resolve(" \n /fo\to\r ");
+ EXPECT_EQ("http://www.google.com/foo", url_2.spec());
+
+ // Note that newlines are NOT stripped from ReplaceComponents.
+ }
+
+ // IsStandard() depends on the scheme being a known standard one, not on
+ // whether the input contains "//".
+ TEST(GURLTest, IsStandard) {
+ // Known scheme without "//": still standard.
+ GURL a("http:foo/bar");
+ EXPECT_TRUE(a.IsStandard());
+
+ // Unknown scheme: not standard, with or without "//".
+ GURL b("foo:bar/baz");
+ EXPECT_FALSE(b.IsStandard());
+
+ GURL c("foo://bar/baz");
+ EXPECT_FALSE(c.IsStandard());
+ }
diff --git a/url/url_canon.h b/url/url_canon.h
new file mode 100644
index 0000000..00ae715
--- /dev/null
+++ b/url/url_canon.h
@@ -0,0 +1,912 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#ifndef GOOGLEURL_SRC_URL_CANON_H__
+#define GOOGLEURL_SRC_URL_CANON_H__
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
+#include "googleurl/src/url_parse.h"
+
+namespace url_canon {
+
+// Canonicalizer output -------------------------------------------------------
+
+// Base class for the canonicalizer output, this maintains a buffer and
+// supports simple resizing and append operations on it.
+//
+// It is VERY IMPORTANT that no virtual function calls be made on the common
+// code path. We only have two virtual function calls, the destructor and a
+// resize function that is called when the existing buffer is not big enough.
+// The derived class is then in charge of setting up our buffer which we will
+// manage.
+template<typename T>
+class CanonOutputT {
+ public:
+ CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
+ }
+ virtual ~CanonOutputT() {
+ }
+
+ // Implemented to resize the buffer. This function should update the buffer
+ // pointer to point to the new buffer, and any old data up to |cur_len_| in
+ // the buffer must be copied over.
+ //
+ // The new size |sz| must be larger than buffer_len_.
+ virtual void Resize(int sz) = 0;
+
+ // Returns the character at the given position. The input offset must be in
+ // the valid range. The return type is T rather than char so that wide
+ // instantiations (such as CanonOutputT<char16>) are not truncated.
+ inline T at(int offset) const {
+ return buffer_[offset];
+ }
+
+ // Sets the character at the given position. The given position MUST be less
+ // than the length(). |ch| is converted to the buffer's character type.
+ inline void set(int offset, int ch) {
+ buffer_[offset] = static_cast<T>(ch);
+ }
+
+ // Returns the number of characters currently in the buffer.
+ inline int length() const {
+ return cur_len_;
+ }
+
+ // Returns the current capacity of the buffer. The length() is the number of
+ // characters that have been declared to be written, but the capacity() is
+ // the number that can be written without reallocation. If the caller must
+ // write many characters at once, it can make sure there is enough capacity,
+ // write the data, then use set_length() to declare the new length().
+ int capacity() const {
+ return buffer_len_;
+ }
+
+ // Called by the user of this class to get the output. The output will NOT
+ // be NULL-terminated. Call length() to get the length.
+ const T* data() const {
+ return buffer_;
+ }
+ T* data() {
+ return buffer_;
+ }
+
+ // Shortens the URL to the new length. Used for "backing up" when processing
+ // relative paths. This can also be used if an external function writes a lot
+ // of data to the buffer (when using the "Raw" version below) beyond the end,
+ // to declare the new length.
+ //
+ // This MUST NOT be used to expand the size of the buffer beyond capacity().
+ void set_length(int new_len) {
+ cur_len_ = new_len;
+ }
+
+ // This is the most performance critical function, since it is called for
+ // every character.
+ void push_back(T ch) {
+ // In VC2005, putting this common case first speeds up execution
+ // dramatically because this branch is predicted as taken.
+ if (cur_len_ < buffer_len_) {
+ buffer_[cur_len_] = ch;
+ cur_len_++;
+ return;
+ }
+
+ // Grow the buffer to hold at least one more item. Hopefully we won't have
+ // to do this very often.
+ if (!Grow(1))
+ return;
+
+ // Actually do the insertion.
+ buffer_[cur_len_] = ch;
+ cur_len_++;
+ }
+
+ // Appends the |str_len| characters starting at |str| to the output. If the
+ // buffer cannot be grown far enough, nothing is appended.
+ void Append(const T* str, int str_len) {
+ // As long as set_length() is used as documented, cur_len_ never exceeds
+ // buffer_len_, so this subtraction cannot overflow (unlike
+ // |cur_len_ + str_len| for a very large |str_len|).
+ const int available = buffer_len_ - cur_len_;
+ if (str_len > available) {
+ if (!Grow(str_len - available))
+ return;
+ }
+ for (int i = 0; i < str_len; i++)
+ buffer_[cur_len_ + i] = str[i];
+ cur_len_ += str_len;
+ }
+
+ protected:
+ // Grows the given buffer so that it can fit at least |min_additional|
+ // characters beyond the current capacity. Returns true if Resize() was
+ // called, false if that would need a buffer larger than the 2^30 cap.
+ bool Grow(int min_additional) {
+ static const int kMinBufferLen = 16;
+ int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
+ do {
+ if (new_len >= (1 << 30)) // Prevent overflow below.
+ return false;
+ new_len *= 2;
+ // Compare by subtraction: |buffer_len_ + min_additional| could overflow
+ // for a huge request, while |new_len - buffer_len_| cannot because
+ // new_len >= buffer_len_ >= 0 here.
+ } while (new_len - buffer_len_ < min_additional);
+ Resize(new_len);
+ return true;
+ }
+
+ // Storage provided and owned by the derived class (see Resize()).
+ T* buffer_;
+ // Capacity of |buffer_|, in characters.
+ int buffer_len_;
+
+ // Used characters in the buffer.
+ int cur_len_;
+};
+
+// Simple implementation of the CanonOutput using new[]. This class
+// also supports a static buffer so if it is allocated on the stack, most
+// URLs can be canonicalized with no heap allocations.
+template<typename T, int fixed_capacity = 1024>
+class RawCanonOutputT : public CanonOutputT<T> {
+ public:
+ // Starts out backed by the inline |fixed_buffer_|, so no heap allocation
+ // happens until Resize() is called.
+ RawCanonOutputT() : CanonOutputT<T>() {
+ this->buffer_len_ = fixed_capacity;
+ this->buffer_ = fixed_buffer_;
+ }
+ virtual ~RawCanonOutputT() {
+ // Only a buffer obtained from new[] in Resize() needs to be freed.
+ if (this->buffer_ != fixed_buffer_)
+ delete[] this->buffer_;
+ }
+
+ // Moves the contents into a freshly allocated array of |sz| elements,
+ // keeping at most |sz| of the characters written so far.
+ virtual void Resize(int sz) {
+ T* replacement = new T[sz];
+ int keep = this->cur_len_;
+ if (keep >= sz)
+ keep = sz;
+ memcpy(replacement, this->buffer_, sizeof(T) * keep);
+
+ // The inline buffer is part of this object and must never be deleted.
+ if (this->buffer_ != fixed_buffer_)
+ delete[] this->buffer_;
+ this->buffer_len_ = sz;
+ this->buffer_ = replacement;
+ }
+
+ protected:
+ // Inline storage used until the first Resize().
+ T fixed_buffer_[fixed_capacity];
+};
+
+// Normally, all canonicalization output is in narrow (char) characters. The
+// wide (char16) variants exist so the same templates can also be used
+// internally if a wide buffer is required.
+typedef CanonOutputT<char> CanonOutput;
+typedef CanonOutputT<char16> CanonOutputW;
+// RawCanonOutputT wrappers with the character type fixed (narrow and wide).
+template<int fixed_capacity>
+class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};
+template<int fixed_capacity>
+class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};
+
+// Character set converter ----------------------------------------------------
+//
+// Converts query strings into a custom encoding. The embedder can supply an
+// implementation of this class to interface with their own character set
+// conversion libraries.
+//
+// Embedders will want to see the unit test for the ICU version.
+
+class CharsetConverter {
+ public:
+ CharsetConverter() {}
+ virtual ~CharsetConverter() {}
+
+ // Converts the UTF-16 string |input| of length |input_len| to whatever
+ // output format the converter supports, appending the result to |output|.
+ // This is used only for the query encoding conversion, which does not fail.
+ // Instead, the converter should insert "invalid character" characters in
+ // the output for invalid sequences, and do the best it can.
+ //
+ // If the input contains a character not representable in the output
+ // character set, the converter should append the HTML entity sequence in
+ // decimal (such as "&#20320;"), with escaping of the ampersand, number
+ // sign, and semicolon (in the previous example it would be
+ // "%26%2320320%3B"). This rule is based on what IE does in this situation.
+ virtual void ConvertFromUTF16(const char16* input,
+ int input_len,
+ CanonOutput* output) = 0;
+};
+
+// Whitespace -----------------------------------------------------------------
+
+// Searches for whitespace that should be removed from the middle of URLs, and
+// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
+// are preserved, which is what most browsers do. A pointer to the output will
+// be returned, and the length of that output will be in |output_len|.
+//
+// This should be called before parsing if whitespace removal is desired (which
+// it normally is when you are canonicalizing).
+//
+// If no whitespace is removed, this function will not use the buffer and will
+// return a pointer to the input, to avoid the extra copy. If modification is
+// required, the given |buffer| will be used and the returned pointer will
+// point to the beginning of the buffer.
+//
+// Therefore, callers should not use the buffer, since it may actually be
+// empty; use the computed pointer and |*output_len| instead.
+GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,
+ CanonOutputT<char>* buffer,
+ int* output_len);
+GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,
+ CanonOutputT<char16>* buffer,
+ int* output_len);
+
+// IDN ------------------------------------------------------------------------
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must fall in the ASCII range, but will be encoded in UTF-16.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, returns false. The output in this case is undefined.
+GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);
+
+// Piece-by-piece canonicalizers ----------------------------------------------
+//
+// These individual canonicalizers append the canonicalized versions of the
+// corresponding URL component to the given output. The spec and the
+// previously-identified range of that component are the input. The range of
+// the canonicalized component will be written to the output component.
+//
+// These functions all append to the output so they can be chained. Make sure
+// the output is empty when you start.
+//
+// These functions return boolean values indicating success. On failure, they
+// will attempt to write something reasonable to the output so that, if
+// displayed to the user, they will recognise it as something that's messed up.
+// Nothing more should ever be done with these invalid URLs, however.
+
+// Scheme: Appends the scheme and colon to the URL. The output component will
+// indicate the range of characters up to but not including the colon.
+//
+// Canonical URLs always have a scheme. If the scheme is not present in the
+// input, this will just write the colon to indicate an empty scheme. Does not
+// append slashes which will be needed before any authority components for most
+// URLs.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizeScheme(const char* spec,
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme);
+GURL_API bool CanonicalizeScheme(const char16* spec,
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme);
+
+// User info: username/password. If present, this will add the delimiters so
+// the output will be "<username>:<password>@" or "<username>@". Empty
+// username/password pairs, or empty passwords, will get converted to
+// nonexistent in the canonical version.
+//
+// The components for the username and password refer to ranges in the
+// respective source strings. Usually, these will be the same string, which
+// is legal as long as the two components don't overlap.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizeUserInfo(const char* username_source,
+ const url_parse::Component& username,
+ const char* password_source,
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password);
+GURL_API bool CanonicalizeUserInfo(const char16* username_source,
+ const url_parse::Component& username,
+ const char16* password_source,
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password);
+
+
+// This structure holds detailed state exported from the IP/Host canonicalizers.
+// Additional fields may be added as callers require them.
+struct CanonHostInfo {
+ // Summarizes how the input was classified by the canonicalizer.
+ enum Family {
+ NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
+ // canonicalizer is concerned, it should be treated as a
+ // hostname.
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an
+ // IPv4 address where truncation occurred, or something
+ // containing the special characters :[] which did not parse
+ // as an IPv6 address. Never attempt to connect to this
+ // address, because it might actually succeed!
+ IPV4, // - Successfully canonicalized as an IPv4 address.
+ IPV6, // - Successfully canonicalized as an IPv6 address.
+ };
+
+ // Starts out as a NEUTRAL host with zero IPv4 components and a
+ // default-constructed |out_host|. |address| is left uninitialized.
+ CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}
+
+ // Convenience test: true when |family| is one of the two IP families.
+ bool IsIPAddress() const {
+ switch (family) {
+ case IPV4:
+ case IPV6:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ // Convenience function giving the number of bytes of |address| that are in
+ // use for the current |family|: 4 for IPV4, 16 for IPV6, otherwise 0.
+ int AddressLength() const {
+ if (family == IPV4)
+ return 4;
+ if (family == IPV6)
+ return 16;
+ return 0;
+ }
+
+ // Classification of the input; see Family above.
+ Family family;
+
+ // If |family| is IPV4, then this is the number of nonempty dot-separated
+ // components in the input text, from 1 to 4. If |family| is not IPV4,
+ // this value is undefined.
+ int num_ipv4_components;
+
+ // Location of host within the canonicalized output.
+ // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
+ // CanonicalizeHostVerbose() always sets it.
+ url_parse::Component out_host;
+
+ // |address| contains the parsed IP Address (if any) in its first
+ // AddressLength() bytes, in network order. If IsIPAddress() is false
+ // AddressLength() will return zero and the content of |address| is undefined.
+ unsigned char address[16];
+};
+
+
+// Host.
+//
+// The 8-bit version requires UTF-8 encoding. Use this version when you only
+// need to know whether canonicalization succeeded.
+GURL_API bool CanonicalizeHost(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ url_parse::Component* out_host);
+GURL_API bool CanonicalizeHost(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ url_parse::Component* out_host);
+
+// Extended version of CanonicalizeHost, which returns additional information.
+// Use this when you need to know whether the hostname was an IP address.
+// A successful return is indicated by host_info->family != BROKEN. See the
+// definition of CanonHostInfo above for details.
+GURL_API void CanonicalizeHostVerbose(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+GURL_API void CanonicalizeHostVerbose(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+
+
+// IP addresses.
+//
+// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
+// an IP address, it will canonicalize it as such, appending it to |output|.
+// Additional status information is returned via the |*host_info| parameter.
+// See the definition of CanonHostInfo above for details.
+//
+// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
+// the input is unescaped and name-prepped, etc. It should not normally be
+// necessary or wise to call this directly.
+GURL_API void CanonicalizeIPAddress(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+GURL_API void CanonicalizeIPAddress(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info);
+
+// Port: this function will add the colon for the port if a port is present.
+// The caller can pass url_parse::PORT_UNSPECIFIED as the
+// default_port_for_scheme argument if there is no default port.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool CanonicalizePort(const char* spec,
+ const url_parse::Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ url_parse::Component* out_port);
+GURL_API bool CanonicalizePort(const char16* spec,
+ const url_parse::Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ url_parse::Component* out_port);
+
+// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
+// if the scheme is unknown.
+GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);
+
+// Path. If the input does not begin in a slash (including if the input is
+// empty), we'll prepend a slash to the path to make it canonical.
+//
+// The 8-bit version assumes UTF-8 encoding, but does not verify the validity
+// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
+// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
+// an issue. Somebody giving us an 8-bit path is responsible for generating
+// the path that the server expects (we'll escape high-bit characters), so
+// if something is invalid, it's their problem.
+GURL_API bool CanonicalizePath(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+GURL_API bool CanonicalizePath(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+
+// Canonicalizes the input as a file path. This is like CanonicalizePath except
+// that it also handles Windows drive specs. For example, the path can begin
+// with "c|\" and it will get properly canonicalized to "C:/".
+// The string will be appended to |*output| and |*out_path| will be updated.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool FileCanonicalizePath(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+GURL_API bool FileCanonicalizePath(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+
+// Query: Prepends the ? if needed.
+//
+// The 8-bit version requires the input to be UTF-8 encoded. Incorrectly
+// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
+// "invalid character." This function cannot fail; we always just try to do
+// our best for crazy input here since web pages can set it themselves.
+//
+// This will convert the given input into the output encoding that the given
+// character set converter object provides. The converter will only be called
+// if necessary; for ASCII input, no conversions are necessary.
+//
+// The converter can be NULL. In this case, the output encoding will be UTF-8.
+GURL_API void CanonicalizeQuery(const char* spec,
+ const url_parse::Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ url_parse::Component* out_query);
+GURL_API void CanonicalizeQuery(const char16* spec,
+ const url_parse::Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output,
+ url_parse::Component* out_query);
+
+// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
+// canonicalizer that does not produce ASCII output). The output is
+// guaranteed to be valid UTF-8.
+//
+// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
+// the "Unicode replacement character" for the confusing bits and copy the rest.
+GURL_API void CanonicalizeRef(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+GURL_API void CanonicalizeRef(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path);
+
+// Full canonicalizer ---------------------------------------------------------
+//
+// These functions replace any string contents, rather than append as above.
+// See the above piece-by-piece functions for information specific to
+// canonicalizing individual components.
+//
+// The output will be ASCII except the reference fragment, which may be UTF-8.
+//
+// The 8-bit versions require UTF-8 encoding.
+
+// Use for standard URLs with authorities and paths.
+GURL_API bool CanonicalizeStandardURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeStandardURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Use for file URLs.
+GURL_API bool CanonicalizeFileURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Use for filesystem URLs.
+GURL_API bool CanonicalizeFileSystemURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeFileSystemURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Use for path URLs such as javascript. This does not modify the path in any
+// way, for example, by escaping it.
+GURL_API bool CanonicalizePathURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizePathURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Use for mailto URLs. This "canonicalizes" the url into a path and query
+// component. It does not attempt to merge "to" fields. It uses UTF-8 for
+// the query encoding if there is a query. This is because a mailto URL is
+// really intended for an external mail program, and the encoding of a page,
+// etc. which would influence a query encoding normally are irrelevant.
+GURL_API bool CanonicalizeMailtoURL(const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool CanonicalizeMailtoURL(const char16* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Part replacer --------------------------------------------------------------
+
+// Internal structure used for storing separate strings for each component.
+// The basic canonicalization functions use this structure internally so that
+// component replacement (different strings for different components) can be
+// treated on the same code path as regular canonicalization (the same string
+// for each component).
+//
+// A url_parse::Parsed structure usually goes along with this. Those
+// components identify offsets within these strings, so that they can all be
+// in the same string, or spread arbitrarily across different ones.
+//
+// This structure does not own any data. It is the caller's responsibility to
+// ensure that the data the pointers point to stays in scope and is not
+// modified.
+template<typename CHAR>
+struct URLComponentSource {
+ // Default constructor, normally used by callers wishing to replace
+ // components. Every source pointer starts out NULL, which means "no
+ // replacement"; the caller then sets only the ones it wants to replace.
+ URLComponentSource()
+ : scheme(NULL),
+ username(NULL),
+ password(NULL),
+ host(NULL),
+ port(NULL),
+ path(NULL),
+ query(NULL),
+ ref(NULL) {
+ }
+
+ // Constructor normally used internally: every component's source pointer is
+ // set to |default_value|, so that all components refer to the same spec.
+ explicit URLComponentSource(const CHAR* default_value)
+ : scheme(default_value),
+ username(default_value),
+ password(default_value),
+ host(default_value),
+ port(default_value),
+ path(default_value),
+ query(default_value),
+ ref(default_value) {
+ }
+ // Source string for each URL component; the pointed-to data is not owned.
+ const CHAR* scheme;
+ const CHAR* username;
+ const CHAR* password;
+ const CHAR* host;
+ const CHAR* port;
+ const CHAR* path;
+ const CHAR* query;
+ const CHAR* ref;
+};
+
+// This structure encapsulates information on modifying a URL. Each component
+// may either be left unchanged, replaced, or deleted.
+//
+// By default, each component is unchanged. For those components that should be
+// modified, call either Set* or Clear* to modify it.
+//
+// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
+// IN SCOPE BY THE CALLER for as long as this object exists!
+//
+// Prefer the 8-bit replacement version if possible since it is more efficient.
+template<typename CHAR>
+class Replacements {
+ public:
+ Replacements() { // All sources start out NULL, i.e. nothing is changed.
+ }
+
+ // Scheme: |s| is the replacement string, |comp| the range to use within it.
+ void SetScheme(const CHAR* s, const url_parse::Component& comp) {
+ sources_.scheme = s;
+ components_.scheme = comp;
+ }
+ // Note: we don't have a ClearScheme since this doesn't make any sense.
+ bool IsSchemeOverridden() const { return sources_.scheme != NULL; }
+
+ // Username: Set* replaces it, Clear* deletes it, Is*Overridden reports either.
+ void SetUsername(const CHAR* s, const url_parse::Component& comp) {
+ sources_.username = s;
+ components_.username = comp;
+ }
+ void ClearUsername() {
+ sources_.username = Placeholder(); // Non-NULL source marks an override...
+ components_.username = url_parse::Component(); // ...with no replacement.
+ }
+ bool IsUsernameOverridden() const { return sources_.username != NULL; }
+
+ // Password (same Set/Clear/Is*Overridden pattern as the username).
+ void SetPassword(const CHAR* s, const url_parse::Component& comp) {
+ sources_.password = s;
+ components_.password = comp;
+ }
+ void ClearPassword() {
+ sources_.password = Placeholder();
+ components_.password = url_parse::Component();
+ }
+ bool IsPasswordOverridden() const { return sources_.password != NULL; }
+
+ // Host (same pattern).
+ void SetHost(const CHAR* s, const url_parse::Component& comp) {
+ sources_.host = s;
+ components_.host = comp;
+ }
+ void ClearHost() {
+ sources_.host = Placeholder();
+ components_.host = url_parse::Component();
+ }
+ bool IsHostOverridden() const { return sources_.host != NULL; }
+
+ // Port (same pattern).
+ void SetPort(const CHAR* s, const url_parse::Component& comp) {
+ sources_.port = s;
+ components_.port = comp;
+ }
+ void ClearPort() {
+ sources_.port = Placeholder();
+ components_.port = url_parse::Component();
+ }
+ bool IsPortOverridden() const { return sources_.port != NULL; }
+
+ // Path (same pattern).
+ void SetPath(const CHAR* s, const url_parse::Component& comp) {
+ sources_.path = s;
+ components_.path = comp;
+ }
+ void ClearPath() {
+ sources_.path = Placeholder();
+ components_.path = url_parse::Component();
+ }
+ bool IsPathOverridden() const { return sources_.path != NULL; }
+
+ // Query (same pattern).
+ void SetQuery(const CHAR* s, const url_parse::Component& comp) {
+ sources_.query = s;
+ components_.query = comp;
+ }
+ void ClearQuery() {
+ sources_.query = Placeholder();
+ components_.query = url_parse::Component();
+ }
+ bool IsQueryOverridden() const { return sources_.query != NULL; }
+
+ // Ref (same pattern).
+ void SetRef(const CHAR* s, const url_parse::Component& comp) {
+ sources_.ref = s;
+ components_.ref = comp;
+ }
+ void ClearRef() {
+ sources_.ref = Placeholder();
+ components_.ref = url_parse::Component();
+ }
+ bool IsRefOverridden() const { return sources_.ref != NULL; }
+
+ // Getters for the internal data. See the variables below for how the
+ // information is encoded.
+ const URLComponentSource<CHAR>& sources() const { return sources_; }
+ const url_parse::Parsed& components() const { return components_; }
+
+ private:
+ // Returns a pointer to a static empty string that is used as a placeholder
+ // to indicate a component should be deleted (see below).
+ const CHAR* Placeholder() {
+ static const CHAR empty_string = 0;
+ return &empty_string;
+ }
+
+ // We support three states:
+ //
+ // Action | Source Component
+ // -----------------------+--------------------------------------------------
+ // Don't change component | NULL (unused)
+ // Replace component | (replacement string) (replacement component)
+ // Delete component | (non-NULL) (invalid component: (0,-1))
+ //
+ // We use a pointer to the empty string for the source when the component
+ // should be deleted.
+ URLComponentSource<CHAR> sources_;
+ url_parse::Parsed components_;
+};
+
+// The base must be an 8-bit canonical URL.
+GURL_API bool ReplaceStandardURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceStandardURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Filesystem URLs can only have the path, query, or ref replaced.
+// All other components will be ignored.
+GURL_API bool ReplaceFileSystemURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileSystemURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Replacing some parts of a file URL is not permitted. Everything except
+// the host, path, query, and ref will be ignored.
+GURL_API bool ReplaceFileURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceFileURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Path URLs can only have the scheme and path replaced. All other components
+// will be ignored.
+GURL_API bool ReplacePathURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplacePathURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Mailto URLs can only have the scheme, path, and query replaced.
+// All other components will be ignored.
+GURL_API bool ReplaceMailtoURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+GURL_API bool ReplaceMailtoURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed);
+
+// Relative URL ---------------------------------------------------------------
+
+// Given an input URL or URL fragment |fragment|, determines if it is a
+// relative or absolute URL and places the result into |*is_relative|. If it is
+// relative, the relevant portion of the URL will be placed into
+// |*relative_component| (there may have been trimmed whitespace, for example).
+// This value is passed to ResolveRelativeURL. If the input is not relative,
+// this value is UNDEFINED (it may be changed by the function).
+//
+// Returns true on success (we successfully determined the URL is relative or
+// not). Failure means that the combination of URLs doesn't make any sense.
+//
+// The base URL should always be canonical, therefore is ASCII.
+GURL_API bool IsRelativeURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const char* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ url_parse::Component* relative_component);
+GURL_API bool IsRelativeURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const char16* fragment,
+ int fragment_len,
+ bool is_base_hierarchical,
+ bool* is_relative,
+ url_parse::Component* relative_component);
+
+// Given a canonical parsed source URL, a URL fragment known to be relative,
+// and the identified relevant portion of the relative URL (computed by
+// IsRelativeURL), this produces a new parsed canonical URL in |output| and
+// |out_parsed|.
+//
+// It also requires a flag indicating whether the base URL is a file: URL
+// which triggers additional logic.
+//
+// The base URL should be canonical and have a host (may be empty for file
+// URLs) and a path. If it doesn't have these, we can't resolve relative
+// URLs off of it and will return the base as the output with an error flag.
+// Because it is canonical it should also be ASCII.
+//
+// The query charset converter follows the same rules as CanonicalizeQuery.
+//
+// Returns true on success. On failure, the output will be "something
+// reasonable" that will be consistent and valid, just probably not what
+// was intended by the web page author or caller.
+GURL_API bool ResolveRelativeURL(const char* base_url,
+ const url_parse::Parsed& base_parsed,
+ bool base_is_file,
+ const char* relative_url,
+ const url_parse::Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* out_parsed);
+GURL_API bool ResolveRelativeURL(const char* base_url,
+ const url_parse::Parsed& base_parsed,
+ bool base_is_file,
+ const char16* relative_url,
+ const url_parse::Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* out_parsed);
+
+} // namespace url_canon
+
+#endif // GOOGLEURL_SRC_URL_CANON_H__
diff --git a/url/url_canon_etc.cc b/url/url_canon_etc.cc
new file mode 100644
index 0000000..318c906
--- /dev/null
+++ b/url/url_canon_etc.cc
@@ -0,0 +1,392 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Canonicalizers for random bits that aren't big enough for their own files.
+
+#include <string.h>
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Returns true if the given character should be removed from the middle of a
+// URL: carriage return, line feed, or tab. Spaces are not in this set.
+inline bool IsRemovableURLWhitespace(int ch) {
+ return ch == '\r' || ch == '\n' || ch == '\t';
+}
+
+// Backend for RemoveURLWhitespace (see declaration in url_canon.h). Returns
+// |input| itself when nothing needs stripping, otherwise the stripped copy held
+// in |buffer|; |*output_len| gets the result length. This takes about 13% of the total URL canonicalization time.
+template<typename CHAR>
+const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
+ CanonOutputT<CHAR>* buffer,
+ int* output_len) {
+ // Fast verification that there's nothing that needs removal. This is the 99%
+ // case, so we want it to be fast and don't care about impacting the speed
+ // when we do find whitespace.
+ int found_whitespace = false;  // Declared int but used purely as a boolean flag.
+ for (int i = 0; i < input_len; i++) {
+ if (!IsRemovableURLWhitespace(input[i]))
+ continue;
+ found_whitespace = true;
+ break;
+ }
+
+ if (!found_whitespace) {
+ // Didn't find any whitespace, we don't need to do anything. We can just
+ // return the input as the output; |buffer| is left untouched.
+ *output_len = input_len;
+ return input;
+ }
+
+ // Copy everything except the removable whitespace into |buffer| and return it.
+ for (int i = 0; i < input_len; i++) {
+ if (!IsRemovableURLWhitespace(input[i]))
+ buffer->push_back(input[i]);
+ }
+ *output_len = buffer->length();
+ return buffer->data();
+}
+
+// Contains the canonical version of each possible input letter in the scheme
+// (basically, lower-cased), indexed by 7-bit character code (0x00-0x7f). The
+// corresponding entry will be 0 if the letter is not allowed in a scheme.
+const char kSchemeCanonical[0x80] = {
+// 00-1f: all are invalid
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// ' ' ! " # $ % & ' ( ) * + , - . /
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
+// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 ,
+// @ A B C D E F G H I J K L M N O
+ 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// P Q R S T U V W X Y Z [ \ ] ^ _
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0,
+// ` a b c d e f g h i j k l m n o
+ 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// p q r s t u v w x y z { | } ~ (and 0x7f)
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 };
+
+// True if |c| is an ASCII letter; DoScheme requires this of a scheme's first
+// character. This could be a table lookup as well (high bit set per valid character), but it's only
+// called once per URL, and the lookup table is easier to read without extra stuff in it.
+inline bool IsSchemeFirstChar(unsigned char c) {
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+template<typename CHAR, typename UCHAR>  // UCHAR: unsigned type each CHAR is cast to before the < 0x80 test.
+bool DoScheme(const CHAR* spec,  // Appends the canonical scheme and ':' to |output|; returns false if any character was invalid.
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme) {
+ if (scheme.len <= 0) {
+ // Scheme is unspecified or empty, convert to empty by appending a colon.
+ *out_scheme = url_parse::Component(output->length(), 0);
+ output->push_back(':');
+ return true;
+ }
+
+ // The output scheme starts from the current position.
+ out_scheme->begin = output->length();
+
+ // Danger: it's important that this code does not strip any characters: it
+ // only emits the canonical version (be it valid or escaped) of each of
+ // the input characters. Stripping would put it out of sync with
+ // url_util::FindAndCompareScheme, which could cause some security checks on
+ // schemes to be incorrect.
+ bool success = true;
+ int end = scheme.end();
+ for (int i = scheme.begin; i < end; i++) {
+ UCHAR ch = static_cast<UCHAR>(spec[i]);
+ char replacement = 0;  // Stays 0 when |ch| is not a valid scheme character at this position.
+ if (ch < 0x80) {
+ if (i == scheme.begin) {
+ // Need to do a special check for the first letter of the scheme.
+ if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
+ replacement = kSchemeCanonical[ch];
+ } else {
+ replacement = kSchemeCanonical[ch];
+ }
+ }
+
+ if (replacement) {
+ output->push_back(replacement);
+ } else if (ch == '%') {
+ // Canonicalizing the scheme multiple times should lead to the same
+ // result. Since invalid characters will be escaped, we need to preserve
+ // the percent to avoid multiple escaping. The scheme will be invalid.
+ success = false;
+ output->push_back('%');
+ } else {
+ // Invalid character, store it but mark this scheme as invalid.
+ success = false;
+
+ // This will escape the output and also handle encoding issues.
+ // Ignore the return value since we already failed.
+ AppendUTF8EscapedChar(spec, &i, end, output);
+ }
+ }
+
+ // The output scheme ends with the current position, before appending
+ // the colon.
+ out_scheme->len = output->length() - out_scheme->begin;
+ output->push_back(':');
+ return success;
+}
+
+// Appends the canonical user info (username, optional ":password", then "@") to
+// |output|, or nothing at all when both are empty. Always returns true. The
+// components reference ranges in the corresponding *_spec strings, which are
+// typically the same string but may differ when replacing components.
+template<typename CHAR, typename UCHAR>
+bool DoUserInfo(const CHAR* username_spec,
+ const url_parse::Component& username,
+ const CHAR* password_spec,
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password) {
+ if (username.len <= 0 && password.len <= 0) {
+ // Common case: no user info. We strip empty username/passwords.
+ *out_username = url_parse::Component();
+ *out_password = url_parse::Component();
+ return true;
+ }
+
+ // Write the username. It may come out empty if only a password was given.
+ out_username->begin = output->length();
+ if (username.len > 0) {
+ // This will escape characters not valid for the username.
+ AppendStringOfType(&username_spec[username.begin], username.len,
+ CHAR_USERINFO, output);
+ }
+ out_username->len = output->length() - out_username->begin;
+
+ // When there is a password, we need the separator. Note that we strip
+ // empty but specified passwords.
+ if (password.len > 0) {
+ output->push_back(':');
+ out_password->begin = output->length();
+ AppendStringOfType(&password_spec[password.begin], password.len,
+ CHAR_USERINFO, output);
+ out_password->len = output->length() - out_password->begin;
+ } else {
+ *out_password = url_parse::Component();
+ }
+
+ output->push_back('@');
+ return true;
+}
+
+// Helper for converting a port integer to a base-10 string: writes |port| into |output| (capacity |output_len|) via _itoa_s.
+inline void WritePortInt(char* output, int output_len, int port) {
+ _itoa_s(port, output, output_len, 10);
+}
+
+// Appends the canonical port to |output|, prepending the colon if there will be a port. Unspecified or scheme-default ports are dropped; invalid ones are echoed and false is returned.
+template<typename CHAR, typename UCHAR>
+bool DoPort(const CHAR* spec,
+ const url_parse::Component& port,
+ int default_port_for_scheme,
+ CanonOutput* output,
+ url_parse::Component* out_port) {
+ int port_num = url_parse::ParsePort(spec, port);
+ if (port_num == url_parse::PORT_UNSPECIFIED ||
+ port_num == default_port_for_scheme) {
+ *out_port = url_parse::Component();
+ return true; // Leave port empty.
+ }
+
+ if (port_num == url_parse::PORT_INVALID) {
+ // Invalid port: We'll copy the text from the input so the user can see
+ // what the error was, and mark the URL as invalid by returning false.
+ output->push_back(':');
+ out_port->begin = output->length();
+ AppendInvalidNarrowString(spec, port.begin, port.end(), output);
+ out_port->len = output->length() - out_port->begin;
+ return false;
+ }
+
+ // Convert port number back to a string. Max port value is 5 digits, and
+ // the Parsed::ExtractPort will have made sure the integer is in range.
+ const int buf_size = 6;  // 5 digits plus the terminator that ends the copy loop below.
+ char buf[buf_size];
+ WritePortInt(buf, buf_size, port_num);
+
+ // Append the port number to the output, preceded by a colon.
+ output->push_back(':');
+ out_port->begin = output->length();
+ for (int i = 0; i < buf_size && buf[i]; i++)
+ output->push_back(buf[i]);
+
+ out_port->len = output->length() - out_port->begin;
+ return true;
+}
+
+template<typename CHAR, typename UCHAR>  // UCHAR: unsigned type each CHAR is cast to for the range comparisons below.
+void DoCanonicalizeRef(const CHAR* spec,  // Appends '#' plus the canonical ref to |output|; emits nothing when the ref is absent (len < 0).
+ const url_parse::Component& ref,
+ CanonOutput* output,
+ url_parse::Component* out_ref) {
+ if (ref.len < 0) {
+ // Common case of no ref.
+ *out_ref = url_parse::Component();
+ return;
+ }
+
+ // Append the ref separator. Note that we need to do this even when the ref
+ // is empty but present.
+ output->push_back('#');
+ out_ref->begin = output->length();
+
+ // Now iterate through all the characters, converting to UTF-8 and validating.
+ int end = ref.end();
+ for (int i = ref.begin; i < end; i++) {
+ if (spec[i] == 0) {
+ // IE just strips NULLs, so we do too.
+ continue;
+ } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
+ // Unlike what IE seems to do, we escape control characters. This will probably
+ // make the reference fragment unusable on a web page, but people
+ // shouldn't be using control characters in their anchor names.
+ AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
+ } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
+ // Normal ASCII characters are just appended.
+ output->push_back(static_cast<char>(spec[i]));
+ } else {
+ // Non-ASCII characters are appended unescaped, but only when they are
+ // valid. Invalid Unicode characters are replaced with the "invalid
+ // character" as IE seems to (ReadUTFChar puts the unicode replacement
+ // character in the output on failure for us).
+ unsigned code_point;
+ ReadUTFChar(spec, &i, end, &code_point);
+ AppendUTF8Value(code_point, output);
+ }
+ }
+
+ out_ref->len = output->length() - out_ref->begin;
+}
+
+} // namespace
+
+const char* RemoveURLWhitespace(const char* input, int input_len,  // 8-bit entry point; forwards to DoRemoveURLWhitespace.
+ CanonOutputT<char>* buffer,
+ int* output_len) {
+ return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+}
+
+const char16* RemoveURLWhitespace(const char16* input, int input_len,  // char16 entry point; same backend, char16 buffer.
+ CanonOutputT<char16>* buffer,
+ int* output_len) {
+ return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+}
+
+char CanonicalSchemeChar(char16 ch) {  // Returns the kSchemeCanonical entry for |ch|, or 0 if |ch| is not allowed in a scheme.
+ if (ch >= 0x80)
+ return 0; // Non-ASCII is not supported by schemes (and would index past the 0x80-entry table).
+ return kSchemeCanonical[ch];
+}
+
+bool CanonicalizeScheme(const char* spec,  // 8-bit entry point; forwards to DoScheme and returns its validity result.
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme) {
+ return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
+}
+
+bool CanonicalizeScheme(const char16* spec,  // char16 entry point; char16 doubles as the UCHAR template argument.
+ const url_parse::Component& scheme,
+ CanonOutput* output,
+ url_parse::Component* out_scheme) {
+ return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
+}
+
+bool CanonicalizeUserInfo(const char* username_source,  // 8-bit entry point; forwards to DoUserInfo.
+ const url_parse::Component& username,
+ const char* password_source,  // May be a different string from |username_source|.
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password) {
+ return DoUserInfo<char, unsigned char>(
+ username_source, username, password_source, password,
+ output, out_username, out_password);
+}
+
+bool CanonicalizeUserInfo(const char16* username_source,  // char16 entry point; forwards to DoUserInfo.
+ const url_parse::Component& username,
+ const char16* password_source,  // May be a different string from |username_source|.
+ const url_parse::Component& password,
+ CanonOutput* output,
+ url_parse::Component* out_username,
+ url_parse::Component* out_password) {
+ return DoUserInfo<char16, char16>(
+ username_source, username, password_source, password,
+ output, out_username, out_password);
+}
+
+bool CanonicalizePort(const char* spec,  // 8-bit entry point; forwards to DoPort.
+ const url_parse::Component& port,
+ int default_port_for_scheme,  // A port equal to this value is omitted from the output.
+ CanonOutput* output,
+ url_parse::Component* out_port) {
+ return DoPort<char, unsigned char>(spec, port,
+ default_port_for_scheme,
+ output, out_port);
+}
+
+bool CanonicalizePort(const char16* spec,  // char16 entry point; forwards to DoPort.
+ const url_parse::Component& port,
+ int default_port_for_scheme,  // A port equal to this value is omitted from the output.
+ CanonOutput* output,
+ url_parse::Component* out_port) {
+ return DoPort<char16, char16>(spec, port, default_port_for_scheme,
+ output, out_port);
+}
+
+void CanonicalizeRef(const char* spec,  // 8-bit entry point; forwards to DoCanonicalizeRef. No success value is reported.
+ const url_parse::Component& ref,
+ CanonOutput* output,
+ url_parse::Component* out_ref) {
+ DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
+}
+
+void CanonicalizeRef(const char16* spec,  // char16 entry point; forwards to DoCanonicalizeRef.
+ const url_parse::Component& ref,
+ CanonOutput* output,
+ url_parse::Component* out_ref) {
+ DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_filesystemurl.cc b/url/url_canon_filesystemurl.cc
new file mode 100644
index 0000000..7f79208
--- /dev/null
+++ b/url/url_canon_filesystemurl.cc
@@ -0,0 +1,158 @@
+// Copyright 2012, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "filesystem:file:" URLs.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse_internal.h"
+#include "googleurl/src/url_util.h"
+#include "googleurl/src/url_util_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Canonicalizes a filesystem: URL into |output| and |new_parsed|. We use the URLComponentSource for the outer URL, as it can have replacements,
+// whereas the inner_url can't, so it uses spec. Returns false if the inner URL is missing or of an unsupported kind, or if a required part fails.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeFileSystemURL(const CHAR* spec,
+ const URLComponentSource<CHAR>& source,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ // filesystem only uses {scheme, path, query, ref} -- clear the rest.
+ new_parsed->username = url_parse::Component();
+ new_parsed->password = url_parse::Component();
+ new_parsed->host = url_parse::Component();
+ new_parsed->port = url_parse::Component();
+
+ const url_parse::Parsed* inner_parsed = parsed.inner_parsed();
+ url_parse::Parsed new_inner_parsed;
+
+ // Scheme (known, so we don't bother running it through the more
+ // complicated scheme canonicalizer).
+ new_parsed->scheme.begin = output->length();
+ output->Append("filesystem:", 11);
+ new_parsed->scheme.len = 10;  // Length of "filesystem", excluding the colon.
+
+ if (!parsed.inner_parsed() || !parsed.inner_parsed()->scheme.is_valid())
+ return false;  // Note: "filesystem:" has already been appended to |output| at this point.
+
+ bool success = true;
+ if (url_util::CompareSchemeComponent(spec, inner_parsed->scheme,
+ url_util::kFileScheme)) {
+ new_inner_parsed.scheme.begin = output->length();
+ output->Append("file://", 7);
+ new_inner_parsed.scheme.len = 4;  // Length of "file"; the "://" is not counted in the scheme.
+ success &= CanonicalizePath(spec, inner_parsed->path, output,
+ &new_inner_parsed.path);
+ } else if (url_util::IsStandard(spec, inner_parsed->scheme)) {
+ success =
+ url_canon::CanonicalizeStandardURL(spec,
+ parsed.inner_parsed()->Length(),
+ *parsed.inner_parsed(),
+ charset_converter, output,
+ &new_inner_parsed);
+ } else {
+ // TODO(ericu): The URL is wrong, but should we try to output more of what
+ // we were given? Echoing back filesystem:mailto etc. doesn't seem all that
+ // useful.
+ return false;
+ }
+ // The filesystem type must be more than just a leading slash for validity.
+ success &= parsed.inner_parsed()->path.len > 1;
+
+ success &= CanonicalizePath(source.path, parsed.path, output,
+ &new_parsed->path);
+
+ // Ignore failures for query/ref since the URL can probably still be loaded.
+ CanonicalizeQuery(source.query, parsed.query, charset_converter,
+ output, &new_parsed->query);
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+ if (success)
+ new_parsed->set_inner_parsed(new_inner_parsed);  // Only attached on success; on failure |new_parsed| gets no inner parsed from here.
+
+ return success;
+}
+
+} // namespace
+
+bool CanonicalizeFileSystemURL(const char* spec,  // 8-bit entry point; |spec| serves as the source for both the outer and inner URL.
+ int spec_len,  // Not used by this function.
+ const url_parse::Parsed& parsed,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ return DoCanonicalizeFileSystemURL<char, unsigned char>(
+ spec, URLComponentSource<char>(spec), parsed, charset_converter, output,
+ new_parsed);
+}
+
+bool CanonicalizeFileSystemURL(const char16* spec,  // char16 entry point; |spec| serves as the source for both the outer and inner URL.
+ int spec_len,  // Not used by this function.
+ const url_parse::Parsed& parsed,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ return DoCanonicalizeFileSystemURL<char16, char16>(
+ spec, URLComponentSource<char16>(spec), parsed, charset_converter, output,
+ new_parsed);
+}
+
+bool ReplaceFileSystemURL(const char* base,  // Re-canonicalizes |base| with |replacements| applied to the outer URL's components.
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ URLComponentSource<char> source(base);
+ url_parse::Parsed parsed(base_parsed);  // Local copy so the overrides don't modify the caller's |base_parsed|.
+ SetupOverrideComponents(base, replacements, &source, &parsed);
+ return DoCanonicalizeFileSystemURL<char, unsigned char>(
+ base, source, parsed, charset_converter, output, new_parsed);
+}
+
+bool ReplaceFileSystemURL(const char* base,  // Same as above, but the replacements are char16 while |base| stays narrow.
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* charset_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ RawCanonOutput<1024> utf8;  // Scratch storage handed to SetupUTF16OverrideComponents; presumably holds the converted replacements -- confirm.
+ URLComponentSource<char> source(base);
+ url_parse::Parsed parsed(base_parsed);
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+ return DoCanonicalizeFileSystemURL<char, unsigned char>(
+ base, source, parsed, charset_converter, output, new_parsed);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_fileurl.cc b/url/url_canon_fileurl.cc
new file mode 100644
index 0000000..97023eb
--- /dev/null
+++ b/url/url_canon_fileurl.cc
@@ -0,0 +1,215 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "file:" URLs.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+#ifdef WIN32
+
+// Given a pointer into the spec, this copies and canonicalizes the drive
+// letter and colon to the output, if one is found. If there is not a drive
+// spec, it won't do anything. The index of the next character in the input
+// spec is returned (after the colon when a drive spec is found, the begin
+// offset if one is not). Leading slashes before the drive letter are consumed.
+template<typename CHAR>
+int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+ CanonOutput* output) {
+ // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
+ // (with backslashes instead of slashes as well).
+ int num_slashes = url_parse::CountConsecutiveSlashes(spec, begin, end);
+ int after_slashes = begin + num_slashes;
+
+ if (!url_parse::DoesBeginWindowsDriveSpec(spec, after_slashes, end))
+ return begin; // Haven't consumed any characters
+
+ // A drive spec is the start of a path, so we need to add a slash for the
+ // authority terminator (typically the third slash).
+ output->push_back('/');
+
+ // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
+ // and that it is followed by a colon/pipe.
+
+ // Normalize Windows drive letters to uppercase
+ if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
+ output->push_back(spec[after_slashes] - 'a' + 'A');
+ else
+ output->push_back(static_cast<char>(spec[after_slashes]));
+
+ // Normalize the character following it to a colon rather than pipe.
+ output->push_back(':');
+ return after_slashes + 2;  // Skip past the drive letter and the colon/pipe.
+}
+
+#endif // WIN32
+
+template<typename CHAR, typename UCHAR>  // Canonicalizes a file: path into |output|; returns the result of the regular path canonicalizer (true if it wasn't needed).
+bool DoFileCanonicalizePath(const CHAR* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path) {
+ // Copies and normalizes the "c:" at the beginning, if present.
+ out_path->begin = output->length();
+ int after_drive;
+#ifdef WIN32
+ after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output);
+#else
+ after_drive = path.begin;
+#endif
+
+ // Copies the rest of the path, starting from the slash following the
+ // drive colon (if any, Windows only), or the first slash of the path.
+ bool success = true;
+ if (after_drive < path.end()) {
+ // Use the regular path canonicalizer to canonicalize the rest of the
+ // path. Give it a fake output component to write into. DoCanonicalizeFile
+ // will compute the full path component.
+ url_parse::Component sub_path =
+ url_parse::MakeRange(after_drive, path.end());
+ url_parse::Component fake_output_path;  // Discarded; |out_path| is computed from |output| below instead.
+ success = CanonicalizePath(spec, sub_path, output, &fake_output_path);
+ } else {
+ // No input path, canonicalize to a slash.
+ output->push_back('/');
+ }
+
+ out_path->len = output->length() - out_path->begin;  // Covers the drive spec (if any) plus the rest of the path.
+ return success;
+}
+
+template<typename CHAR, typename UCHAR>  // Canonicalizes a file: URL into |output| and |new_parsed|; success reflects the host and path only.
+bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ // Things we don't set in file: URLs.
+ new_parsed->username = url_parse::Component();
+ new_parsed->password = url_parse::Component();
+ new_parsed->port = url_parse::Component();
+
+ // Scheme (known, so we don't bother running it through the more
+ // complicated scheme canonicalizer).
+ new_parsed->scheme.begin = output->length();
+ output->Append("file://", 7);
+ new_parsed->scheme.len = 4;  // Length of "file"; the "://" is not counted in the scheme.
+
+ // Append the host. For many file URLs, this will be empty. For UNC, this
+ // will be present.
+ // TODO(brettw) This doesn't do any checking for host name validity. We
+ // should probably handle validity checking of UNC hosts differently than
+ // for regular IP hosts.
+ bool success = CanonicalizeHost(source.host, parsed.host,
+ output, &new_parsed->host);
+ success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path,
+ output, &new_parsed->path);
+ CanonicalizeQuery(source.query, parsed.query, query_converter,  // Not folded into |success|.
+ output, &new_parsed->query);
+
+ // Ignore failure for refs since the URL can probably still be loaded.
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+ return success;
+}
+
+} // namespace
+
+bool CanonicalizeFileURL(const char* spec,  // 8-bit entry point; every component is read from |spec|.
+ int spec_len,  // Not used by this function.
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ return DoCanonicalizeFileURL<char, unsigned char>(
+ URLComponentSource<char>(spec), parsed, query_converter,
+ output, new_parsed);
+}
+
+bool CanonicalizeFileURL(const char16* spec,  // char16 entry point; every component is read from |spec|.
+ int spec_len,  // Not used by this function.
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ return DoCanonicalizeFileURL<char16, char16>(
+ URLComponentSource<char16>(spec), parsed, query_converter,
+ output, new_parsed);
+}
+
+bool FileCanonicalizePath(const char* spec,  // 8-bit entry point; forwards to DoFileCanonicalizePath.
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path) {
+ return DoFileCanonicalizePath<char, unsigned char>(spec, path,
+ output, out_path);
+}
+
+bool FileCanonicalizePath(const char16* spec,  // char16 entry point; forwards to DoFileCanonicalizePath.
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path) {
+ return DoFileCanonicalizePath<char16, char16>(spec, path,
+ output, out_path);
+}
+
+bool ReplaceFileURL(const char* base,  // Re-canonicalizes the file: URL |base| with |replacements| applied.
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ URLComponentSource<char> source(base);
+ url_parse::Parsed parsed(base_parsed);  // Local copy so the overrides don't modify the caller's |base_parsed|.
+ SetupOverrideComponents(base, replacements, &source, &parsed);
+ return DoCanonicalizeFileURL<char, unsigned char>(
+ source, parsed, query_converter, output, new_parsed);
+}
+
+bool ReplaceFileURL(const char* base,  // Same as above, but the replacements are char16 while |base| stays narrow.
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ RawCanonOutput<1024> utf8;  // Scratch storage handed to SetupUTF16OverrideComponents; presumably holds the converted replacements -- confirm.
+ URLComponentSource<char> source(base);
+ url_parse::Parsed parsed(base_parsed);
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+ return DoCanonicalizeFileURL<char, unsigned char>(
+ source, parsed, query_converter, output, new_parsed);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc
new file mode 100644
index 0000000..6642004
--- /dev/null
+++ b/url/url_canon_host.cc
@@ -0,0 +1,401 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/logging.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// For reference, here's what IE supports:
+// Key: 0 (disallowed: failure if present in the input)
+// + (allowed either escaped or unescaped, and unmodified)
+// U (allowed escaped or unescaped but always unescaped if present in
+// escaped form)
+// E (allowed escaped or unescaped but always escaped if present in
+// unescaped form)
+// % (only allowed escaped in the input, will be unmodified).
+// Alphanumeric characters are left blank in the table below.
+//
+// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+// -----------------------------------------------
+// 0 0 E E E E E E E E E E E E E E E
+// 1 E E E E E E E E E E E E E E E E
+// 2 E + E E + E + + + + + + + U U 0
+// 3 % % E + E 0 <-- Those are : ; < = > ?
+// 4 %
+// 5 U 0 U U U <-- Those are [ \ ] ^ _
+// 6 E <-- That's `
+// 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE)
+//
+// NOTE: I didn't actually test all the control characters. Some may be
+// disallowed in the input, but they are all accepted escaped except for 0.
+// I also didn't test if characters affecting HTML parsing are allowed
+// unescaped, eg. (") or (#), which would indicate the beginning of the path.
+// Surprisingly, space is accepted in the input and always escaped.
+
+// This table lists the canonical version of all characters we allow in the
+// input, with 0 indicating it is disallowed. We use the magic kEsc value
+// (0xff) to indicate that this character is valid but should be escaped. We
+// are a little more restrictive than IE, but less restrictive than Firefox.
+//
+// Note that we disallow the % character. We will allow it when part of an
+// escape sequence, of course, but this disallows "%25". Even though IE allows
+// it, allowing it would put us in a funny state. If there was an invalid
+// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
+// Allowing percents means we'll succeed a second time, so validity would change
+// based on how many times you run the canonicalizer. We prefer to always report
+// the same validity, so reject this.
+const unsigned char kEsc = 0xff;
+const unsigned char kHostCharLookup[0x80] = {
+// 00-1f: all are invalid
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// ' ' ! " # $ % & ' ( ) * + , - . /
+ kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,
+// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,
+// @ A B C D E F G H I J K L M N O
+ kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// P Q R S T U V W X Y Z [ \ ] ^ _
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',
+// ` a b c d e f g h i j k l m n o
+ kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+// p q r s t u v w x y z { | } ~ DEL
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };
+
+const int kTempHostBufferLen = 1024;  // Size argument for the buffers below.
+typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;  // 8-bit temp.
+typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;  // 16-bit.
+
+// Makes one pass over the |host| range of |spec| and reports two facts:
+// |*has_non_ascii| is true when any character is 0x80 or above, and
+// |*has_escaped| is true when a percent sign is present. Both outputs are
+// always written, even for an empty range.
+template<typename CHAR, typename UCHAR>
+void ScanHostname(const CHAR* spec, const url_parse::Component& host,
+                  bool* has_non_ascii, bool* has_escaped) {
+  bool saw_high_bit = false;
+  bool saw_percent = false;
+
+  const int host_end = host.end();
+  for (int pos = host.begin; pos < host_end; ++pos) {
+    const CHAR ch = spec[pos];
+    // Compare through the unsigned type so a signed char with the high bit
+    // set is not seen as a negative number.
+    if (static_cast<UCHAR>(ch) >= 0x80) {
+      saw_high_bit = true;
+    } else if (ch == '%') {
+      saw_percent = true;
+    }
+  }
+
+  *has_non_ascii = saw_high_bit;
+  *has_escaped = saw_percent;
+}
+
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be unescaped.
+// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
+//
+// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
+// the output.
+//
+// This function is used in two situations:
+//
+// * When the caller knows there are no non-ASCII or percent escaped
+//   characters. This is what DoHost does. The result will be a completely
+//   canonicalized host since we know nothing weird can happen (escaped
+//   characters could be unescaped to non-7-bit, so they have to be treated
+//   with suspicion at this point). It does not use the |has_non_ascii| flag.
+//
+// * When the caller has an 8-bit string that may need unescaping.
+//   DoComplexHost calls us in this situation to do unescaping and validation.
+//   After this, it may do other IDN operations depending on the value of the
+//   |*has_non_ascii| flag.
+//
+// |host| / |host_len| give the characters to process; the canonical form is
+// appended to |output|.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
+  *has_non_ascii = false;
+
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible. The decoded byte goes into a real
+      // unsigned char rather than into |source| through a pointer cast:
+      // storing a single byte into an unsigned int only updates the low-order
+      // byte on little-endian targets.
+      unsigned char unescaped = 0;
+      if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
+        // Invalid escaped character. There is nothing that can make this
+        // host valid. We append an escaped percent so the URL looks reasonable
+        // and mark as failed.
+        AppendEscapedChar('%', output);
+        success = false;
+        continue;
+      }
+      source = unescaped;
+    }
+
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
+      if (!replacement) {
+        // Invalid character, add it as percent-escaped and mark as failed.
+        AppendEscapedChar(source, output);
+        success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
+      } else {
+        // Common case, the given character is valid in a hostname, the lookup
+        // table tells us the canonical representation of that character (lower
+        // cased).
+        output->push_back(replacement);
+      }
+    } else {
+      // It's a non-ascii char. Just push it to the output.
+      // In case where we have char16 input, and char output it's safe to
+      // cast char16->char only if input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
+    }
+  }
+
+  return success;
+}
+
+// Canonicalizes a UTF-16 host via IDNToASCII into |output|; true on success.
+bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
+ // Run the host through DoSimpleHost before IDN conversion, since Punycode
+ // strings cannot be escaped afterwards. Its return value is ignored here.
+ RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
+ bool has_non_ascii;
+ DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
+
+ StackBufferW wide_output;
+ if (!IDNToASCII(url_escaped_host.data(),
+ url_escaped_host.length(),
+ &wide_output)) {
+ // Some error, give up. This will write some reasonable looking
+ // representation of the original |src| string to the output.
+ AppendInvalidNarrowString(src, 0, src_len, output);
+ return false;
+ }
+
+ // Now we check the ASCII output like a normal host. It will also handle
+ // unescaping. Although we unescaped everything before this function call, if
+ // somebody does %00 as fullwidth, ICU will convert this to ASCII.
+ bool success = DoSimpleHost(wide_output.data(),
+ wide_output.length(),
+ output, &has_non_ascii);
+ DCHECK(!has_non_ascii);
+ return success;
+}
+
+// 8-bit convert host to its ASCII version: this unescapes if |has_escaped| is
+// set, converts the UTF-8 input to UTF-16, and hands it to DoIDNHost. Returns
+// true if the host written to |output| is potentially valid.
+bool DoComplexHost(const char* host, int host_len,
+ bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+ // Save the current position in the output. We may write stuff and rewind it
+ // below, so we need to know where to rewind to.
+ int begin_length = output->length();
+
+ // Points to the UTF-8 data we want to convert. This will either be the
+ // input or the unescaped version written to |*output| if necessary.
+ const char* utf8_source;
+ int utf8_source_len;
+ if (has_escaped) {
+ // Unescape before converting to UTF-16 for IDN. We write this into the
+ // output because it most likely does not require IDNization, and we can
+ // save another huge stack buffer. It will be replaced below if it requires
+ // IDN. This will also update our non-ASCII flag so we know whether the
+ // unescaped input requires IDN.
+ if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
+ // Error with some escape sequence. We'll call the current output
+ // complete. DoSimpleHost will have written some "reasonable" output.
+ return false;
+ }
+
+ // Unescaping may have left us with ASCII input, in which case the
+ // unescaped version we wrote to output is complete.
+ if (!has_non_ascii) {
+ return true;
+ }
+
+ // Save a pointer to the data that was just converted (it may be appended
+ // to other data in the output buffer).
+ utf8_source = &output->data()[begin_length];
+ utf8_source_len = output->length() - begin_length;
+ } else {
+ // We don't need to unescape, use input for IDNization later. (We know the
+ // input has non-ASCII, or the simple version would have been called
+ // instead of us.)
+ utf8_source = host;
+ utf8_source_len = host_len;
+ }
+
+ // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
+ // Above, we may have used the output to write the unescaped values to, so
+ // we have to rewind it to where we started after we convert it to UTF-16.
+ StackBufferW utf16;
+ if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
+ // In this error case, |utf8_source| may point into |output|, so copy it
+ // to a temporary before rewinding and re-appending the invalid string.
+ StackBuffer utf8;
+ for (int i = 0; i < utf8_source_len; i++)
+ utf8.push_back(utf8_source[i]);
+ output->set_length(begin_length);
+ AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
+ return false;
+ }
+ output->set_length(begin_length);
+
+ // This will call DoSimpleHost which will do normal ASCII canonicalization.
+ // (The check for IP addresses happens afterwards, in DoHost.)
+ return DoIDNHost(utf16.data(), utf16.length(), output);
+}
+
+// UTF-16 flavor of the complex host path. When |has_escaped| is false the
+// input is handed to DoIDNHost as-is. When it is true the text is first
+// converted to UTF-8 so that the 8-bit DoComplexHost can unescape it (the
+// escaped characters should be interpreted as UTF-8). Returns true if the
+// host written to |output| is potentially valid.
+bool DoComplexHost(const char16* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  if (!has_escaped) {
+    // No unescaping necessary, so the input can go straight to ICU. We only
+    // get called with escaped or non-ASCII input; even if the input turned
+    // out to be plain ASCII this would still do the right thing, just more
+    // slowly than necessary.
+    return DoIDNHost(host, host_len, output);
+  }
+
+  // Escaped characters in wide input: convert to UTF-8, let the 8-bit code
+  // unescape, and have it convert back to UTF-16 for IDN. The pure-ASCII case
+  // (which could be a plain copy) is deliberately not special-cased, because
+  // escaped host names should be very rare and the conversion is cheap.
+  StackBuffer narrow_host;
+  const bool converted = ConvertUTF16ToUTF8(host, host_len, &narrow_host);
+  if (!converted) {
+    AppendInvalidNarrowString(host, 0, host_len, output);
+    return false;
+  }
+
+  return DoComplexHost(narrow_host.data(), narrow_host.length(),
+                       has_non_ascii, has_escaped, output);
+}
+
+// Shared implementation behind the public CanonicalizeHost* functions.
+// Appends the canonical form of the |host| range of |spec| to |output| and
+// fills |host_info| with the resulting family and output range. When
+// canonicalization fails the family is set to BROKEN.
+template<typename CHAR, typename UCHAR>
+void DoHost(const CHAR* spec,
+            const url_parse::Component& host,
+            CanonOutput* output,
+            CanonHostInfo* host_info) {
+  // Empty hosts write nothing and report a default-constructed component.
+  if (host.len <= 0) {
+    host_info->family = CanonHostInfo::NEUTRAL;
+    host_info->out_host = url_parse::Component();
+    return;
+  }
+
+  bool contains_non_ascii;
+  bool contains_escapes;
+  ScanHostname<CHAR, UCHAR>(spec, host, &contains_non_ascii,
+                            &contains_escapes);
+
+  // Remember where the host begins in |output| so it can be rewound later.
+  const int host_start = output->length();
+  const CHAR* const host_chars = &spec[host.begin];
+
+  bool canonicalized;
+  if (contains_non_ascii || contains_escapes) {
+    canonicalized = DoComplexHost(host_chars, host.len, contains_non_ascii,
+                                  contains_escapes, output);
+  } else {
+    canonicalized = DoSimpleHost(host_chars, host.len, output,
+                                 &contains_non_ascii);
+    DCHECK(!contains_non_ascii);
+  }
+
+  if (canonicalized) {
+    // After all the other canonicalization, check if we ended up with an IP
+    // address. IP addresses are small, so writing into this temporary buffer
+    // should not cause an allocation.
+    RawCanonOutput<64> ip_buffer;
+    CanonicalizeIPAddress(output->data(),
+                          url_parse::MakeRange(host_start, output->length()),
+                          &ip_buffer, host_info);
+
+    // For an IPv4/IPv6 address, replace what we wrote with the canonical IP
+    // form. Otherwise it's a hostname or broken IP and stays in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(host_start);
+      output->Append(ip_buffer.data(), ip_buffer.length());
+    }
+  } else {
+    // Canonicalization failed. Set BROKEN to notify the caller.
+    host_info->family = CanonHostInfo::BROKEN;
+  }
+
+  host_info->out_host = url_parse::MakeRange(host_start, output->length());
+}
+
+} // namespace
+
+// 8-bit entry point. Canonicalizes the host through DoHost, stores the output
+// range in |*out_host|, and returns false only when the host was BROKEN.
+bool CanonicalizeHost(const char* spec,
+                      const url_parse::Component& host,
+                      CanonOutput* output,
+                      url_parse::Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char, unsigned char>(spec, host, output, &info);
+
+  *out_host = info.out_host;
+  const bool broken = (info.family == CanonHostInfo::BROKEN);
+  return !broken;
+}
+
+// UTF-16 entry point. Canonicalizes the host through DoHost, stores the
+// output range in |*out_host|, and returns false only when the host was
+// BROKEN.
+bool CanonicalizeHost(const char16* spec,
+                      const url_parse::Component& host,
+                      CanonOutput* output,
+                      url_parse::Component* out_host) {
+  CanonHostInfo info;
+  DoHost<char16, char16>(spec, host, output, &info);
+
+  *out_host = info.out_host;
+  const bool broken = (info.family == CanonHostInfo::BROKEN);
+  return !broken;
+}
+
+// 8-bit entry point that exposes the full CanonHostInfo filled in by DoHost
+// instead of reducing it to a success flag.
+void CanonicalizeHostVerbose(const char* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+
+// UTF-16 entry point that exposes the full CanonHostInfo filled in by DoHost
+// instead of reducing it to a success flag.
+void CanonicalizeHostVerbose(const char16* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo* host_info) {
+  DoHost<char16, char16>(spec, host, output, host_info);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_icu.cc b/url/url_canon_icu.cc
new file mode 100644
index 0000000..eaae643
--- /dev/null
+++ b/url/url_canon_icu.cc
@@ -0,0 +1,210 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ICU integration functions.
+
+#include <stdlib.h>
+#include <string.h>
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/uidna.h>
+
+#include "googleurl/src/url_canon_icu.h"
+#include "googleurl/src/url_canon_internal.h" // for _itoa_s
+
+#include "base/logging.h"
+
+namespace url_canon {
+
+namespace {
+
+// ICU "from Unicode" callback used for characters that can not be represented
+// in the destination charset. For the UCNV_UNASSIGNED reason it emits the
+// numerical character reference for the code point, "&#1234;", with the
+// non-digits percent-escaped: "%26%231234%3B". Why? This is what Netscape did
+// back in the olden days. Any other |reason| is left untouched.
+void appendURLEscapedChar(const void* context,
+                          UConverterFromUnicodeArgs* from_args,
+                          const UChar* code_units,
+                          int32_t length,
+                          UChar32 code_point,
+                          UConverterCallbackReason reason,
+                          UErrorCode* err) {
+  if (reason != UCNV_UNASSIGNED)
+    return;
+
+  // Reset the error code before writing our replacement text.
+  *err = U_ZERO_ERROR;
+
+  static const char kEscapedPrefix[] = "%26%23";  // "&#" percent-escaped
+  static const char kEscapedSuffix[] = "%3B";     // ";" percent-escaped
+
+  // The lengths passed below exclude the literals' terminating NUL.
+  ucnv_cbFromUWriteBytes(from_args, kEscapedPrefix,
+                         static_cast<int>(sizeof(kEscapedPrefix)) - 1, 0, err);
+
+  DCHECK(code_point < 0x110000);
+  char digits[8];  // Max Unicode code point is 7 digits.
+  _itoa_s(code_point, digits, 10);
+  ucnv_cbFromUWriteBytes(from_args, digits,
+                         static_cast<int>(strlen(digits)), 0, err);
+
+  ucnv_cbFromUWriteBytes(from_args, kEscapedSuffix,
+                         static_cast<int>(sizeof(kEscapedSuffix)) - 1, 0, err);
+}
+
+// A class for scoping the installation of the invalid character callback.
+// The constructor installs appendURLEscapedChar on the converter and records
+// the callback/context that were there before; the destructor puts them back.
+class AppendHandlerInstaller {
+ public:
+  // The owner of this object must ensure that the converter is alive for the
+  // duration of this object's lifetime.
+  explicit AppendHandlerInstaller(UConverter* converter)
+      : converter_(converter) {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+                          &old_callback_, &old_context_, &err);
+  }
+
+  ~AppendHandlerInstaller() {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+  }
+
+ private:
+  // Not copyable: a copy would restore the previous callback a second time,
+  // possibly while the original is still in scope. Declared, never defined.
+  AppendHandlerInstaller(const AppendHandlerInstaller&);
+  void operator=(const AppendHandlerInstaller&);
+
+  // Converter the callback is installed on; not owned.
+  UConverter* converter_;
+
+  // Callback and context that were installed before this object took over.
+  UConverterFromUCallback old_callback_;
+  const void* old_context_;
+};
+
+} // namespace
+
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+ : converter_(converter) {  // Only the pointer is stored.
+}
+
+ICUCharsetConverter::~ICUCharsetConverter() {  // Nothing is released here.
+}
+
+// Converts |input| (|input_len| UTF-16 code units) with |converter_| and
+// appends the resulting bytes to |output|, growing |output| as needed.
+// Characters ICU reports as unassigned in the destination character set are
+// written by appendURLEscapedChar as percent-escaped numeric references.
+void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
+                                           int input_len,
+                                           CanonOutput* output) {
+  // Install our error handler. It will be called for character that can not
+  // be represented in the destination character set.
+  AppendHandlerInstaller handler(converter_);
+
+  const int begin_offset = output->length();
+  int dest_capacity = output->capacity() - begin_offset;
+
+  for (;;) {
+    UErrorCode err = U_ZERO_ERROR;
+    // Recomputed on every pass since the buffer is resized between attempts.
+    char* dest = &output->data()[begin_offset];
+    const int required_capacity = ucnv_fromUChars(
+        converter_, dest, dest_capacity, input, input_len, &err);
+    if (err != U_BUFFER_OVERFLOW_ERROR) {
+      // Anything other than overflow ends the loop; the output length is set
+      // from the count ICU returned.
+      output->set_length(begin_offset + required_capacity);
+      return;
+    }
+
+    // Output didn't fit, expand to the size ICU asked for and try again.
+    dest_capacity = required_capacity;
+    output->Resize(begin_offset + dest_capacity);
+  }
+}
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must be ASCII, but is represented as wide characters.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, this will return false. The output in this case is undefined.
+bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
+  DCHECK(output->length() == 0);  // Output buffer is assumed empty.
+  while (true) {
+    // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
+    // the spec (which do exist). This does not present any risk and is a
+    // little more future proof.
+    UErrorCode err = U_ZERO_ERROR;
+    int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
+                                         output->capacity(),
+                                         UIDNA_ALLOW_UNASSIGNED, NULL, &err);
+    // U_SUCCESS (rather than == U_ZERO_ERROR) also accepts ICU warning codes,
+    // notably U_STRING_NOT_TERMINATED_WARNING, which is reported when the
+    // result exactly fills the buffer. We track the length explicitly and do
+    // not need a terminator, so that case is a successful conversion.
+    if (U_SUCCESS(err)) {
+      output->set_length(num_converted);
+      return true;
+    }
+    if (err != U_BUFFER_OVERFLOW_ERROR)
+      return false;  // Unknown error, give up.
+
+    // Not enough room in our buffer, expand.
+    output->Resize(output->capacity() * 2);
+  }
+}
+
+// Decodes the UTF-8 sequence starting at str[*begin], bounded by |length|.
+// On return |*begin| indexes the last byte consumed. Returns true with the
+// decoded value in |*code_point_out| when it is a valid Unicode character;
+// otherwise returns false and stores kUnicodeReplacementCharacter instead.
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  // Signed on purpose: avoids a warning when U8_NEXT writes -1 to it.
+  int decoded;
+  U8_NEXT(str, *begin, length, decoded);
+
+  // The ICU macro above moves to the next char, we want to point to the last
+  // char consumed.
+  --(*begin);
+
+  // Validate the decoded value.
+  if (!U_IS_UNICODE_CHAR(decoded)) {
+    *code_point_out = kUnicodeReplacementCharacter;
+    return false;
+  }
+  *code_point_out = static_cast<unsigned>(decoded);
+  return true;
+}
+
+// Decodes the UTF-16 character starting at str[*begin], bounded by |length|.
+// A valid surrogate pair advances |*begin| onto its second unit, so |*begin|
+// indexes the last unit consumed. Returns true with the value in
+// |*code_point| when it is a valid Unicode character; otherwise returns false
+// and stores kUnicodeReplacementCharacter (an invalid surrogate does not
+// advance |*begin|).
+bool ReadUTFChar(const char16* str, int* begin, int length,
+                 unsigned* code_point) {
+  const char16 first = str[*begin];
+  if (!U16_IS_SURROGATE(first)) {
+    // Not a surrogate, just one 16-bit word.
+    *code_point = first;
+  } else {
+    // A pair needs a lead unit here followed, within bounds, by a trail unit.
+    const bool has_valid_pair = U16_IS_SURROGATE_LEAD(first) &&
+                                *begin + 1 < length &&
+                                U16_IS_TRAIL(str[*begin + 1]);
+    if (!has_valid_pair) {
+      *code_point = kUnicodeReplacementCharacter;
+      return false;
+    }
+    *code_point = U16_GET_SUPPLEMENTARY(first, str[*begin + 1]);
+    ++(*begin);
+  }
+
+  if (!U_IS_UNICODE_CHAR(*code_point)) {
+    // Invalid code point.
+    *code_point = kUnicodeReplacementCharacter;
+    return false;
+  }
+  return true;
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_icu.h b/url/url_canon_icu.h
new file mode 100644
index 0000000..e529fcb
--- /dev/null
+++ b/url/url_canon_icu.h
@@ -0,0 +1,63 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ICU integration functions.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__
+#define GOOGLEURL_SRC_URL_CANON_ICU_H__
+
+#include "googleurl/src/url_canon.h"
+
+typedef struct UConverter UConverter;
+
+namespace url_canon {
+
+// A CharsetConverter implementation that embedders can use to interface the
+// canonicalizer with ICU's conversion routines (wraps a borrowed UConverter).
+class ICUCharsetConverter : public CharsetConverter {
+ public:
+ // Constructs a converter using an already-existing ICU character set
+ // converter. This converter is NOT owned by this object; the lifetime must
+ // be managed by the creator such that it is alive as long as this is.
+ GURL_API ICUCharsetConverter(UConverter* converter);
+
+ GURL_API virtual ~ICUCharsetConverter();
+
+ GURL_API virtual void ConvertFromUTF16(const char16* input,
+ int input_len,
+ CanonOutput* output);  // Converted bytes are appended to |output|.
+
+ private:
+ // The ICU converter, not owned by this class (it is never freed here).
+ UConverter* converter_;
+};
+
+} // namespace url_canon
+
+#endif // GOOGLEURL_SRC_URL_CANON_ICU_H__
diff --git a/url/url_canon_internal.cc b/url/url_canon_internal.cc
new file mode 100644
index 0000000..cd791bb
--- /dev/null
+++ b/url/url_canon_internal.cc
@@ -0,0 +1,427 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdio>
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Appends |length| characters of |source| to |output|. Characters below 0x80
+// are copied through when they carry the |type| flag and percent-escaped
+// otherwise; characters at or above 0x80 are decoded with ReadUTFChar and
+// always written as percent-escaped UTF-8.
+template<typename CHAR, typename UCHAR>
+void DoAppendStringOfType(const CHAR* source, int length,
+                          SharedCharTypes type,
+                          CanonOutput* output) {
+  int pos = 0;
+  while (pos < length) {
+    if (static_cast<UCHAR>(source[pos]) < 0x80) {
+      // 7-bit character: append it directly unless |type| says to escape it.
+      const unsigned char ascii = static_cast<unsigned char>(source[pos]);
+      if (IsCharOfType(ascii, type))
+        output->push_back(ascii);
+      else
+        AppendEscapedChar(ascii, output);
+    } else {
+      // ReadUTFChar advances |pos| to the last unit it consumed and fills the
+      // code point with kUnicodeReplacementCharacter when the input is
+      // invalid, which is what we want.
+      unsigned code_point;
+      ReadUTFChar(source, &pos, length, &code_point);
+      AppendUTF8EscapedValue(code_point, output);
+    }
+    pos++;
+  }
+}
+
+// This function assumes the input values are all contained in 8-bit,
+// although it allows any type. Returns true if input is valid, false if not.
+template<typename CHAR, typename UCHAR>
+void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
+ CanonOutput* output) {
+ for (int i = begin; i < end; i++) {
+ UCHAR uch = static_cast<UCHAR>(spec[i]);
+ if (uch >= 0x80) {
+ // Handle UTF-8/16 encodings. This call will correctly handle the error
+ // case by appending the invalid character.
+ AppendUTF8EscapedChar(spec, &i, end, output);
+ } else if (uch <= ' ' || uch == 0x7f) {
+ // This function is for error handling, so we escape all control
+ // characters and spaces, but not anything else since we lack
+ // context to do something more specific.
+ AppendEscapedChar(static_cast<unsigned char>(uch), output);
+ } else {
+ output->push_back(static_cast<char>(uch));
+ }
+ }
+}
+
+// Overrides one component; see the url_canon::Replacements structure for
+// what the various combinations of source pointer and component mean.
+// A null |override_source| leaves |*dest| and |*dest_component| untouched.
+void DoOverrideComponent(const char* override_source,
+                         const url_parse::Component& override_component,
+                         const char** dest,
+                         url_parse::Component* dest_component) {
+  if (!override_source)
+    return;  // No replacement supplied for this component.
+
+  *dest_component = override_component;
+  *dest = override_source;
+}
+
+// UTF-16 counterpart of DoOverrideComponent that does not set the output
+// character pointer.
+//
+// The replacement text is converted to UTF-8 and appended to the end of
+// |utf8_buffer|, which serves as temporary storage. |*dest_component| is set
+// to the range of |utf8_buffer| that holds the converted text.
+//
+// No |dest| pointer is set here because every such pointer would point into
+// |utf8_buffer|, which may be resized while a later component is being
+// overridden. The caller should instead use the beginning of |utf8_buffer| as
+// the string pointer for all components once every override is prepared.
+//
+// Returns false only if the UTF-16 to UTF-8 conversion reported a failure.
+bool PrepareUTF16OverrideComponent(
+    const char16* override_source,
+    const url_parse::Component& override_component,
+    CanonOutput* utf8_buffer,
+    url_parse::Component* dest_component) {
+  if (!override_source)
+    return true;  // Nothing to override.
+
+  if (!override_component.is_valid()) {
+    // Non-"valid" component (means delete), so we need to preserve that.
+    *dest_component = url_parse::Component();
+    return true;
+  }
+
+  // Convert to UTF-8, recording where in the buffer the output landed.
+  dest_component->begin = utf8_buffer->length();
+  const bool converted =
+      ConvertUTF16ToUTF8(&override_source[override_component.begin],
+                         override_component.len, utf8_buffer);
+  dest_component->len = utf8_buffer->length() - dest_component->begin;
+  return converted;
+}
+
+} // namespace
+
+// SharedCharTypes flags for every 8-bit character value, indexed by that value. See the header file for this array's declaration.
+const unsigned char kSharedCharTypeTable[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f (control characters: no flags)
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f (control characters: no flags)
+ 0, // 0x20 ' ' (escape spaces in queries)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x21 !
+ 0, // 0x22 "
+ 0, // 0x23 # (invalid in query since it marks the ref)
+ CHAR_QUERY | CHAR_USERINFO, // 0x24 $
+ CHAR_QUERY | CHAR_USERINFO, // 0x25 %
+ CHAR_QUERY | CHAR_USERINFO, // 0x26 &
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x27 '
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x28 (
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x29 )
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2a *
+ CHAR_QUERY | CHAR_USERINFO, // 0x2b +
+ CHAR_QUERY | CHAR_USERINFO, // 0x2c ,
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x2d -
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x2e .
+ CHAR_QUERY, // 0x2f /
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x30 0
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x31 1
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x32 2
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x33 3
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x34 4
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x35 5
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x36 6
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT | CHAR_COMPONENT, // 0x37 7
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x38 8 (not an octal digit)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_COMPONENT, // 0x39 9 (not an octal digit)
+ CHAR_QUERY, // 0x3a :
+ CHAR_QUERY, // 0x3b ;
+ 0, // 0x3c < (Try to prevent certain types of XSS.)
+ CHAR_QUERY, // 0x3d =
+ 0, // 0x3e > (Try to prevent certain types of XSS.)
+ CHAR_QUERY, // 0x3f ?
+ CHAR_QUERY, // 0x40 @
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x41 A
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x42 B
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x43 C
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x44 D
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x45 E
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x46 F
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x47 G
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x48 H
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x49 I
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4a J
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4b K
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4c L
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4d M
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4e N
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x4f O
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x50 P
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x51 Q
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x52 R
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x53 S
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x54 T
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x55 U
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x56 V
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x57 W
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x58 X (CHAR_IPV4: 'x' marks hex in IPv4 addresses)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x59 Y
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5a Z
+ CHAR_QUERY, // 0x5b [
+ CHAR_QUERY, // 0x5c '\'
+ CHAR_QUERY, // 0x5d ]
+ CHAR_QUERY, // 0x5e ^
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x5f _
+ CHAR_QUERY, // 0x60 `
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x61 a
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x62 b
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x63 c
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x64 d
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x65 e
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_COMPONENT, // 0x66 f
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x67 g
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x68 h
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x69 i
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6a j
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6b k
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6c l
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6d m
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6e n
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x6f o
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x70 p
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x71 q
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x72 r
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x73 s
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x74 t
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x75 u
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x76 v
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x77 w
+ CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_COMPONENT, // 0x78 x (CHAR_IPV4: 'x' marks hex in IPv4 addresses)
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x79 y
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7a z
+ CHAR_QUERY, // 0x7b {
+ CHAR_QUERY, // 0x7c |
+ CHAR_QUERY, // 0x7d }
+ CHAR_QUERY | CHAR_USERINFO | CHAR_COMPONENT, // 0x7e ~
+ 0, // 0x7f DEL
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f (non-ASCII: no flags from here on)
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 - 0xbf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
+};
+
+const char kHexCharLookup[0x10] = {  // Index: nibble value 0x0 - 0xf; entry: its ASCII hex digit (uppercase).
+ '0', '1', '2', '3', '4', '5', '6', '7',  // 0x0 - 0x7
+ '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',  // 0x8 - 0xf
+};
+
+const char kCharToHexLookup[8] = {  // One entry per 0x20-wide block of the 8-bit range; holds the amount to subtract from a hex digit in that block.
+ 0, // 0x00 - 0x1f (no hex digits in this block)
+ '0', // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39
+ 'A' - 10, // 0x40 - 0x5f: letters A - F are 0x41 - 0x46
+ 'a' - 10, // 0x60 - 0x7f: letters a - f are 0x61 - 0x66
+ 0, // 0x80 - 0x9F (no hex digits in this block)
+ 0, // 0xA0 - 0xBF (no hex digits in this block)
+ 0, // 0xC0 - 0xDF (no hex digits in this block)
+ 0, // 0xE0 - 0xFF (no hex digits in this block)
+};
+
+const char16 kUnicodeReplacementCharacter = 0xfffd;  // U+FFFD, substituted for invalid or undecodable input.
+
+// 8-bit overload: forwards to the shared template, reading each input
+// character as an unsigned char.
+void AppendStringOfType(const char* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  typedef unsigned char UnsignedInput;
+  DoAppendStringOfType<char, UnsignedInput>(source, length, type, output);
+}
+
+// 16-bit overload: forwards to the shared template. char16 serves as both the
+// input type and the unsigned comparison type.
+void AppendStringOfType(const char16* source, int length,
+                        SharedCharTypes type,
+                        CanonOutput* output) {
+  typedef char16 UnsignedInput;
+  DoAppendStringOfType<char16, UnsignedInput>(source, length, type, output);
+}
+
+// 8-bit overload: forwards to the shared template, reading each input
+// character as an unsigned char.
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+                               CanonOutput* output) {
+  typedef unsigned char UnsignedInput;
+  DoAppendInvalidNarrowString<char, UnsignedInput>(spec, begin, end, output);
+}
+
+// 16-bit overload: forwards to the shared template. char16 serves as both the
+// input type and the unsigned comparison type.
+void AppendInvalidNarrowString(const char16* spec, int begin, int end,
+                               CanonOutput* output) {
+  typedef char16 UnsignedInput;
+  DoAppendInvalidNarrowString<char16, UnsignedInput>(spec, begin, end, output);
+}
+
+// Appends the UTF-8 form of the UTF-16 |input| to |output|. Every code point
+// produced by ReadUTFChar is appended even after a failure; the return value
+// is false if any read reported invalid input.
+bool ConvertUTF16ToUTF8(const char16* input, int input_len,
+                        CanonOutput* output) {
+  bool all_valid = true;
+  int pos = 0;
+  while (pos < input_len) {
+    unsigned code_point;
+    // ReadUTFChar leaves |pos| on the last unit it consumed.
+    if (!ReadUTFChar(input, &pos, input_len, &code_point))
+      all_valid = false;
+    AppendUTF8Value(code_point, output);
+    ++pos;
+  }
+  return all_valid;
+}
+
+// Appends the UTF-16 form of the UTF-8 |input| to |output|. Every code point
+// produced by ReadUTFChar is appended even after a failure; the return value
+// is false if any read reported invalid input.
+bool ConvertUTF8ToUTF16(const char* input, int input_len,
+                        CanonOutputT<char16>* output) {
+  bool all_valid = true;
+  int pos = 0;
+  while (pos < input_len) {
+    unsigned code_point;
+    // ReadUTFChar leaves |pos| on the last byte it consumed.
+    if (!ReadUTFChar(input, &pos, input_len, &code_point))
+      all_valid = false;
+    AppendUTF16Value(code_point, output);
+    ++pos;
+  }
+  return all_valid;
+}
+
+// Applies the 8-bit replacements in |repl| to |*source| and |*parsed|. For
+// each component whose replacement source pointer is non-null, the source
+// pointer and the Component are overwritten; other components are left as
+// they were. |base| is not read by this function.
+void SetupOverrideComponents(const char* base,
+                             const Replacements<char>& repl,
+                             URLComponentSource<char>* source,
+                             url_parse::Parsed* parsed) {
+  // The strings and component ranges of the things we are replacing.
+  const URLComponentSource<char>& new_sources = repl.sources();
+  const url_parse::Parsed& new_components = repl.components();
+
+  DoOverrideComponent(new_sources.scheme, new_components.scheme,
+                      &source->scheme, &parsed->scheme);
+  DoOverrideComponent(new_sources.username, new_components.username,
+                      &source->username, &parsed->username);
+  DoOverrideComponent(new_sources.password, new_components.password,
+                      &source->password, &parsed->password);
+
+  // Our host should be empty if not present, so a length of -1 is turned into
+  // 0 whether or not the host was just overridden.
+  DoOverrideComponent(new_sources.host, new_components.host,
+                      &source->host, &parsed->host);
+  if (parsed->host.len == -1) {
+    parsed->host.len = 0;
+  }
+
+  DoOverrideComponent(new_sources.port, new_components.port,
+                      &source->port, &parsed->port);
+  DoOverrideComponent(new_sources.path, new_components.path,
+                      &source->path, &parsed->path);
+  DoOverrideComponent(new_sources.query, new_components.query,
+                      &source->query, &parsed->query);
+  DoOverrideComponent(new_sources.ref, new_components.ref,
+                      &source->ref, &parsed->ref);
+}
+
+// UTF-16 counterpart of SetupOverrideComponents. Each supplied replacement is
+// converted to UTF-8 and appended to |utf8_buffer|; |*parsed| receives the
+// range of each converted component and |*source| is pointed at the start of
+// |utf8_buffer| for every component that had a replacement. |base| is not
+// read by this function. Returns false if any conversion reported a failure
+// (all components are still processed).
+bool SetupUTF16OverrideComponents(const char* base,
+                                  const Replacements<char16>& repl,
+                                  CanonOutput* utf8_buffer,
+                                  URLComponentSource<char>* source,
+                                  url_parse::Parsed* parsed) {
+  // The strings and component ranges of the things we are replacing.
+  const URLComponentSource<char16>& new_sources = repl.sources();
+  const url_parse::Parsed& new_components = repl.components();
+
+  // Every component is prepared even after an earlier failure; the helper
+  // call is evaluated first so it is never short-circuited away.
+  bool success = true;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.scheme, new_components.scheme,
+      utf8_buffer, &parsed->scheme) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.username, new_components.username,
+      utf8_buffer, &parsed->username) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.password, new_components.password,
+      utf8_buffer, &parsed->password) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.host, new_components.host,
+      utf8_buffer, &parsed->host) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.port, new_components.port,
+      utf8_buffer, &parsed->port) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.path, new_components.path,
+      utf8_buffer, &parsed->path) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.query, new_components.query,
+      utf8_buffer, &parsed->query) && success;
+  success = PrepareUTF16OverrideComponent(
+      new_sources.ref, new_components.ref,
+      utf8_buffer, &parsed->ref) && success;
+
+  // PrepareUTF16OverrideComponent will not have set the data pointer since
+  // the buffer could be resized, invalidating the pointers. Now that the
+  // buffer is finalized, point each affected component at it.
+  if (new_sources.scheme)
+    source->scheme = utf8_buffer->data();
+  if (new_sources.username)
+    source->username = utf8_buffer->data();
+  if (new_sources.password)
+    source->password = utf8_buffer->data();
+  if (new_sources.host)
+    source->host = utf8_buffer->data();
+  if (new_sources.port)
+    source->port = utf8_buffer->data();
+  if (new_sources.path)
+    source->path = utf8_buffer->data();
+  if (new_sources.query)
+    source->query = utf8_buffer->data();
+  if (new_sources.ref)
+    source->ref = utf8_buffer->data();
+
+  return success;
+}
+
+#ifndef WIN32
+
+// Replacement for the Windows _itoa_s on other platforms. Writes |value| as
+// a NUL-terminated string into |buffer|, which holds |size_in_chars| chars.
+// Only radix 10 ("%d") and radix 16 (lowercase "%x") are supported.
+// Returns 0 on success, or EINVAL for an unsupported radix, a formatting
+// error, or a buffer too small for the digits plus the terminating NUL.
+int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
+  int written;
+  if (radix == 10) {
+    written = snprintf(buffer, size_in_chars, "%d", value);
+  } else if (radix == 16) {
+    // "%x" consumes an unsigned int, so convert explicitly rather than
+    // passing a signed value through the varargs.
+    written = snprintf(buffer, size_in_chars, "%x",
+                       static_cast<unsigned int>(value));
+  } else {
+    return EINVAL;
+  }
+
+  // snprintf returns the length it wanted to write, so a value at or above
+  // the buffer size means the output was truncated.
+  if (written < 0 || static_cast<size_t>(written) >= size_in_chars)
+    return EINVAL;
+  return 0;
+}
+
+// Replacement for the Windows _itow_s on other platforms. Writes |value| in
+// decimal as a NUL-terminated char16 string into |buffer|, which holds
+// |size_in_chars| elements. Only radix 10 is supported.
+// Returns 0 on success, or EINVAL for an unsupported radix, a formatting
+// error, or a buffer too small for the digits plus the terminating NUL.
+int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) {
+  if (radix != 10)
+    return EINVAL;
+
+  // Format into a narrow scratch buffer first. A 32-bit integer needs at
+  // most 11 characters ("-2147483648") plus the terminating null.
+  char temp[13];
+  const int written = snprintf(temp, sizeof(temp), "%d", value);
+  if (written < 0)
+    return EINVAL;  // Formatting error.
+
+  const size_t digit_count = static_cast<size_t>(written);
+  if (digit_count >= sizeof(temp)) {
+    // The scratch buffer itself was truncated (only possible when int is
+    // wider than 32 bits); copying |written| chars would read past |temp|.
+    return EINVAL;
+  }
+  if (digit_count >= size_in_chars) {
+    // The caller's buffer cannot hold the digits and the terminator.
+    return EINVAL;
+  }
+
+  // Widen each ASCII character to char16.
+  for (int i = 0; i < written; ++i) {
+    buffer[i] = static_cast<char16>(temp[i]);
+  }
+  buffer[written] = '\0';
+  return 0;
+}
+
+#endif // !WIN32
+
+} // namespace url_canon
diff --git a/url/url_canon_internal.h b/url/url_canon_internal.h
new file mode 100644
index 0000000..9165398
--- /dev/null
+++ b/url/url_canon_internal.h
@@ -0,0 +1,461 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is intended to be included in another C++ file where the character
+// types are defined. This allows us to write mostly generic code, but not have
+// template bloat because everything is inlined when anybody calls any of our
+// functions.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
+#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
+
+#include <stdlib.h>
+
+#include "base/logging.h"
+#include "googleurl/src/url_canon.h"
+
+namespace url_canon {
+
+// Character type handling -----------------------------------------------------
+
+// Bit flags classifying characters. Each 8-bit character's entry in
+// kSharedCharTypeTable is the OR of the flags that apply to it.
+enum SharedCharTypes {
+  // Does not require escaping in queries. Characters without this flag will
+  // be escaped; see url_canon_query.cc
+  CHAR_QUERY = 1 << 0,
+
+  // Valid in the username/password field.
+  CHAR_USERINFO = 1 << 1,
+
+  // Valid in an IPv4 address (digits plus dot and 'x' for hex).
+  CHAR_IPV4 = 1 << 2,
+
+  // Valid in an ASCII representation of a hex digit (as in %-escaped).
+  CHAR_HEX = 1 << 3,
+
+  // Valid in an ASCII representation of a decimal digit.
+  CHAR_DEC = 1 << 4,
+
+  // Valid in an ASCII representation of an octal digit.
+  CHAR_OCT = 1 << 5,
+
+  // Does not require escaping in encodeURIComponent. Characters without this
+  // flag will be escaped; see url_util.cc.
+  CHAR_COMPONENT = 1 << 6,
+};
+
+// This table contains the SharedCharTypes flags, OR-ed together, for each
+// 8-bit character. Some canonicalization functions have their own specialized
+// lookup table. For those with simple requirements, we have collected the
+// flags in one place so there are fewer lookup tables to load into the cache.
+//
+// Using an unsigned char type has a small but measurable performance benefit
+// over using a 32-bit number.
+extern const unsigned char kSharedCharTypeTable[0x100];  // Indexed by the character's value, 0x00 - 0xff.
+
+// More readable wrappers around the character type lookup table.
+// Returns true when |c|'s table entry has any bit of |type| set.
+inline bool IsCharOfType(unsigned char c, SharedCharTypes type) {
+  return (kSharedCharTypeTable[c] & type) != 0;
+}
+// True if |c| carries the CHAR_QUERY flag.
+inline bool IsQueryChar(unsigned char c) {
+  return (kSharedCharTypeTable[c] & CHAR_QUERY) != 0;
+}
+// True if |c| carries the CHAR_IPV4 flag.
+inline bool IsIPv4Char(unsigned char c) {
+  return (kSharedCharTypeTable[c] & CHAR_IPV4) != 0;
+}
+// True if |c| carries the CHAR_HEX flag.
+inline bool IsHexChar(unsigned char c) {
+  return (kSharedCharTypeTable[c] & CHAR_HEX) != 0;
+}
+// True if |c| carries the CHAR_COMPONENT flag.
+inline bool IsComponentChar(unsigned char c) {
+  return (kSharedCharTypeTable[c] & CHAR_COMPONENT) != 0;
+}
+
+// Appends |length| characters of |source| to the output, percent-escaping
+// characters that do not have the given |type| flag in SharedCharTypes.
+void AppendStringOfType(const char* source, int length,  // 8-bit input.
+ SharedCharTypes type,
+ CanonOutput* output);
+void AppendStringOfType(const char16* source, int length,  // 16-bit input.
+ SharedCharTypes type,
+ CanonOutput* output);
+
+// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit
+// that will be used to represent it (see AppendEscapedChar).
+GURL_API extern const char kHexCharLookup[0x10];
+
+// This lookup table allows fast conversion between ASCII hex letters and their
+// corresponding numerical value. The 8-bit range is divided up into 8
+// regions of 0x20 characters each. Each of the three character types (numbers,
+// uppercase, lowercase) falls into different regions of this range. The table
+// contains the amount to subtract from characters in that range to get at
+// the corresponding numerical value.
+//
+// See HexCharToValue for the lookup.
+extern const char kCharToHexLookup[8];
+
+// Converts an ASCII hex digit to its numerical value. Assumes the input is a
+// valid hex digit! Call IsHexChar before using this.
+inline unsigned char HexCharToValue(unsigned char c) {
+  // kCharToHexLookup has one entry per 0x20-wide block of the 8-bit range,
+  // holding the amount to subtract from a digit in that block.
+  const char bias = kCharToHexLookup[c >> 5];
+  return c - bias;
+}
+
+// Reports whether the character at spec[offset] is a dot or a dot equivalent
+// and, if so, how many characters it occupies: 1 for a literal '.', 3 for the
+// escaped forms "%2e" / "%2E" (which must fit before |end|). Returns 0 when
+// the character is not a dot.
+template<typename CHAR>
+inline int IsDot(const CHAR* spec, int offset, int end) {
+  const CHAR first = spec[offset];
+  if (first == '.')
+    return 1;
+
+  // Anything else can only be a dot if a full three-character escape fits.
+  if (first != '%' || offset + 3 > end)
+    return 0;
+  if (spec[offset + 1] != '2')
+    return 0;
+
+  const CHAR last = spec[offset + 2];
+  return (last == 'e' || last == 'E') ? 3 : 0;  // Found "%2e"
+}
+
+// Returns the canonicalized version of the input character according to scheme
+// rules. This is implemented alongside the scheme canonicalizer, and is
+// required by relative URL resolving to test for scheme equality.
+//
+// Returns 0 if the input character is not a valid scheme character.
+char CanonicalSchemeChar(char16 ch);
+
+// Writes a single character, percent-escaped, to the output as '%' followed
+// by two hex digits. This always escapes: it does no checking that the
+// character requires escaping. Only the low 8 bits of |ch| are written, so
+// the code works for both 8-bit and 16-bit input types.
+template<typename UINCHAR, typename OUTCHAR>
+inline void AppendEscapedChar(UINCHAR ch,
+                              CanonOutputT<OUTCHAR>* output) {
+  const char high_digit = kHexCharLookup[(ch >> 4) & 0xf];
+  const char low_digit = kHexCharLookup[ch & 0xf];
+
+  output->push_back('%');
+  output->push_back(high_digit);
+  output->push_back(low_digit);
+}
+
+// The character we'll substitute for undecodable or invalid characters (see ReadUTFChar).
+extern const char16 kUnicodeReplacementCharacter;
+
+// UTF-8 functions ------------------------------------------------------------
+
+// Reads one character in UTF-8 starting at |*begin| in |str| and places
+// the decoded value into |*code_point_out|. If the character is valid, we
+// will return true. If invalid, we'll return false and put the
+// kUnicodeReplacementCharacter into |*code_point_out|.
+//
+// |*begin| will be updated to point to the last character consumed so it
+// can be incremented in a loop and will be ready for the next character.
+// (for a single-byte ASCII character, it will not be changed).
+//
+// Implementation is in url_canon_icu.cc.
+GURL_API bool ReadUTFChar(const char* str, int* begin, int length,
+ unsigned* code_point_out);
+
+// Generic to-UTF-8 converter. Calls |Appender| once for every byte of the
+// UTF-8 encoding of |char_value|, passing along |output|. Wrappers are
+// provided below for escaped and non-escaped versions of this.
+//
+// The char_value must have already been checked that it's a valid Unicode
+// character; values above 0x10FFFF hit NOTREACHED and append nothing.
+template<class Output, void Appender(unsigned char, Output*)>
+inline void DoAppendUTF8(unsigned char_value, Output* output) {
+  if (char_value <= 0x7f) {
+    // Single byte: 0xxxxxxx
+    Appender(static_cast<unsigned char>(char_value), output);
+    return;
+  }
+
+  // Pick the lead-byte marker and the number of continuation bytes.
+  unsigned lead_marker;
+  int trail_count;
+  if (char_value <= 0x7ff) {
+    lead_marker = 0xC0;  // 110xxxxx 10xxxxxx
+    trail_count = 1;
+  } else if (char_value <= 0xffff) {
+    lead_marker = 0xe0;  // 1110xxxx 10xxxxxx 10xxxxxx
+    trail_count = 2;
+  } else if (char_value <= 0x10FFFF) {  // Max unicode code point.
+    lead_marker = 0xf0;  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    trail_count = 3;
+  } else {
+    // Invalid UTF-8 character (>20 bits).
+    NOTREACHED();
+    return;
+  }
+
+  // The lead byte carries the bits above the continuation bytes.
+  Appender(static_cast<unsigned char>(
+               lead_marker | (char_value >> (6 * trail_count))),
+           output);
+
+  // Each continuation byte carries the next six bits, most significant first.
+  for (int shift = 6 * (trail_count - 1); shift >= 0; shift -= 6) {
+    Appender(static_cast<unsigned char>(0x80 | ((char_value >> shift) & 0x3f)),
+             output);
+  }
+}
+
+// Appender used by AppendUTF8Value below. The parameter is unsigned so there
+// are no funny sign problems with the input; it is converted to a regular
+// char here for appending.
+inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) {
+  const char narrow = static_cast<char>(ch);
+  output->push_back(narrow);
+}
+
+// Writes the given code point to the output as unescaped UTF-8 bytes. This
+// does NO checking of the validity of the unicode characters; the caller
+// should ensure that the value it is appending is valid to append.
+inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) {
+ DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output);  // One AppendCharToOutput call per UTF-8 byte.
+}
+
+// Writes the given code point to the output as UTF-8, percent-escaping ALL
+// bytes (even when they are ASCII). This does NO checking of the
+// validity of the unicode characters; the caller should ensure that the value
+// it is appending is valid to append.
+inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) {
+ DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output);  // One AppendEscapedChar call per UTF-8 byte.
+}
+
+// UTF-16 functions -----------------------------------------------------------
+
+// Reads one character in UTF-16 starting at |*begin| in |str| and places
+// the decoded value into |*code_point|. If the character is valid, we will
+// return true. If invalid, we'll return false and put the
+// kUnicodeReplacementCharacter into |*code_point|.
+//
+// |*begin| will be updated to point to the last 16-bit word consumed so it
+// can be incremented in a loop and will be ready for the next character
+// (for a single-16-bit-word character, it will not be changed).
+//
+// Implementation is in url_canon_icu.cc.
+GURL_API bool ReadUTFChar(const char16* str, int* begin, int length,
+ unsigned* code_point);
+
+// Appends |code_point| to |output| as UTF-16: one unit for values up to
+// 0xffff, a surrogate pair otherwise. Equivalent to U16_APPEND_UNSAFE in ICU
+// but uses our output method.
+inline void AppendUTF16Value(unsigned code_point,
+                             CanonOutputT<char16>* output) {
+  if (code_point <= 0xffff) {
+    output->push_back(static_cast<char16>(code_point));
+    return;
+  }
+
+  // Split into a lead and a trail surrogate.
+  const char16 lead = static_cast<char16>((code_point >> 10) + 0xd7c0);
+  const char16 trail = static_cast<char16>((code_point & 0x3ff) | 0xdc00);
+  output->push_back(lead);
+  output->push_back(trail);
+}
+
+// Escaping functions ---------------------------------------------------------
+
+// Reads one character from UTF-16 input and writes it to the output as
+// percent-escaped UTF-8. Call this function only when the input is wide.
+// Returns true on success. Failure means there was some problem with the
+// encoding; |*begin| is still updated and a placeholder character is added
+// to the output so processing can continue.
+//
+// The character read starts at str[*begin], with |length| being the size of
+// the |str| buffer. |*begin| will be updated to point to the last unit
+// consumed (more than one may be consumed for UTF-16) so that if called in a
+// loop, incrementing the index will move to the next character.
+//
+// Every single output character will be escaped. This means that if you
+// give it an ASCII character as input, it will be escaped. Some code uses
+// this when it knows that a character is invalid according to its rules
+// for validity. If you don't want escaping for ASCII characters, you will
+// have to filter them out prior to calling this function.
+//
+// Assumes that str[*begin] is within range in the array, but does not assume
+// that any following characters are.
+inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length,
+                                  CanonOutput* output) {
+  // ReadUTFChar handles invalid characters for us by producing the
+  // kUnicodeReplacementCharacter, so there is no special checking after
+  // failure; the failure is just passed through to the caller.
+  unsigned code_point;
+  const bool valid = ReadUTFChar(str, begin, length, &code_point);
+  AppendUTF8EscapedValue(code_point, output);
+  return valid;
+}
+
+// UTF-8 input flavor of AppendUTF8EscapedChar: reads one character starting
+// at str[*begin] and writes it to the output as percent-escaped UTF-8.
+// |*begin| is updated by ReadUTFChar; returns false if the input was invalid.
+inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length,
+                                  CanonOutput* output) {
+  // ReadUTFChar handles invalid characters for us by producing the
+  // kUnicodeReplacementCharacter, so there is no special checking after
+  // failure; the failure is just passed through to the caller.
+  unsigned code_point;
+  const bool valid = ReadUTFChar(str, begin, length, &code_point);
+  AppendUTF8EscapedValue(code_point, output);
+  return valid;
+}
+
+// Helpers for DecodeEscaped: report whether a character fits in 8 bits.
+inline bool Is8BitChar(char c) {
+  return true;  // this case is specialized to avoid a warning
+}
+inline bool Is8BitChar(char16 c) {
+  return c <= 255;
+}
+
+// Given a '%' character at |*begin| in the string |spec|, this will decode
+// the escaped value and put it into |*unescaped_value| on success (returns
+// true). On failure, this will return false, and will not write into
+// |*unescaped_value|.
+//
+// |*begin| will be updated to point to the last character of the escape
+// sequence so that when called with the index of a for loop, the next time
+// through it will point to the next character to be considered. On failure,
+// |*begin| will be unchanged.
+template<typename CHAR>
+inline bool DecodeEscaped(const CHAR* spec, int* begin, int end,
+                          unsigned char* unescaped_value) {
+  const int percent = *begin;
+
+  // Invalid escape sequence if there's not enough room for two digits...
+  if (percent + 3 > end)
+    return false;
+  // ...or if either digit does not fit in 8 bits.
+  if (!Is8BitChar(spec[percent + 1]) || !Is8BitChar(spec[percent + 2]))
+    return false;
+
+  const unsigned char high = static_cast<unsigned char>(spec[percent + 1]);
+  const unsigned char low = static_cast<unsigned char>(spec[percent + 2]);
+  if (!(IsHexChar(high) && IsHexChar(low))) {
+    // Invalid hex digits, fail.
+    return false;
+  }
+
+  // Valid escape sequence.
+  *unescaped_value = (HexCharToValue(high) << 4) + HexCharToValue(low);
+  *begin += 2;
+  return true;
+}
+
+// Appends the given substring to the output, escaping "some" characters that
+// it feels may not be safe. It assumes the input values are all contained in
+// 8-bit although it allows any type.
+//
+// This is used in error cases to append invalid output so that it looks
+// approximately correct. Non-error cases should not call this function since
+// the escaping rules are not guaranteed!
+void AppendInvalidNarrowString(const char* spec, int begin, int end,
+ CanonOutput* output);
+void AppendInvalidNarrowString(const char16* spec, int begin, int end,
+ CanonOutput* output);
+
+// Misc canonicalization helpers ----------------------------------------------
+
+// Converts between UTF-8 and UTF-16, returning true on successful conversion.
+// The output will be appended to the given canonicalizer output (so make sure
+// it's empty if you want to replace).
+//
+// On invalid input, this will still write as much output as possible,
+// replacing the invalid characters with the "invalid character". It will
+// return false in the failure case, and the caller should not continue as
+// normal.
+GURL_API bool ConvertUTF16ToUTF8(const char16* input, int input_len,
+ CanonOutput* output);
+GURL_API bool ConvertUTF8ToUTF16(const char* input, int input_len,
+ CanonOutputT<char16>* output);
+
+// Converts from UTF-16 to 8-bit using the character set converter. If the
+// converter is NULL, this will use UTF-8.
+void ConvertUTF16ToQueryEncoding(const char16* input,
+ const url_parse::Component& query,
+ CharsetConverter* converter,
+ CanonOutput* output);
+
+// Applies the replacements to the given component source. The component source
+// should be pre-initialized to the "old" base. That is, all pointers will
+// point to the spec of the old URL, and all of the Parsed components will
+// be indices into that string.
+//
+// The pointers and components in the |source| for all non-NULL strings in the
+// |repl| (replacements) will be updated to reference those strings.
+// Canonicalizing with the new |source| and |parsed| can then combine URL
+// components from many different strings.
+void SetupOverrideComponents(const char* base,
+ const Replacements<char>& repl,
+ URLComponentSource<char>* source,
+ url_parse::Parsed* parsed);
+
+// Like the above 8-bit version, except that it additionally converts the
+// UTF-16 input to UTF-8 before doing the overrides.
+//
+// The given utf8_buffer is used to store the converted components. They will
+// be appended one after another, with the parsed structure identifying the
+// appropriate substrings. This buffer is a parameter because the source has
+// no storage, so the buffer must have the same lifetime as the source
+// parameter owned by the caller.
+//
+// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of
+// |source| will point into this buffer, which could be invalidated if
+// additional data is added and the CanonOutput resizes its buffer.
+//
+// Returns true on success. False means that the input was not valid UTF-16,
+// although we will have still done the override with "invalid characters" in
+// place of errors.
+bool SetupUTF16OverrideComponents(const char* base,
+ const Replacements<char16>& repl,
+ CanonOutput* utf8_buffer,
+ URLComponentSource<char>* source,
+ url_parse::Parsed* parsed);
+
+// Implemented in url_canon_path.cc, these are required by the relative URL
+// resolver as well, so we declare them here.
+bool CanonicalizePartialPath(const char* spec,
+ const url_parse::Component& path,
+ int path_begin_in_output,
+ CanonOutput* output);
+bool CanonicalizePartialPath(const char16* spec,
+ const url_parse::Component& path,
+ int path_begin_in_output,
+ CanonOutput* output);
+
+#ifndef WIN32
+
+// Implementations of Windows' int-to-string conversions
+GURL_API int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
+GURL_API int _itow_s(int value, char16* buffer, size_t size_in_chars,
+ int radix);
+
+// Secure template overloads for these functions
+template<size_t N>
+inline int _itoa_s(int value, char (&buffer)[N], int radix) {
+ return _itoa_s(value, buffer, N, radix);
+}
+
+template<size_t N>
+inline int _itow_s(int value, char16 (&buffer)[N], int radix) {
+ return _itow_s(value, buffer, N, radix);
+}
+
+// _strtoui64 and strtoull behave the same
+inline unsigned long long _strtoui64(const char* nptr,
+ char** endptr, int base) {
+ return strtoull(nptr, endptr, base);
+}
+
+#endif // WIN32
+
+} // namespace url_canon
+
+#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
diff --git a/url/url_canon_internal_file.h b/url/url_canon_internal_file.h
new file mode 100644
index 0000000..63a9c5b
--- /dev/null
+++ b/url/url_canon_internal_file.h
@@ -0,0 +1,157 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// As with url_canon_internal.h, this file is intended to be included in
+// another C++ file where the template types are defined. This allows the
+// programmer to use these functions for their own string
+// types, without bloating the code by having inline templates used in
+// every call site.
+//
+// *** This file must be included after url_canon_internal as we depend on some
+// functions in it. ***
+
+#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
+#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
+
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse_internal.h"
+
+using namespace url_canon;
+
+// Given a pointer into the spec, this copies and canonicalizes the drive
+// letter and colon to the output, if one is found. If there is not a drive
+// spec, it won't do anything. The index of the next character in the input
+// spec is returned (after the colon when a drive spec is found, the begin
+// offset if one is not).
+template<typename CHAR>
+static int FileDoDriveSpec(const CHAR* spec, int begin, int end,
+ CanonOutput* output) {
+ // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
+ // (with backslashes instead of slashes as well).
+ int num_slashes = CountConsecutiveSlashes(spec, begin, end);
+ int after_slashes = begin + num_slashes;
+
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
+ return begin; // Haven't consumed any characters
+
+ // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
+ // and that it is followed by a colon/pipe.
+
+ // Normalize Windows drive letters to uppercase
+ if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
+ output->push_back(spec[after_slashes] - 'a' + 'A');
+ else
+ output->push_back(static_cast<char>(spec[after_slashes]));
+
+ // Normalize the character following it to a colon rather than pipe.
+ output->push_back(':');
+ output->push_back('/');
+ return after_slashes + 2;
+}
+
+// FileDoDriveSpec will have already added the first slash, so we need to
+// write everything following the slashes using the path canonicalizer.
+template<typename CHAR, typename UCHAR>
+static void FileDoPath(const CHAR* spec, int begin, int end,
+ CanonOutput* output) {
+ // Normalize the number of slashes after the drive letter. The path
+ // canonicalizer expects the input to begin in a slash already so
+ // doesn't check. We want to handle the no-slashes case as well.
+ int num_slashes = CountConsecutiveSlashes(spec, begin, end);
+ int after_slashes = begin + num_slashes;
+
+ // Now use the regular path canonicalizer to canonicalize the rest of the
+ // path. We supply it with the path following the slashes. It won't prepend
+ // a slash because it assumes any nonempty path already starts with one.
+ // We explicitly filter out calls with no path here to prevent that case.
+ ParsedURL::Component sub_path(after_slashes, end - after_slashes);
+ if (sub_path.len > 0) {
+ // Give it a fake output component to write into. DoCanonicalizeFile will
+ // compute the full path component.
+ ParsedURL::Component fake_output_path;
+ URLCanonInternal<CHAR, UCHAR>::DoPath(
+ spec, sub_path, output, &fake_output_path);
+ }
+}
+
+template<typename CHAR, typename UCHAR>
+static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
+ const ParsedURL& parsed,
+ CanonOutput* output,
+ ParsedURL* new_parsed) {
+ // Things we don't set in file: URLs.
+ new_parsed->username = ParsedURL::Component(0, -1);
+ new_parsed->password = ParsedURL::Component(0, -1);
+ new_parsed->port = ParsedURL::Component(0, -1);
+
+ // Scheme (known, so we don't bother running it through the more
+ // complicated scheme canonicalizer).
+ new_parsed->scheme.begin = output->length();
+ output->push_back('f');
+ output->push_back('i');
+ output->push_back('l');
+ output->push_back('e');
+ new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;
+ output->push_back(':');
+
+ // Write the separator for the host.
+ output->push_back('/');
+ output->push_back('/');
+
+ // Append the host. For many file URLs, this will be empty. For UNC, this
+ // will be present.
+ // TODO(brettw) This doesn't do any checking for host name validity. We
+ // should probably handle validity checking of UNC hosts differently than
+ // for regular IP hosts.
+ bool success = URLCanonInternal<CHAR, UCHAR>::DoHost(
+ source.host, parsed.host, output, &new_parsed->host);
+
+ // Write a separator for the start of the path. We'll ignore any slashes
+ // already at the beginning of the path.
+ new_parsed->path.begin = output->length();
+ output->push_back('/');
+
+ // Copies and normalizes the "c:" at the beginning, if present.
+ int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
+ parsed.path.end(), output);
+
+ // Copies the rest of the path
+ FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
+ new_parsed->path.len = output->length() - new_parsed->path.begin;
+
+ // Things following the path we can use the standard canonicalizers for.
+ success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
+ source.query, parsed.query, output, &new_parsed->query);
+ success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
+ source.ref, parsed.ref, output, &new_parsed->ref);
+
+ return success;
+}
+
+#endif // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
diff --git a/url/url_canon_ip.cc b/url/url_canon_ip.cc
new file mode 100644
index 0000000..1421e79
--- /dev/null
+++ b/url/url_canon_ip.cc
@@ -0,0 +1,730 @@
+// Copyright 2009, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "googleurl/src/url_canon_ip.h"
+
+#include <stdlib.h>
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Converts one of the character types that represent a numerical base to the
+// corresponding base.
+int BaseForType(SharedCharTypes type) {
+ switch (type) {
+ case CHAR_HEX:
+ return 16;
+ case CHAR_DEC:
+ return 10;
+ case CHAR_OCT:
+ return 8;
+ default:
+ return 0;
+ }
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoFindIPv4Components(const CHAR* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]) {
+ if (!host.is_nonempty())
+ return false;
+
+ int cur_component = 0; // Index of the component we're working on.
+ int cur_component_begin = host.begin; // Start of the current component.
+ int end = host.end();
+ for (int i = host.begin; /* nothing */; i++) {
+ if (i >= end || spec[i] == '.') {
+ // Found the end of the current component.
+ int component_len = i - cur_component_begin;
+ components[cur_component] =
+ url_parse::Component(cur_component_begin, component_len);
+
+ // The next component starts after the dot.
+ cur_component_begin = i + 1;
+ cur_component++;
+
+ // Don't allow empty components (two dots in a row), except we may
+ // allow an empty component at the end (this would indicate that the
+ // input ends in a dot). We also want to error if the component is
+ // empty and it's the only component (cur_component == 1).
+ if (component_len == 0 && (i < end || cur_component == 1))
+ return false;
+
+ if (i >= end)
+ break; // End of the input.
+
+ if (cur_component == 4) {
+ // Anything else after the 4th component is an error unless it is a
+ // dot that would otherwise be treated as the end of input.
+ if (spec[i] == '.' && i + 1 == end)
+ break;
+ return false;
+ }
+ } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
+ !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+ // Invalid character for an IPv4 address.
+ return false;
+ }
+ }
+
+ // Fill in any unused components.
+ while (cur_component < 4)
+ components[cur_component++] = url_parse::Component();
+ return true;
+}
+
+// Converts an IPv4 component to a 32-bit number, while checking for overflow.
+//
+// Possible return values:
+// - IPV4 - The number was valid, and did not overflow.
+// - BROKEN - The input was numeric, but too large for a 32-bit field.
+// - NEUTRAL - Input was not numeric.
+//
+// The input is assumed to be ASCII. FindIPv4Components should have stripped
+// out any input that is greater than 7 bits. The components are assumed
+// to be non-empty.
+template<typename CHAR>
+CanonHostInfo::Family IPv4ComponentToNumber(
+ const CHAR* spec,
+ const url_parse::Component& component,
+ uint32* number) {
+ // Figure out the base
+ SharedCharTypes base;
+ int base_prefix_len = 0; // Size of the prefix for this base.
+ if (spec[component.begin] == '0') {
+ // Either hex or octal, or a standalone zero.
+ if (component.len == 1) {
+ base = CHAR_DEC;
+ } else if (spec[component.begin + 1] == 'X' ||
+ spec[component.begin + 1] == 'x') {
+ base = CHAR_HEX;
+ base_prefix_len = 2;
+ } else {
+ base = CHAR_OCT;
+ base_prefix_len = 1;
+ }
+ } else {
+ base = CHAR_DEC;
+ }
+
+ // Extend the prefix to consume all leading zeros.
+ while (base_prefix_len < component.len &&
+ spec[component.begin + base_prefix_len] == '0')
+ base_prefix_len++;
+
+ // Put the component, minus any base prefix, into a NULL-terminated buffer so
+ // we can call the standard library. Because leading zeros have already been
+ // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
+ // overflow check.
+ const int kMaxComponentLen = 16;
+ char buf[kMaxComponentLen + 1]; // digits + '\0'
+ int dest_i = 0;
+ for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
+ // We know the input is 7-bit, so convert to narrow (if this is the wide
+ // version of the template) by casting.
+ char input = static_cast<char>(spec[i]);
+
+ // Validate that this character is OK for the given base.
+ if (!IsCharOfType(input, base))
+ return CanonHostInfo::NEUTRAL;
+
+ // Fill the buffer, if there's space remaining. This check allows us to
+ // verify that all characters are numeric, even those that don't fit.
+ if (dest_i < kMaxComponentLen)
+ buf[dest_i++] = input;
+ }
+
+ buf[dest_i] = '\0';
+
+ // Use the 64-bit _strtoui64 so we get a big number (no hex, decimal, or octal
+ // number can overflow a 64-bit number in <= 16 characters).
+ uint64 num = _strtoui64(buf, NULL, BaseForType(base));
+
+ // Check for 32-bit overflow.
+ if (num > kuint32max)
+ return CanonHostInfo::BROKEN;
+
+ // No overflow. Success!
+ *number = static_cast<uint32>(num);
+ return CanonHostInfo::IPV4;
+}
+
+// See declaration of IPv4AddressToNumber for documentation.
+template<typename CHAR>
+CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components) {
+ // The identified components. Not all may exist.
+ url_parse::Component components[4];
+ if (!FindIPv4Components(spec, host, components))
+ return CanonHostInfo::NEUTRAL;
+
+ // Convert existing components to digits. Values up to
+ // |existing_components| will be valid.
+ uint32 component_values[4];
+ int existing_components = 0;
+
+ // Set to true if one or more components are BROKEN. BROKEN is only
+ // returned if all components are IPV4 or BROKEN, so, for example,
+ // 12345678912345.de returns NEUTRAL rather than broken.
+ bool broken = false;
+ for (int i = 0; i < 4; i++) {
+ if (components[i].len <= 0)
+ continue;
+ CanonHostInfo::Family family = IPv4ComponentToNumber(
+ spec, components[i], &component_values[existing_components]);
+
+ if (family == CanonHostInfo::BROKEN) {
+ broken = true;
+ } else if (family != CanonHostInfo::IPV4) {
+ // Stop if we hit a non-BROKEN invalid non-empty component.
+ return family;
+ }
+
+ existing_components++;
+ }
+
+ if (broken)
+ return CanonHostInfo::BROKEN;
+
+ // Use that sequence of numbers to fill out the 4-component IP address.
+
+ // First, process all components but the last, while making sure each fits
+ // within an 8-bit field.
+ for (int i = 0; i < existing_components - 1; i++) {
+ if (component_values[i] > kuint8max)
+ return CanonHostInfo::BROKEN;
+ address[i] = static_cast<unsigned char>(component_values[i]);
+ }
+
+ // Next, consume the last component to fill in the remaining bytes.
+ uint32 last_value = component_values[existing_components - 1];
+ for (int i = 3; i >= existing_components - 1; i--) {
+ address[i] = static_cast<unsigned char>(last_value);
+ last_value >>= 8;
+ }
+
+ // If the last component has residual bits, report overflow.
+ if (last_value != 0)
+ return CanonHostInfo::BROKEN;
+
+ // Tell the caller how many components we saw.
+ *num_ipv4_components = existing_components;
+
+ // Success!
+ return CanonHostInfo::IPV4;
+}
+
+// Return true if we've made a final IPV4/BROKEN decision, false if the result
+// is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv4Address(const CHAR* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info) {
+ host_info->family = IPv4AddressToNumber(
+ spec, host, host_info->address, &host_info->num_ipv4_components);
+
+ switch (host_info->family) {
+ case CanonHostInfo::IPV4:
+ // Definitely an IPv4 address.
+ host_info->out_host.begin = output->length();
+ AppendIPv4Address(host_info->address, output);
+ host_info->out_host.len = output->length() - host_info->out_host.begin;
+ return true;
+ case CanonHostInfo::BROKEN:
+ // Definitely broken.
+ return true;
+ default:
+ // Could be IPv6 or a hostname.
+ return false;
+ }
+}
+
+// Helper class that describes the main components of an IPv6 input string.
+// See the following examples to understand how it breaks up an input string:
+//
+// [Example 1]: input = "[::aa:bb]"
+// ==> num_hex_components = 2
+// ==> hex_components[0] = Component(3,2) "aa"
+// ==> hex_components[1] = Component(6,2) "bb"
+// ==> index_of_contraction = 0
+// ==> ipv4_component = Component(0, -1)
+//
+// [Example 2]: input = "[1:2::3:4:5]"
+// ==> num_hex_components = 5
+// ==> hex_components[0] = Component(1,1) "1"
+// ==> hex_components[1] = Component(3,1) "2"
+// ==> hex_components[2] = Component(6,1) "3"
+// ==> hex_components[3] = Component(8,1) "4"
+// ==> hex_components[4] = Component(10,1) "5"
+// ==> index_of_contraction = 2
+// ==> ipv4_component = Component(0, -1)
+//
+// [Example 3]: input = "[::ffff:192.168.0.1]"
+// ==> num_hex_components = 1
+// ==> hex_components[0] = Component(3,4) "ffff"
+// ==> index_of_contraction = 0
+// ==> ipv4_component = Component(8, 11) "192.168.0.1"
+//
+// [Example 4]: input = "[1::]"
+// ==> num_hex_components = 1
+// ==> hex_components[0] = Component(1,1) "1"
+// ==> index_of_contraction = 1
+// ==> ipv4_component = Component(0, -1)
+//
+// [Example 5]: input = "[::192.168.0.1]"
+// ==> num_hex_components = 0
+// ==> index_of_contraction = 0
+// ==> ipv4_component = Component(3, 11) "192.168.0.1"
+//
+struct IPv6Parsed {
+ // Zero-out the parse information.
+ void reset() {
+ num_hex_components = 0;
+ index_of_contraction = -1;
+ ipv4_component.reset();
+ }
+
+ // There can be up to 8 hex components (colon separated) in the literal.
+ url_parse::Component hex_components[8];
+
+ // The count of hex components present. Ranges from [0,8].
+ int num_hex_components;
+
+ // The index of the hex component that the "::" contraction precedes, or
+ // -1 if there is no contraction.
+ int index_of_contraction;
+
+ // The range of characters which are an IPv4 literal.
+ url_parse::Component ipv4_component;
+};
+
+// Parse the IPv6 input string. If parsing succeeded returns true and fills
+// |parsed| with the information. If parsing failed (because the input is
+// invalid) returns false.
+template<typename CHAR, typename UCHAR>
+bool DoParseIPv6(const CHAR* spec,
+ const url_parse::Component& host,
+ IPv6Parsed* parsed) {
+ // Zero-out the info.
+ parsed->reset();
+
+ if (!host.is_nonempty())
+ return false;
+
+ // The index for start and end of address range (no brackets).
+ int begin = host.begin;
+ int end = host.end();
+
+ int cur_component_begin = begin; // Start of the current component.
+
+ // Scan through the input, searching for hex components, "::" contractions,
+ // and IPv4 components.
+ for (int i = begin; /* i <= end */; i++) {
+ bool is_colon = spec[i] == ':';
+ bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':';
+
+ // We reached the end of the current component if we encounter a colon
+ // (separator between hex components, or start of a contraction), or end of
+ // input.
+ if (is_colon || i == end) {
+ int component_len = i - cur_component_begin;
+
+ // A component should not have more than 4 hex digits.
+ if (component_len > 4)
+ return false;
+
+ // Don't allow empty components.
+ if (component_len == 0) {
+ // The exception is when contractions appear at beginning of the
+ // input or at the end of the input.
+ if (!((is_contraction && i == begin) || (i == end &&
+ parsed->index_of_contraction == parsed->num_hex_components)))
+ return false;
+ }
+
+ // Add the hex component we just found to running list.
+ if (component_len > 0) {
+ // Can't have more than 8 components!
+ if (parsed->num_hex_components >= 8)
+ return false;
+
+ parsed->hex_components[parsed->num_hex_components++] =
+ url_parse::Component(cur_component_begin, component_len);
+ }
+ }
+
+ if (i == end)
+ break; // Reached the end of the input, DONE.
+
+ // We found a "::" contraction.
+ if (is_contraction) {
+ // There can be at most one contraction in the literal.
+ if (parsed->index_of_contraction != -1)
+ return false;
+ parsed->index_of_contraction = parsed->num_hex_components;
+ ++i; // Consume the colon we peeked.
+ }
+
+ if (is_colon) {
+ // Colons are separators between components, keep track of where the
+ // current component started (after this colon).
+ cur_component_begin = i + 1;
+ } else {
+ if (static_cast<UCHAR>(spec[i]) >= 0x80)
+ return false; // Not ASCII.
+
+ if (!IsHexChar(static_cast<unsigned char>(spec[i]))) {
+ // Regular components are hex numbers. It is also possible for
+ // a component to be an IPv4 address in dotted form.
+ if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
+ // Since IPv4 address can only appear at the end, assume the rest
+ // of the string is an IPv4 address. (We will parse this separately
+ // later).
+ parsed->ipv4_component = url_parse::Component(
+ cur_component_begin, end - cur_component_begin);
+ break;
+ } else {
+ // The character was neither a hex digit, nor an IPv4 character.
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+// Verifies the parsed IPv6 information, checking that the various components
+// add up to the right number of bits (hex components are 16 bits, while
+// embedded IPv4 formats are 32 bits, and contractions are placeholders for
+// 16 or more bits). Returns true if sizes match up, false otherwise. On
+// success writes the length of the contraction (if any) to
+// |out_num_bytes_of_contraction|.
+bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed,
+ int* out_num_bytes_of_contraction) {
+ // Each group of four hex digits contributes 16 bits.
+ int num_bytes_without_contraction = parsed.num_hex_components * 2;
+
+ // If an IPv4 address was embedded at the end, it contributes 32 bits.
+ if (parsed.ipv4_component.is_valid())
+ num_bytes_without_contraction += 4;
+
+ // If there was a "::" contraction, its size is going to be:
+ // MAX([16bits], [128bits] - num_bytes_without_contraction).
+ int num_bytes_of_contraction = 0;
+ if (parsed.index_of_contraction != -1) {
+ num_bytes_of_contraction = 16 - num_bytes_without_contraction;
+ if (num_bytes_of_contraction < 2)
+ num_bytes_of_contraction = 2;
+ }
+
+ // Check that the numbers add up.
+ if (num_bytes_without_contraction + num_bytes_of_contraction != 16)
+ return false;
+
+ *out_num_bytes_of_contraction = num_bytes_of_contraction;
+ return true;
+}
+
+// Converts a hex component into a number. This cannot fail since the caller has
+// already verified that each character in the string was a hex digit, and
+// that there were no more than 4 characters.
+template<typename CHAR>
+uint16 IPv6HexComponentToNumber(const CHAR* spec,
+ const url_parse::Component& component) {
+ DCHECK(component.len <= 4);
+
+ // Copy the hex string into a C-string.
+ char buf[5];
+ for (int i = 0; i < component.len; ++i)
+ buf[i] = static_cast<char>(spec[component.begin + i]);
+ buf[component.len] = '\0';
+
+ // Convert it to a number (overflow is not possible, since with 4 hex
+ // characters we can at most have a 16 bit number).
+ return static_cast<uint16>(_strtoui64(buf, NULL, 16));
+}
+
+// Converts an IPv6 address to a 128-bit number (network byte order), returning
+// true on success. False means that the input was not a valid IPv6 address.
+template<typename CHAR, typename UCHAR>
+bool DoIPv6AddressToNumber(const CHAR* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]) {
+ // Make sure the component is bounded by '[' and ']'.
+ int end = host.end();
+ if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']')
+ return false;
+
+ // Exclude the square brackets.
+ url_parse::Component ipv6_comp(host.begin + 1, host.len - 2);
+
+ // Parse the IPv6 address -- identify where all the colon separated hex
+ // components are, the "::" contraction, and the embedded IPv4 address.
+ IPv6Parsed ipv6_parsed;
+ if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed))
+ return false;
+
+ // Do some basic size checks to make sure that the address doesn't
+ // specify more than 128 bits or fewer than 128 bits. This also resolves
+ // how many zero bytes the "::" contraction represents.
+ int num_bytes_of_contraction;
+ if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction))
+ return false;
+
+ int cur_index_in_address = 0;
+
+ // Loop through each hex component, and the contraction, in order.
+ for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) {
+ // Append the contraction if it appears before this component.
+ if (i == ipv6_parsed.index_of_contraction) {
+ for (int j = 0; j < num_bytes_of_contraction; ++j)
+ address[cur_index_in_address++] = 0;
+ }
+ // Append the hex component's value.
+ if (i != ipv6_parsed.num_hex_components) {
+ // Get the 16-bit value for this hex component.
+ uint16 number = IPv6HexComponentToNumber<CHAR>(
+ spec, ipv6_parsed.hex_components[i]);
+ // Append to |address|, in network byte order.
+ address[cur_index_in_address++] = (number & 0xFF00) >> 8;
+ address[cur_index_in_address++] = (number & 0x00FF);
+ }
+ }
+
+ // If there was an IPv4 section, convert it into a 32-bit number and append
+ // it to |address|.
+ if (ipv6_parsed.ipv4_component.is_valid()) {
+ // Append the 32-bit number to |address|.
+ int ignored_num_ipv4_components;
+ if (CanonHostInfo::IPV4 !=
+ IPv4AddressToNumber(spec,
+ ipv6_parsed.ipv4_component,
+ &address[cur_index_in_address],
+ &ignored_num_ipv4_components))
+ return false;
+ }
+
+ return true;
+}
+
+// Searches for the longest sequence of zeros in |address|, and writes the
+// range into |contraction_range|. The run of zeros must exceed 16 bits,
+// and if there is a tie the first is chosen.
+void ChooseIPv6ContractionRange(const unsigned char address[16],
+ url_parse::Component* contraction_range) {
+ // The longest run of zeros in |address| seen so far.
+ url_parse::Component max_range;
+
+ // The current run of zeros in |address| being iterated over.
+ url_parse::Component cur_range;
+
+ for (int i = 0; i < 16; i += 2) {
+ // Test for 16 bits worth of zero.
+ bool is_zero = (address[i] == 0 && address[i + 1] == 0);
+
+ if (is_zero) {
+ // Add the zero to the current range (or start a new one).
+ if (!cur_range.is_valid())
+ cur_range = url_parse::Component(i, 0);
+ cur_range.len += 2;
+ }
+
+ if (!is_zero || i == 14) {
+ // Just completed a run of zeros. If the run is greater than 16 bits,
+ // it is a candidate for the contraction.
+ if (cur_range.len > 2 && cur_range.len > max_range.len) {
+ max_range = cur_range;
+ }
+ cur_range.reset();
+ }
+ }
+ *contraction_range = max_range;
+}
+
+// Return true if we've made a final IPV6/BROKEN decision (for IPV6 the bracketed
+// canonical form is appended to |output|), false if the result is NEUTRAL, and we could use a second opinion.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeIPv6Address(const CHAR* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info) {
+ // Turn the IP address into a 128 bit number.
+ if (!IPv6AddressToNumber(spec, host, host_info->address)) {
+ // If it's not an IPv6 address, scan for characters that should *only*
+ // exist in an IPv6 address.
+ for (int i = host.begin; i < host.end(); i++) {
+ switch (spec[i]) {
+ case '[':
+ case ']':
+ case ':':
+ host_info->family = CanonHostInfo::BROKEN;
+ return true;
+ }
+ }
+
+ // No IPv6-only characters found. Could still be IPv4 or a hostname.
+ host_info->family = CanonHostInfo::NEUTRAL;
+ return false;
+ }
+
+ host_info->out_host.begin = output->length();  // Canonical host starts at the current end of |output|.
+ output->push_back('[');
+ AppendIPv6Address(host_info->address, output);
+ output->push_back(']');
+ host_info->out_host.len = output->length() - host_info->out_host.begin;
+
+ host_info->family = CanonHostInfo::IPV6;
+ return true;
+}
+
+} // namespace
+
+void AppendIPv4Address(const unsigned char address[4], CanonOutput* output) {  // Appends the four bytes of |address| to |output|, separated by dots.
+ for (int i = 0; i < 4; i++) {
+ char str[16];
+ _itoa_s(address[i], str, 10);  // Radix 10: the byte is written as decimal digits.
+
+ for (int ch = 0; str[ch] != 0; ch++)
+ output->push_back(str[ch]);
+
+ if (i != 3)  // Dots go between components only, not after the last one.
+ output->push_back('.');
+ }
+}
+
+void AppendIPv6Address(const unsigned char address[16], CanonOutput* output) {
+ // We will output the address according to the rules in (this draft was later published as RFC 5952):
+ // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
+
+ // Start by finding where to place the "::" contraction (if any).
+ url_parse::Component contraction_range;
+ ChooseIPv6ContractionRange(address, &contraction_range);
+
+ for (int i = 0; i <= 14;) {
+ // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive.
+ DCHECK(i % 2 == 0);
+ if (i == contraction_range.begin && contraction_range.len > 0) {
+ // Jump over the contraction.
+ if (i == 0)  // A leading contraction has no preceding group to supply the first ':'.
+ output->push_back(':');
+ output->push_back(':');
+ i = contraction_range.end();
+ } else {
+ // Consume the next 16 bits from |address|.
+ int x = address[i] << 8 | address[i + 1];
+
+ i += 2;
+
+ // Stringify the 16 bit number (at most requires 4 hex digits).
+ char str[5];
+ _itoa_s(x, str, 16);
+ for (int ch = 0; str[ch] != 0; ++ch)
+ output->push_back(str[ch]);
+
+ // Put a colon after each number, except the last.
+ if (i < 16)
+ output->push_back(':');
+ }
+ }
+}
+
+bool FindIPv4Components(const char* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]) {  // 8-bit overload: forwards to the shared template.
+ return DoFindIPv4Components<char, unsigned char>(spec, host, components);
+}
+
+bool FindIPv4Components(const char16* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]) {  // 16-bit overload: forwards to the shared template.
+ return DoFindIPv4Components<char16, char16>(spec, host, components);
+}
+
+void CanonicalizeIPAddress(const char* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info) {
+ if (DoCanonicalizeIPv4Address<char, unsigned char>(  // Try IPv4 first; stop if it returns true.
+ spec, host, output, host_info))
+ return;
+ if (DoCanonicalizeIPv6Address<char, unsigned char>(  // Otherwise try IPv6, which sets |host_info->family| in every case.
+ spec, host, output, host_info))
+ return;
+}
+
+void CanonicalizeIPAddress(const char16* spec,
+ const url_parse::Component& host,
+ CanonOutput* output,
+ CanonHostInfo* host_info) {
+ if (DoCanonicalizeIPv4Address<char16, char16>(  // Try IPv4 first; stop if it returns true.
+ spec, host, output, host_info))
+ return;
+ if (DoCanonicalizeIPv6Address<char16, char16>(  // Otherwise try IPv6, which sets |host_info->family| in every case.
+ spec, host, output, host_info))
+ return;
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components) {  // 8-bit overload: forwards to the shared template.
+ return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components);
+}
+
+CanonHostInfo::Family IPv4AddressToNumber(const char16* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components) {  // 16-bit overload: forwards to the shared template.
+ return DoIPv4AddressToNumber<char16>(
+ spec, host, address, num_ipv4_components);
+}
+
+bool IPv6AddressToNumber(const char* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]) {  // 8-bit overload: forwards to the shared template.
+ return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address);
+}
+
+bool IPv6AddressToNumber(const char16* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]) {  // 16-bit overload: forwards to the shared template.
+ return DoIPv6AddressToNumber<char16, char16>(spec, host, address);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_ip.h b/url/url_canon_ip.h
new file mode 100644
index 0000000..a2900c6
--- /dev/null
+++ b/url/url_canon_ip.h
@@ -0,0 +1,109 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__
+#define GOOGLEURL_SRC_URL_CANON_IP_H__
+
+#include "base/string16.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_common.h"
+#include "googleurl/src/url_parse.h"
+
+namespace url_canon {
+
+// Writes the given IPv4 address to |output| as dot-separated decimal numbers.
+GURL_API void AppendIPv4Address(const unsigned char address[4],
+ CanonOutput* output);
+
+// Writes the given IPv6 address to |output| (no surrounding brackets are added).
+GURL_API void AppendIPv6Address(const unsigned char address[16],
+ CanonOutput* output);
+
+// Searches the host name for the portions of the IPv4 address. On success,
+// each component will be placed into |components| and it will return true.
+// It will return false if the host can not be separated as an IPv4 address
+// or if there are any non-7-bit characters or other characters that can not
+// be in an IP address. (This is important so we fail as early as possible for
+// common non-IP hostnames.)
+//
+// Not all components may exist. If there are only 3 components, for example,
+// the last one will have a length of -1 or 0 to indicate it does not exist.
+//
+// Note that many platforms' inet_addr will ignore everything after a space
+// in certain circumstances if the stuff before the space looks like an IP
+// address. IE6 is included in this. We do NOT handle this case. In many cases,
+// the browser's canonicalization will get run before this which converts
+// spaces to %20 (in the case of IE7) or rejects them (in the case of
+// Mozilla), so this code path never gets hit. Our host canonicalization will
+// notice these spaces and escape them, which will make IP address finding
+// fail. This seems like better behavior than stripping after a space.
+GURL_API bool FindIPv4Components(const char* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]);
+GURL_API bool FindIPv4Components(const char16* spec,
+ const url_parse::Component& host,
+ url_parse::Component components[4]);
+
+// Converts an IPv4 address to a 32-bit number (network byte order).
+//
+// Possible return values:
+// IPV4 - IPv4 address was successfully parsed.
+// BROKEN - Input was formatted like an IPv4 address, but overflow occurred
+// during parsing.
+// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
+// It might be an IPv6 address, or a hostname.
+//
+// On success, |num_ipv4_components| will be populated with the number of
+// components in the IPv4 address.
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+ const char* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components);
+GURL_API CanonHostInfo::Family IPv4AddressToNumber(
+ const char16* spec,
+ const url_parse::Component& host,
+ unsigned char address[4],
+ int* num_ipv4_components);
+
+// Converts an IPv6 address to a 128-bit number (network byte order), returning
+// true on success. False means that the input was not a valid IPv6 address.
+//
+// NOTE that |host| is expected to be surrounded by square brackets.
+// i.e. "[::1]" rather than "::1".
+GURL_API bool IPv6AddressToNumber(const char* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]);
+GURL_API bool IPv6AddressToNumber(const char16* spec,
+ const url_parse::Component& host,
+ unsigned char address[16]);
+
+} // namespace url_canon
+
+#endif // GOOGLEURL_SRC_URL_CANON_IP_H__
diff --git a/url/url_canon_mailtourl.cc b/url/url_canon_mailtourl.cc
new file mode 100644
index 0000000..97868b8
--- /dev/null
+++ b/url/url_canon_mailtourl.cc
@@ -0,0 +1,137 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "mailto:" URLs.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Canonicalizes a mailto: URL into |output| and fills |new_parsed| with the resulting component ranges.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source,
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ // Returns false if any path character failed to convert; output is still written in that case.
+ // mailto: only uses {scheme, path, query} -- clear the rest.
+ new_parsed->username = url_parse::Component();
+ new_parsed->password = url_parse::Component();
+ new_parsed->host = url_parse::Component();
+ new_parsed->port = url_parse::Component();
+ new_parsed->ref = url_parse::Component();
+
+ // Scheme (known, so we don't bother running it through the more
+ // complicated scheme canonicalizer).
+ new_parsed->scheme.begin = output->length();
+ output->Append("mailto:", 7);
+ new_parsed->scheme.len = 6;  // The scheme length excludes the ':' that was appended.
+
+ bool success = true;
+
+ // Path
+ if (parsed.path.is_valid()) {
+ new_parsed->path.begin = output->length();
+
+ // Copy the path using path URL's more lax escaping rules.
+ // We convert to UTF-8 and escape non-ASCII and control characters (below
+ // 0x20), but leave all other ASCII characters alone.
+ int end = parsed.path.end();
+ for (int i = parsed.path.begin; i < end; ++i) {
+ UCHAR uch = static_cast<UCHAR>(source.path[i]);
+ if (uch < 0x20 || uch >= 0x80)
+ success &= AppendUTF8EscapedChar(source.path, &i, end, output);
+ else
+ output->push_back(static_cast<char>(uch));
+ }
+
+ new_parsed->path.len = output->length() - new_parsed->path.begin;
+ } else {
+ // No path at all
+ new_parsed->path.reset();
+ }
+
+ // Query -- always use the default utf8 charset converter.
+ CanonicalizeQuery(source.query, parsed.query, NULL,
+ output, &new_parsed->query);
+
+ return success;
+}
+
+} // namespace
+
+bool CanonicalizeMailtoURL(const char* spec,
+ int spec_len,  // Not read by this function; component bounds come from |parsed|.
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ return DoCanonicalizeMailtoURL<char, unsigned char>(
+ URLComponentSource<char>(spec), parsed, output, new_parsed);
+}
+
+bool CanonicalizeMailtoURL(const char16* spec,
+ int spec_len,  // Not read by this function; component bounds come from |parsed|.
+ const url_parse::Parsed& parsed,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ return DoCanonicalizeMailtoURL<char16, char16>(
+ URLComponentSource<char16>(spec), parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ URLComponentSource<char> source(base);
+ url_parse::Parsed parsed(base_parsed);  // Local copy, so |base_parsed| itself is not modified.
+ SetupOverrideComponents(base, replacements, &source, &parsed);  // Presumably applies |replacements| to |source|/|parsed|; see its definition.
+ return DoCanonicalizeMailtoURL<char, unsigned char>(
+ source, parsed, output, new_parsed);
+}
+
+bool ReplaceMailtoURL(const char* base,
+ const url_parse::Parsed& base_parsed,
+ const Replacements<char16>& replacements,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ RawCanonOutput<1024> utf8;  // Scratch buffer handed to the UTF-16 setup helper; presumably holds the converted replacements.
+ URLComponentSource<char> source(base);
+ url_parse::Parsed parsed(base_parsed);  // Local copy, so |base_parsed| itself is not modified.
+ SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
+ return DoCanonicalizeMailtoURL<char, unsigned char>(
+ source, parsed, output, new_parsed);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc
new file mode 100644
index 0000000..d86643a
--- /dev/null
+++ b/url/url_canon_path.cc
@@ -0,0 +1,378 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Canonicalization functions for the paths of URLs.
+
+#include "base/logging.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_parse_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+enum CharacterFlags {
+ // Pass through unchanged, whether escaped or unescaped. This doesn't
+ // actually set anything so you can't OR it to check, it's just to make the
+ // table below more clear when neither ESCAPE or UNESCAPE is set.
+ PASS = 0,
+
+ // This character requires special handling in DoPartialPath. Doing this test
+ // first allows us to filter out the common cases of regular characters that
+ // can be directly copied.
+ SPECIAL = 1,
+
+ // This character must be escaped in the canonical output. Note that all
+ // escaped chars also have the "special" bit set so that the code that looks
+ // for this is triggered. Not valid with PASS or UNESCAPE.
+ ESCAPE_BIT = 2,
+ ESCAPE = ESCAPE_BIT | SPECIAL,
+
+ // This character must be unescaped in canonical output. Not valid with
+ // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these
+ // characters unescaped, they should just be copied.
+ UNESCAPE = 4,
+
+ // This character is disallowed in URLs. Note that the "special" bit is also
+ // set to trigger handling.
+ INVALID_BIT = 8,
+ INVALID = INVALID_BIT | SPECIAL,
+};
+
+// This table contains one of the above flag values. Note some flags are more
+// than one bit because they also turn on the "special" flag. Special is the
+// only flag that may be combined with others.
+//
+// This table is designed to match exactly what IE does with the characters.
+//
+// Dot is even more special, and the escaped version is handled specially by
+// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape"
+// bit is never handled (we just need the "special" bit).
+const unsigned char kPathCharLookup[0x100] = {
+// NULL control chars...
+ INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+// control chars...
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+// ' ' ! " # $ % & ' ( ) * + , - . /
+ ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS,
+// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE,
+// @ A B C D E F G H I J K L M N O
+ PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+// P Q R S T U V W X Y Z [ \ ] ^ _
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE,
+// ` a b c d e f g h i j k l m n o
+ ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
+// p q r s t u v w x y z { | } ~ <DEL>
+ UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE,
+// ...all the high-bit characters are escaped
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
+ ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE};
+
+enum DotDisposition {  // Classification returned by ClassifyAfterDot.
+ // The given dot is just part of a filename and is not special.
+ NOT_A_DIRECTORY,
+
+ // The given dot is the current directory.
+ DIRECTORY_CUR,
+
+ // The given dot is the first of a double dot that should take us up one.
+ DIRECTORY_UP
+};
+
+// When the path resolver finds a dot, this function is called with the
+// character following that dot to see what it is. The return value
+// indicates what type this dot is (see above). This code handles the case
+// where the dot is at the end of the input.
+//
+// |*consumed_len| will contain the number of characters in the input that
+// express what we found (0 when the result is NOT_A_DIRECTORY).
+//
+// If the input is "../foo", |after_dot| = 1, |end| = 6, and
+// at the end, |*consumed_len| = 2 for the "./" this function consumed. The
+// original dot length should be handled by the caller.
+template<typename CHAR>
+DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot,
+ int end, int* consumed_len) {
+ if (after_dot == end) {
+ // Single dot at the end.
+ *consumed_len = 0;
+ return DIRECTORY_CUR;
+ }
+ if (url_parse::IsURLSlash(spec[after_dot])) {
+ // Single dot followed by a slash.
+ *consumed_len = 1; // Consume the slash
+ return DIRECTORY_CUR;
+ }
+
+ int second_dot_len = IsDot(spec, after_dot, end);  // Nonzero when a second dot starts at |after_dot|.
+ if (second_dot_len) {
+ int after_second_dot = after_dot + second_dot_len;
+ if (after_second_dot == end) {
+ // Double dot at the end.
+ *consumed_len = second_dot_len;
+ return DIRECTORY_UP;
+ }
+ if (url_parse::IsURLSlash(spec[after_second_dot])) {
+ // Double dot followed by a slash.
+ *consumed_len = second_dot_len + 1;
+ return DIRECTORY_UP;
+ }
+ }
+
+ // The dots are followed by something else, not a directory.
+ *consumed_len = 0;
+ return NOT_A_DIRECTORY;
+}
+
+// Rewinds the output to the previous slash. It is assumed that the output
+// ends with a slash and this doesn't count (we call this when we are
+// appending directory paths, so the previous path component has an ending
+// slash).
+//
+// This will stop at the first slash (assumed to be at position
+// |path_begin_in_output|) and not go any higher than that. Some web pages
+// do ".." too many times, so we need to handle that brokenness.
+//
+// It searches for a literal slash rather than including a backslash as well
+// because it is run only on the canonical output.
+//
+// The output is guaranteed to end in a slash when this function completes.
+void BackUpToPreviousSlash(int path_begin_in_output,
+ CanonOutput* output) {
+ DCHECK(output->length() > 0);
+
+ int i = output->length() - 1;
+ DCHECK(output->at(i) == '/');
+ if (i == path_begin_in_output)
+ return; // We're at the first slash, nothing to do.
+
+ // Now back up (skipping the trailing slash) until we find another slash.
+ i--;
+ while (output->at(i) != '/' && i > path_begin_in_output)
+ i--;
+
+ // Now shrink the output to just include that last slash we found.
+ output->set_length(i + 1);
+}
+
+// Appends the given path to the output. It assumes that if the input path
+// starts with a slash, it should be copied to the output. If no path has
+// already been appended to the output (the case when not resolving
+// relative URLs), the path should begin with a slash.
+//
+// If there are already path components (this mode is used when appending
+// relative paths for resolving), it assumes that the output already has
+// a trailing slash and that if the input begins with a slash, it should be
+// copied to the output.
+//
+// We do not collapse multiple slashes in a row to a single slash. It seems
+// no web browsers do this, and we don't want incompatibilities, even though
+// it would be correct for most systems.
+template<typename CHAR, typename UCHAR>
+bool DoPartialPath(const CHAR* spec,
+ const url_parse::Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {
+ int end = path.end();
+
+ bool success = true;  // Cleared when invalid input is seen; processing continues regardless.
+ for (int i = path.begin; i < end; i++) {
+ UCHAR uch = static_cast<UCHAR>(spec[i]);
+ if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) {
+ // We only need to test wide input for having non-ASCII characters. For
+ // narrow input, we'll always just use the lookup table. We don't try to
+ // do anything tricky with decoding/validating UTF-8. This function will
+ // read one or two UTF-16 characters and append the output as UTF-8. This
+ // call will be removed in 8-bit mode.
+ success &= AppendUTF8EscapedChar(spec, &i, end, output);
+ } else {
+ // Normal ASCII character or 8-bit input, use the lookup table.
+ unsigned char out_ch = static_cast<unsigned char>(uch);
+ unsigned char flags = kPathCharLookup[out_ch];
+ if (flags & SPECIAL) {
+ // Needs special handling of some sort.
+ int dotlen;
+ if ((dotlen = IsDot(spec, i, end)) > 0) {
+ // See if this dot was preceded by a slash in the output. We
+ // assume that when canonicalizing paths, they will always
+ // start with a slash and not a dot, so we don't have to
+ // bounds check the output.
+ //
+ // Note that we check this in the case of dots so we don't have to
+ // special case slashes. Since slashes are much more common than
+ // dots, this actually increases performance measurably (though
+ // slightly).
+ DCHECK(output->length() > path_begin_in_output);
+ if (output->length() > path_begin_in_output &&
+ output->at(output->length() - 1) == '/') {
+ // Slash followed by a dot, check to see if this means relative
+ int consumed_len;
+ switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
+ &consumed_len)) {
+ case NOT_A_DIRECTORY:
+ // Copy the dot to the output, it means nothing special.
+ output->push_back('.');
+ i += dotlen - 1;
+ break;
+ case DIRECTORY_CUR: // Current directory, just skip the input.
+ i += dotlen + consumed_len - 1;
+ break;
+ case DIRECTORY_UP:
+ BackUpToPreviousSlash(path_begin_in_output, output);
+ i += dotlen + consumed_len - 1;
+ break;
+ }
+ } else {
+ // This dot is not preceded by a slash, it is just part of some
+ // file name.
+ output->push_back('.');
+ i += dotlen - 1;
+ }
+
+ } else if (out_ch == '\\') {
+ // Convert backslashes to forward slashes
+ output->push_back('/');
+
+ } else if (out_ch == '%') {
+ // Handle escape sequences.
+ unsigned char unescaped_value;
+ if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
+ // Valid escape sequence, see if we keep, reject, or unescape it.
+ char unescaped_flags = kPathCharLookup[unescaped_value];
+
+ if (unescaped_flags & UNESCAPE) {
+ // This escaped value shouldn't be escaped, copy it.
+ output->push_back(unescaped_value);
+ } else if (unescaped_flags & INVALID_BIT) {
+ // Invalid escaped character, copy it and remember the error.
+ output->push_back('%');
+ output->push_back(static_cast<char>(spec[i - 1]));
+ output->push_back(static_cast<char>(spec[i]));
+ success = false;
+ } else {
+ // Valid escaped character but we should keep it escaped. We
+ // don't want to change the case of any hex letters in case
+ // the server is sensitive to that, so we just copy the two
+ // characters without checking (DecodeEscaped will have advanced
+ // to the last character of the pair).
+ output->push_back('%');
+ output->push_back(static_cast<char>(spec[i - 1]));
+ output->push_back(static_cast<char>(spec[i]));
+ }
+ } else {
+ // Invalid escape sequence. IE7 rejects any URLs with such
+ // sequences, while Firefox, IE6, and Safari all pass it through
+ // unchanged. Unlike IE7, we are permissive here. I don't think this
+ // can cause significant problems, if it does, we should change
+ // to be more like IE7.
+ output->push_back('%');
+ }
+
+ } else if (flags & INVALID_BIT) {
+ // For NULLs, etc. fail.
+ AppendEscapedChar(out_ch, output);
+ success = false;
+
+ } else if (flags & ESCAPE_BIT) {
+ // This character should be escaped.
+ AppendEscapedChar(out_ch, output);
+ }
+ } else {
+ // Nothing special about this character, just append it.
+ output->push_back(out_ch);
+ }
+ }
+ }
+ return success;
+}
+
+template<typename CHAR, typename UCHAR>
+bool DoPath(const CHAR* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path) {  // |out_path| receives the range of the written path within |output|.
+ bool success = true;
+ out_path->begin = output->length();
+ if (path.len > 0) {
+ // Write out an initial slash if the input has none. If we just parse a URL
+ // and then canonicalize it, it will of course have a slash already. This
+ // check is for the replacement and relative URL resolving cases of file
+ // URLs.
+ if (!url_parse::IsURLSlash(spec[path.begin]))
+ output->push_back('/');
+
+ success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output);
+ } else {
+ // No input, canonical path is a slash.
+ output->push_back('/');
+ }
+ out_path->len = output->length() - out_path->begin;
+ return success;  // False only when DoPartialPath reported a problem.
+}
+
+} // namespace
+
+bool CanonicalizePath(const char* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path) {  // 8-bit overload: forwards to DoPath.
+ return DoPath<char, unsigned char>(spec, path, output, out_path);
+}
+
+bool CanonicalizePath(const char16* spec,
+ const url_parse::Component& path,
+ CanonOutput* output,
+ url_parse::Component* out_path) {  // 16-bit overload: forwards to DoPath.
+ return DoPath<char16, char16>(spec, path, output, out_path);
+}
+
+bool CanonicalizePartialPath(const char* spec,
+ const url_parse::Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {  // 8-bit overload: forwards to DoPartialPath.
+ return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output,
+ output);
+}
+
+bool CanonicalizePartialPath(const char16* spec,
+ const url_parse::Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {  // 16-bit overload: forwards to DoPartialPath.
+ return DoPartialPath<char16, char16>(spec, path, path_begin_in_output,
+ output);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_pathurl.cc b/url/url_canon_pathurl.cc
new file mode 100644
index 0000000..4a990c7
--- /dev/null
+++ b/url/url_canon_pathurl.cc
@@ -0,0 +1,128 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions for canonicalizing "path" URLs. Not to be confused with the path
+// of a URL, these are URLs that have no authority section, only a path. For
+// example, "javascript:" and "data:".
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Canonicalizes a "path URL" (scheme followed by an opaque path) from
+// |source| / |parsed| into |output|, describing the result in |new_parsed|.
+// Only the scheme and path can be present in the result; every other
+// component of |new_parsed| is reset. Returns false if canonicalizing the
+// scheme or escaping any path character failed.
+template<typename CHAR, typename UCHAR>
+bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source,
+                           const url_parse::Parsed& parsed,
+                           CanonOutput* output,
+                           url_parse::Parsed* new_parsed) {
+  // Scheme: this will append the colon.
+  bool all_ok = CanonicalizeScheme(source.scheme, parsed.scheme,
+                                   output, &new_parsed->scheme);
+
+  // Path URLs are assumed to carry no authority section. Note that hosts
+  // should never have -1 length.
+  new_parsed->username.reset();
+  new_parsed->password.reset();
+  new_parsed->host.reset();
+  new_parsed->port.reset();
+
+  if (!parsed.path.is_valid()) {
+    // Empty path.
+    new_parsed->path.reset();
+  } else {
+    // Path URLs use laxer escaping rules than regular paths (think of
+    // javascript:). Input is converted to UTF-8 and control or non-ASCII
+    // characters are escaped, but every other ASCII character is copied
+    // as-is. This helps readability of JavaScript.
+    new_parsed->path.begin = output->length();
+    const int path_end = parsed.path.end();
+    int pos = parsed.path.begin;
+    while (pos < path_end) {
+      const UCHAR unit = static_cast<UCHAR>(source.path[pos]);
+      const bool copy_verbatim = unit >= 0x20 && unit < 0x80;
+      if (copy_verbatim) {
+        output->push_back(static_cast<char>(unit));
+      } else if (!AppendUTF8EscapedChar(source.path, &pos, path_end, output)) {
+        // |pos| is passed by pointer above; the helper may move it.
+        all_ok = false;
+      }
+      pos++;
+    }
+    new_parsed->path.len = output->length() - new_parsed->path.begin;
+  }
+
+  // Assume there's no query or ref.
+  new_parsed->query.reset();
+  new_parsed->ref.reset();
+
+  return all_ok;
+}
+
+} // namespace
+
+// 8-bit overload. Every component is read from |spec|; |spec_len| is not
+// consulted here because |parsed| already delimits each component.
+bool CanonicalizePathURL(const char* spec,
+                         int spec_len,
+                         const url_parse::Parsed& parsed,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+  const URLComponentSource<char> all_from_spec(spec);
+  return DoCanonicalizePathURL<char, unsigned char>(
+      all_from_spec, parsed, output, new_parsed);
+}
+
+// 16-bit overload. Every component is read from |spec|; |spec_len| is not
+// consulted here because |parsed| already delimits each component.
+bool CanonicalizePathURL(const char16* spec,
+                         int spec_len,
+                         const url_parse::Parsed& parsed,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+  const URLComponentSource<char16> all_from_spec(spec);
+  return DoCanonicalizePathURL<char16, char16>(
+      all_from_spec, parsed, output, new_parsed);
+}
+
+// Applies 8-bit |replacements| to the path URL |base| and canonicalizes the
+// result into |output| / |new_parsed|.
+bool ReplacePathURL(const char* base,
+                    const url_parse::Parsed& base_parsed,
+                    const Replacements<char>& replacements,
+                    CanonOutput* output,
+                    url_parse::Parsed* new_parsed) {
+  // Begin with every component coming from the base, then let
+  // SetupOverrideComponents adjust the source and ranges per |replacements|.
+  URLComponentSource<char> component_source(base);
+  url_parse::Parsed merged(base_parsed);
+  SetupOverrideComponents(base, replacements, &component_source, &merged);
+
+  const bool ok = DoCanonicalizePathURL<char, unsigned char>(
+      component_source, merged, output, new_parsed);
+  return ok;
+}
+
+// Applies 16-bit |replacements| to the 8-bit path URL |base| and
+// canonicalizes the result into |output| / |new_parsed|.
+bool ReplacePathURL(const char* base,
+                    const url_parse::Parsed& base_parsed,
+                    const Replacements<char16>& replacements,
+                    CanonOutput* output,
+                    url_parse::Parsed* new_parsed) {
+  // Scratch buffer handed to SetupUTF16OverrideComponents. It is declared
+  // first so that it outlives |component_source| below.
+  RawCanonOutput<1024> utf8_scratch;
+  URLComponentSource<char> component_source(base);
+  url_parse::Parsed merged(base_parsed);
+  SetupUTF16OverrideComponents(base, replacements, &utf8_scratch,
+                               &component_source, &merged);
+
+  const bool ok = DoCanonicalizePathURL<char, unsigned char>(
+      component_source, merged, output, new_parsed);
+  return ok;
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_query.cc b/url/url_canon_query.cc
new file mode 100644
index 0000000..cee8774
--- /dev/null
+++ b/url/url_canon_query.cc
@@ -0,0 +1,189 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+
+// Query canonicalization in IE
+// ----------------------------
+// IE is very permissive for query parameters specified in links on the page
+// (in contrast to links that it constructs itself based on form data). It does
+// not unescape any character. It does not reject any escape sequence (be they
+// invalid like "%2y" or freaky like %00).
+//
+// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
+// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
+// layer since they are removed from all portions of the URL). All other
+// characters are passed unmodified. Invalid UTF-16 sequences are preserved as
+// well, with each character in the input being converted to UTF-8. It is the
+// server's job to make sense of this invalid query.
+//
+// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
+// are converted to the invalid character and sent as unescaped UTF-8 (0xef,
+// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
+// strings before the URL handler ever sees them.
+//
+// Our query canonicalization
+// --------------------------
+// We escape all non-ASCII characters and control characters, like Firefox.
+// This is more conformant to the URL spec, and there do not seem to be many
+// problems relating to Firefox's behavior.
+//
+// Like IE, we will never unescape (although the application may want to try
+// unescaping to present the user with a more understandable URL). We will
+// replace all invalid sequences (including invalid UTF-16 sequences, which IE
+// doesn't) with the "invalid character," and we will escape it.
+
+namespace url_canon {
+
+namespace {
+
+// Returns true if every character of |spec| inside the |query| range
+// ([query.begin, query.end()), end not included) is representable in 7 bits.
+// An empty range yields true.
+template<typename CHAR, typename UCHAR>
+bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) {
+  const int stop = query.end();
+  int pos = query.begin;
+  while (pos < stop) {
+    const UCHAR unit = static_cast<UCHAR>(spec[pos]);
+    if (unit > 0x7f)
+      return false;
+    ++pos;
+  }
+  return true;
+}
+
+// Appends |length| characters of |source| to |output|, escaping every
+// character for which IsQueryChar() is false. This version accepts 8- or
+// 16-bit characters but each one is narrowed to 8 bits before use, so callers
+// must only pass values that fit. No UTF-8 validation is performed; the input
+// is assumed to be correct already.
+template<typename CHAR>
+void AppendRaw8BitQueryString(const CHAR* source, int length,
+                              CanonOutput* output) {
+  for (int pos = 0; pos < length; ++pos) {
+    const unsigned char byte = static_cast<unsigned char>(source[pos]);
+    if (IsQueryChar(byte)) {
+      // Doesn't need escaping.
+      output->push_back(static_cast<char>(source[pos]));
+    } else {
+      AppendEscapedChar(byte, output);
+    }
+  }
+}
+
+// Runs |converter| over the UTF-8 |query| range of |spec|, writing to
+// |output|. The converter consumes UTF-16, so the input is converted first.
+// |converter| must be non-NULL.
+void RunConverter(const char* spec,
+                  const url_parse::Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  // ConvertUTF8ToUTF16 substitutes the invalid character for anything that is
+  // misencoded, which is the behavior we want, so its result is not checked.
+  RawCanonOutputW<1024> wide;
+  const char* query_start = &spec[query.begin];
+  ConvertUTF8ToUTF16(query_start, query.len, &wide);
+  converter->ConvertFromUTF16(wide.data(), wide.length(), output);
+}
+
+// Runs |converter| over the UTF-16 |query| range of |spec|, writing to
+// |output|. No pre-conversion is needed; this overload exists so the same
+// calling code works for both UTF-8 and UTF-16 input.
+void RunConverter(const char16* spec,
+                  const url_parse::Component& query,
+                  CharsetConverter* converter,
+                  CanonOutput* output) {
+  const char16* query_start = &spec[query.begin];
+  converter->ConvertFromUTF16(query_start, query.len, output);
+}
+
+// Appends the |query| range of |spec| to |output| in query encoding.
+//  - All-ASCII input is appended directly, escaping where required.
+//  - Otherwise, if |converter| is non-NULL it produces an 8-bit string which
+//    is then appended with escaping.
+//  - Otherwise the input goes through AppendStringOfType with CHAR_QUERY.
+template<typename CHAR, typename UCHAR>
+void DoConvertToQueryEncoding(const CHAR* spec,
+                              const url_parse::Component& query,
+                              CharsetConverter* converter,
+                              CanonOutput* output) {
+  const CHAR* query_start = &spec[query.begin];
+
+  // Easy: no character set conversion is needed for pure ASCII.
+  if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
+    AppendRaw8BitQueryString(query_start, query.len, output);
+    return;
+  }
+
+  // No converter, do our own UTF-8 conversion.
+  if (!converter) {
+    AppendStringOfType(query_start, query.len, CHAR_QUERY, output);
+    return;
+  }
+
+  // Harder: run the converter to get an 8-bit string, then append it,
+  // escaping the necessary values.
+  RawCanonOutput<1024> converted;
+  RunConverter(spec, query, converter, &converted);
+  AppendRaw8BitQueryString(converted.data(), converted.length(), output);
+}
+
+// Canonicalizes the |query| component of |spec| into |output|. When the
+// component is absent (negative length) nothing is written and |out_query| is
+// set to a default Component. Otherwise a '?' is written, followed by the
+// encoded query, and |out_query| covers the text after the '?'.
+template<typename CHAR, typename UCHAR>
+void DoCanonicalizeQuery(const CHAR* spec,
+                         const url_parse::Component& query,
+                         CharsetConverter* converter,
+                         CanonOutput* output,
+                         url_parse::Component* out_query) {
+  const bool has_query = query.len >= 0;
+  if (!has_query) {
+    *out_query = url_parse::Component();
+    return;
+  }
+
+  // The separator is not part of the component range.
+  output->push_back('?');
+
+  out_query->begin = output->length();
+  DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
+  out_query->len = output->length() - out_query->begin;
+}
+
+} // namespace
+
+// 8-bit overload: forwards to DoCanonicalizeQuery.
+void CanonicalizeQuery(const char* spec,
+                       const url_parse::Component& query,
+                       CharsetConverter* converter,
+                       CanonOutput* output,
+                       url_parse::Component* out_query) {
+  DoCanonicalizeQuery<char, unsigned char>(
+      spec, query, converter, output, out_query);
+}
+
+// 16-bit overload: forwards to DoCanonicalizeQuery.
+void CanonicalizeQuery(const char16* spec,
+                       const url_parse::Component& query,
+                       CharsetConverter* converter,
+                       CanonOutput* output,
+                       url_parse::Component* out_query) {
+  DoCanonicalizeQuery<char16, char16>(
+      spec, query, converter, output, out_query);
+}
+
+// Appends the |query| range of the UTF-16 |input| to |output| in query
+// encoding. Unlike CanonicalizeQuery, no '?' is written and no output
+// component is reported.
+void ConvertUTF16ToQueryEncoding(const char16* input,
+                                 const url_parse::Component& query,
+                                 CharsetConverter* converter,
+                                 CanonOutput* output) {
+  DoConvertToQueryEncoding<char16, char16>(
+      input, query, converter, output);
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_relative.cc b/url/url_canon_relative.cc
new file mode 100644
index 0000000..63630b4
--- /dev/null
+++ b/url/url_canon_relative.cc
@@ -0,0 +1,579 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Canonicalizer functions for working with and resolving relative URLs.
+
+#include "base/logging.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse_internal.h"
+#include "googleurl/src/url_util_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Compares the scheme of |cmp| against the scheme of the canonical |base|.
+//
+// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
+// 379034), whereas IE is case-insensitive. We choose to be more permissive,
+// like IE. There is no need to worry about unescaping here: neither IE nor
+// Firefox allows it. Invalid scheme characters are not a concern either,
+// since the comparison is against the canonical scheme of the base.
+//
+// The base URL should always be canonical, and therefore ASCII.
+template<typename CHAR>
+bool AreSchemesEqual(const char* base,
+                     const url_parse::Component& base_scheme,
+                     const CHAR* cmp,
+                     const url_parse::Component& cmp_scheme) {
+  // Different lengths can never match.
+  if (base_scheme.len != cmp_scheme.len)
+    return false;
+
+  int offset = 0;
+  while (offset < base_scheme.len) {
+    // Only the candidate is canonicalized; the base is assumed to be
+    // canonical already.
+    const char expected = base[base_scheme.begin + offset];
+    if (CanonicalSchemeChar(cmp[cmp_scheme.begin + offset]) != expected)
+      return false;
+    ++offset;
+  }
+  return true;
+}
+
+#ifdef WIN32
+
+// Windows paths may also be written as "/C:/" so that URL paths consistently
+// begin with a slash. This is like url_parse::DoesBeginWindowsDriveSpec
+// except that a slash is required at |start_offset|, with the drive spec
+// following it. Returns false when |start_offset| is at or past |spec_len|.
+template<typename CHAR>
+bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset,
+                                    int spec_len) {
+  if (start_offset >= spec_len)
+    return false;
+  if (!url_parse::IsURLSlash(spec[start_offset]))
+    return false;
+  return url_parse::DoesBeginWindowsDriveSpec(spec, start_offset + 1,
+                                              spec_len);
+}
+
+#endif // WIN32
+
+// See IsRelativeURL in the header file for usage.
+//
+// Decides whether |url| (|url_len| characters) is relative to the canonical
+// |base|. On a true return, |*is_relative| says whether the input is relative
+// and, if it is, |*relative_component| is the range of |url| to resolve.
+// Returns false when the input looks relative but |is_base_hierarchical| is
+// false, i.e. the base cannot be used to resolve relative URLs.
+template<typename CHAR>
+bool DoIsRelativeURL(const char* base,
+                     const url_parse::Parsed& base_parsed,
+                     const CHAR* url,
+                     int url_len,
+                     bool is_base_hierarchical,
+                     bool* is_relative,
+                     url_parse::Component* relative_component) {
+  *is_relative = false;  // So we can default later to not relative.
+
+  // Trim whitespace and construct a new range for the substring.
+  int begin = 0;
+  url_parse::TrimURL(url, &begin, &url_len);
+  if (begin >= url_len) {
+    // Empty URLs are relative, but do nothing.
+    *relative_component = url_parse::Component(begin, 0);
+    *is_relative = true;
+    return true;
+  }
+
+#ifdef WIN32
+  // We special case paths like "C:\foo" so they can link directly to the
+  // file on Windows (IE compatibility). The security domain stuff should
+  // prevent a link like this from actually being followed if it's on a
+  // web page.
+  //
+  // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
+  // as relative, as this will just replace the path when the base scheme
+  // is a file and the answer will still be correct.
+  //
+  // We require strict backslashes when detecting UNC since two forward
+  // slashes should be treated as a relative URL with a hostname.
+  if (url_parse::DoesBeginWindowsDriveSpec(url, begin, url_len) ||
+      url_parse::DoesBeginUNCPath(url, begin, url_len, true))
+    return true;
+#endif  // WIN32
+
+  // See if we've got a scheme, if not, we know this is a relative URL.
+  // BUT: Just because we have a scheme, doesn't make it absolute.
+  // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
+  // empty, we treat it as relative (":foo") like IE does.
+  url_parse::Component scheme;
+  if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) {
+    // Don't allow relative URLs if the base scheme doesn't support it.
+    if (!is_base_hierarchical)
+      return false;
+
+    *relative_component = url_parse::MakeRange(begin, url_len);
+    *is_relative = true;
+    return true;
+  }
+
+  // If the scheme isn't valid, then it's relative.
+  int scheme_end = scheme.end();
+  for (int i = scheme.begin; i < scheme_end; i++) {
+    if (!CanonicalSchemeChar(url[i])) {
+      // Same rule as the no-scheme case above: a base that does not support
+      // relative URLs cannot be used to resolve this input.
+      if (!is_base_hierarchical)
+        return false;
+
+      *relative_component = url_parse::MakeRange(begin, url_len);
+      *is_relative = true;
+      return true;
+    }
+  }
+
+  // If the scheme is not the same, then we can't count it as relative.
+  if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme))
+    return true;
+
+  // When the scheme that they both share is not hierarchical, treat the
+  // incoming scheme as absolute (this way with the base of "data:foo",
+  // "data:bar" will be reported as absolute).
+  if (!is_base_hierarchical)
+    return true;
+
+  int colon_offset = scheme.end();
+
+  // If it's a filesystem URL, the only valid way to make it relative is not to
+  // supply a scheme. There's no equivalent to e.g. http:index.html.
+  if (url_util::CompareSchemeComponent(url, scheme, "filesystem"))
+    return true;
+
+  // ExtractScheme guarantees that the colon immediately follows what it
+  // considers to be the scheme. CountConsecutiveSlashes will handle the
+  // case where the begin offset is the end of the input.
+  int num_slashes = url_parse::CountConsecutiveSlashes(url, colon_offset + 1,
+                                                       url_len);
+
+  if (num_slashes == 0 || num_slashes == 1) {
+    // No slashes means it's a relative path like "http:foo.html". One slash
+    // is an absolute path. "http:/home/foo.html"
+    *is_relative = true;
+    *relative_component = url_parse::MakeRange(colon_offset + 1, url_len);
+    return true;
+  }
+
+  // Two or more slashes after the scheme we treat as absolute.
+  return true;
+}
+
+// Appends to |output| the characters of |spec| from |begin| up to and
+// including the last '/' found in [begin, end). If the range contains no
+// slash, nothing is appended.
+//
+// The input is assumed to be canonical, so only exact forward slashes are
+// searched for (not backslashes), and the text is known to be ASCII.
+void CopyToLastSlash(const char* spec,
+                     int begin,
+                     int end,
+                     CanonOutput* output) {
+  // Walk backwards until a slash is found or the range is exhausted.
+  int last_slash = end - 1;
+  while (last_slash >= begin && spec[last_slash] != '/')
+    --last_slash;
+
+  if (last_slash < begin)
+    return;  // No slash.
+
+  // Copy everything through the slash.
+  int pos = begin;
+  while (pos <= last_slash) {
+    output->push_back(spec[pos]);
+    ++pos;
+  }
+}
+
+// Copies |source_component| of |source| verbatim to |output| and records the
+// written range in |output_component|. Used while resolving relative URLs for
+// components that are unchanged: the source should already be canonical (and
+// therefore ASCII), so no processing is applied. An absent component
+// (negative length) writes nothing and yields a default Component.
+void CopyOneComponent(const char* source,
+                      const url_parse::Component& source_component,
+                      CanonOutput* output,
+                      url_parse::Component* output_component) {
+  const bool present = source_component.len >= 0;
+  if (!present) {
+    // This component is not present.
+    *output_component = url_parse::Component();
+    return;
+  }
+
+  output_component->begin = output->length();
+  const int stop = source_component.end();
+  int pos = source_component.begin;
+  while (pos < stop) {
+    output->push_back(source[pos]);
+    ++pos;
+  }
+  output_component->len = output->length() - output_component->begin;
+}
+
+#ifdef WIN32
+
+// Called on Windows when the base URL is a file URL. If the base path begins
+// with a slash followed by a drive spec ("/C:") and the relative URL does not
+// supply its own drive spec at |path_start|, the slash and the two-character
+// drive spec are copied to |output|. Otherwise nothing is written.
+//
+// Returns the index in |base_url| of the next character to process:
+// |base_path_begin| + 3 when the drive spec was copied, or |base_path_begin|
+// itself in every other case (including an empty base path). The result can
+// be used as the starting offset for further path processing.
+template<typename CHAR>
+int CopyBaseDriveSpecIfNecessary(const char* base_url,
+                                 int base_path_begin,
+                                 int base_path_end,
+                                 const CHAR* relative_url,
+                                 int path_start,
+                                 int relative_url_len,
+                                 CanonOutput* output) {
+  // Nothing to do when the base has no path, or when the relative URL brings
+  // its own drive spec ("C:/foo"), which replaces the one in the base.
+  if (base_path_begin >= base_path_end ||
+      url_parse::DoesBeginWindowsDriveSpec(relative_url, path_start,
+                                           relative_url_len)) {
+    return base_path_begin;
+  }
+
+  // The base path should begin with a slash (as all canonical paths do); see
+  // whether a drive letter follows it.
+  const bool base_has_drive = DoesBeginSlashWindowsDriveSpec(
+      base_url, base_path_begin, base_path_end);
+  if (!base_has_drive)
+    return base_path_begin;
+
+  // Copy the slash and the two-character drive spec. The output now looks
+  // like "file:///C:" so the remainder can be treated as a standard path.
+  output->push_back('/');
+  for (int offset = 1; offset <= 2; ++offset)
+    output->push_back(base_url[base_path_begin + offset]);
+  return base_path_begin + 3;
+}
+
+#endif // WIN32
+
+// A subroutine of DoResolveRelativeURL, this resolves the URL knowing that
+// the input is a relative path or less (query or ref).
+//
+// Everything of |base_url| before its path is copied to |output| unchanged.
+// The path, query and ref are then taken from |relative_url| where it supplies
+// them and from the base otherwise, following the usual cascade: a new path
+// replaces path+query+ref, a new query replaces query+ref, and a new ref
+// replaces only the ref. Returns false only if path canonicalization failed;
+// query and ref canonicalization do not report failure here.
+template<typename CHAR>
+bool DoResolveRelativePath(const char* base_url,
+ const url_parse::Parsed& base_parsed,
+ bool base_is_file,
+ const CHAR* relative_url,
+ const url_parse::Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* out_parsed) {
+ bool success = true;
+
+ // We know the authority section didn't change, copy it to the output. We
+ // also know we have a path so can copy up to there.
+ url_parse::Component path, query, ref;
+ url_parse::ParsePathInternal(relative_url,
+ relative_component,
+ &path,
+ &query,
+ &ref);
+ // Canonical URLs always have a path, so we can use that offset.
+ output->Append(base_url, base_parsed.path.begin);
+
+ if (path.len > 0) {
+ // The path is replaced or modified.
+ int true_path_begin = output->length();
+
+ // For file: URLs on Windows, we don't want to treat the drive letter and
+ // colon as part of the path for relative file resolution when the
+ // incoming URL does not provide a drive spec. We save the true path
+ // beginning so we can fix it up after we are done.
+ int base_path_begin = base_parsed.path.begin;
+#ifdef WIN32
+ if (base_is_file) {
+ base_path_begin = CopyBaseDriveSpecIfNecessary(
+ base_url, base_parsed.path.begin, base_parsed.path.end(),
+ relative_url, relative_component.begin, relative_component.end(),
+ output);
+ // Now the output looks like either "file://" or "file:///C:"
+ // and we can start appending the rest of the path. |base_path_begin|
+ // points to the character in the base that comes next.
+ }
+#endif // WIN32
+
+ if (url_parse::IsURLSlash(relative_url[path.begin])) {
+ // Easy case: the path is an absolute path on the server, so we can
+ // just replace everything from the path on with the new versions.
+ // Since the input should be canonical hierarchical URL, we should
+ // always have a path.
+ success &= CanonicalizePath(relative_url, path,
+ output, &out_parsed->path);
+ } else {
+ // Relative path, replace the query, and reference. We take the
+ // original path with the file part stripped, and append the new path.
+ // The canonicalizer will take care of resolving ".." and "."
+ int path_begin = output->length();
+ CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(),
+ output);
+ success &= CanonicalizePartialPath(relative_url, path, path_begin,
+ output);
+ out_parsed->path = url_parse::MakeRange(path_begin, output->length());
+
+ // Copy the rest of the stuff after the path from the relative path.
+ }
+
+ // Finish with the query and reference part (these can't fail).
+ CanonicalizeQuery(relative_url, query, query_converter,
+ output, &out_parsed->query);
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+
+ // Fix the path beginning to add back the "C:" we may have written above.
+ // The end of the path computed above is kept; only the start is widened to
+ // |true_path_begin|.
+ out_parsed->path = url_parse::MakeRange(true_path_begin,
+ out_parsed->path.end());
+ return success;
+ }
+
+ // If we get here, the path is unchanged: copy to output.
+ CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path);
+
+ if (query.is_valid()) {
+ // Just the query specified, replace the query and reference (ignore
+ // failures for refs)
+ CanonicalizeQuery(relative_url, query, query_converter,
+ output, &out_parsed->query);
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+ return success;
+ }
+
+ // If we get here, the query is unchanged: copy to output. Note that the
+ // range of the query parameter doesn't include the question mark, so we
+ // have to add it manually if there is a component.
+ if (base_parsed.query.is_valid())
+ output->push_back('?');
+ CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query);
+
+ if (ref.is_valid()) {
+ // Just the reference specified: replace it (ignoring failures).
+ CanonicalizeRef(relative_url, ref, output, &out_parsed->ref);
+ return success;
+ }
+
+ // We should always have something to do in this function, the caller checks
+ // that some component is being replaced.
+ DCHECK(false) << "Not reached";
+ return success;
+}
+
+// Resolves a relative URL that contains a host. Typically, these will
+// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which
+// should be kept from the original URL is the scheme.
+//
+// |relative_component| is the range of |relative_url| to resolve; it does not
+// have to start at offset 0 (for example when leading whitespace was trimmed
+// by the caller). The result of ReplaceStandardURL is returned.
+template<typename CHAR>
+bool DoResolveRelativeHost(const char* base_url,
+                           const url_parse::Parsed& base_parsed,
+                           const CHAR* relative_url,
+                           const url_parse::Component& relative_component,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           url_parse::Parsed* out_parsed) {
+  // Parse the relative URL, just like we would for anything following a
+  // scheme.
+  //
+  // The parser is given the unshifted |relative_url| together with the
+  // absolute begin/end offsets of the component. The components it produces
+  // are therefore indices into |relative_url|, which is what the replacement
+  // setters below pair them with. Passing a pointer already advanced by
+  // |relative_component.begin| would apply that offset twice.
+  url_parse::Parsed relative_parsed;  // Everything but the scheme is valid.
+  url_parse::ParseAfterScheme(relative_url, relative_component.end(),
+                              relative_component.begin, &relative_parsed);
+
+  // Now we can just use the replacement function to replace all the necessary
+  // parts of the old URL with the new one.
+  Replacements<CHAR> replacements;
+  replacements.SetUsername(relative_url, relative_parsed.username);
+  replacements.SetPassword(relative_url, relative_parsed.password);
+  replacements.SetHost(relative_url, relative_parsed.host);
+  replacements.SetPort(relative_url, relative_parsed.port);
+  replacements.SetPath(relative_url, relative_parsed.path);
+  replacements.SetQuery(relative_url, relative_parsed.query);
+  replacements.SetRef(relative_url, relative_parsed.ref);
+
+  return ReplaceStandardURL(base_url, base_parsed, replacements,
+                            query_converter, output, out_parsed);
+}
+
+// Resolves a relative URL that happens to be an absolute file path. Examples
+// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
+//
+// Only the |relative_component| portion of |relative_url| is used; it is
+// parsed and canonicalized as a standalone file URL. Returns the result of
+// CanonicalizeFileURL.
+template<typename CHAR>
+bool DoResolveAbsoluteFile(const CHAR* relative_url,
+                           const url_parse::Component& relative_component,
+                           CharsetConverter* query_converter,
+                           CanonOutput* output,
+                           url_parse::Parsed* out_parsed) {
+  // Both calls below see the same sub-string, so the parsed offsets are
+  // relative to |file_spec| rather than to |relative_url|.
+  const CHAR* file_spec = &relative_url[relative_component.begin];
+  const int file_spec_len = relative_component.len;
+
+  // Parse the file URL. The file URL parsing function uses the same logic
+  // as we do for determining if the file is absolute, in which case it will
+  // not bother to look for a scheme.
+  url_parse::Parsed file_parsed;
+  url_parse::ParseFileURL(file_spec, file_spec_len, &file_parsed);
+
+  return CanonicalizeFileURL(file_spec, file_spec_len, file_parsed,
+                             query_converter, output, out_parsed);
+}
+
+// TODO(brettw) treat two slashes as root like Mozilla for FTP?
+//
+// Resolves |relative_component| of |relative_url| against the canonical
+// |base_url|, writing the result to |output| and its layout to |out_parsed|.
+// Dispatches to the absolute-file, scheme-relative (host) or same-host (path)
+// resolver depending on the leading slashes and on |base_is_file|.
+// Returns false when the base has no path (the base is then copied to the
+// output unchanged) or when the selected resolver fails.
+template<typename CHAR>
+bool DoResolveRelativeURL(const char* base_url,
+ const url_parse::Parsed& base_parsed,
+ bool base_is_file,
+ const CHAR* relative_url,
+ const url_parse::Component& relative_component,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* out_parsed) {
+ // Starting point for our output parsed. We'll fix what we change.
+ *out_parsed = base_parsed;
+
+ // Sanity check: the input should have a host or we'll break badly below.
+ // We can only resolve relative URLs with base URLs that have hosts and
+ // paths (even the default path of "/" is OK).
+ //
+ // We allow hosts with no length so we can handle file URLs, for example.
+ if (base_parsed.path.len <= 0) {
+ // On error, return the input (resolving a relative URL on a non-relative
+ // base = the base).
+ int base_len = base_parsed.Length();
+ for (int i = 0; i < base_len; i++)
+ output->push_back(base_url[i]);
+ return false;
+ }
+
+ if (relative_component.len <= 0) {
+ // Empty relative URL, leave unchanged, only removing the ref component.
+ int base_len = base_parsed.Length();
+ // Drops the ref text plus its '#'. When the base has no ref, ref.len is -1
+ // and this subtracts zero.
+ base_len -= base_parsed.ref.len + 1;
+ out_parsed->ref.reset();
+ output->Append(base_url, base_len);
+ return true;
+ }
+
+ int num_slashes = url_parse::CountConsecutiveSlashes(
+ relative_url, relative_component.begin, relative_component.end());
+
+#ifdef WIN32
+ // On Windows, two slashes for a file path (regardless of which direction
+ // they are) means that it's UNC. Two backslashes on any base scheme mean
+ // that it's an absolute UNC path (we use the base_is_file flag to control
+ // how strict the UNC finder is).
+ //
+ // We also allow Windows absolute drive specs on any scheme (for example
+ // "c:\foo") like IE does. There must be no preceding slashes in this
+ // case (we reject anything like "/c:/foo") because that should be treated
+ // as a path. For file URLs, we allow any number of slashes since that would
+ // be setting the path.
+ //
+ // This assumes the absolute path resolver handles absolute URLs like this
+ // properly. url_util::DoCanonicalize does this.
+ int after_slashes = relative_component.begin + num_slashes;
+ if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin,
+ relative_component.end(), !base_is_file) ||
+ ((num_slashes == 0 || base_is_file) &&
+ url_parse::DoesBeginWindowsDriveSpec(relative_url, after_slashes,
+ relative_component.end()))) {
+ return DoResolveAbsoluteFile(relative_url, relative_component,
+ query_converter, output, out_parsed);
+ }
+#else
+ // Other platforms need explicit handling for file: URLs with multiple
+ // slashes because the generic scheme parsing always extracts a host, but a
+ // file: URL only has a host if it has exactly 2 slashes. This also
+ // handles the special case where the URL is only slashes, since that
+ // doesn't have a host part either.
+ if (base_is_file &&
+ (num_slashes > 2 || num_slashes == relative_component.len)) {
+ return DoResolveAbsoluteFile(relative_url, relative_component,
+ query_converter, output, out_parsed);
+ }
+#endif
+
+ // Any other double-slashes mean that this is relative to the scheme.
+ if (num_slashes >= 2) {
+ return DoResolveRelativeHost(base_url, base_parsed,
+ relative_url, relative_component,
+ query_converter, output, out_parsed);
+ }
+
+ // When we get here, we know that the relative URL is on the same host.
+ return DoResolveRelativePath(base_url, base_parsed, base_is_file,
+ relative_url, relative_component,
+ query_converter, output, out_parsed);
+}
+
+} // namespace
+
+// 8-bit overload: forwards to DoIsRelativeURL, treating |fragment| as the
+// candidate URL to test against |base|.
+bool IsRelativeURL(const char* base,
+                   const url_parse::Parsed& base_parsed,
+                   const char* fragment,
+                   int fragment_len,
+                   bool is_base_hierarchical,
+                   bool* is_relative,
+                   url_parse::Component* relative_component) {
+  const bool ok = DoIsRelativeURL<char>(base, base_parsed,
+                                        fragment, fragment_len,
+                                        is_base_hierarchical,
+                                        is_relative, relative_component);
+  return ok;
+}
+
+// 16-bit overload: forwards to DoIsRelativeURL, treating |fragment| as the
+// candidate URL to test against the 8-bit |base|.
+bool IsRelativeURL(const char* base,
+                   const url_parse::Parsed& base_parsed,
+                   const char16* fragment,
+                   int fragment_len,
+                   bool is_base_hierarchical,
+                   bool* is_relative,
+                   url_parse::Component* relative_component) {
+  const bool ok = DoIsRelativeURL<char16>(base, base_parsed,
+                                          fragment, fragment_len,
+                                          is_base_hierarchical,
+                                          is_relative, relative_component);
+  return ok;
+}
+
+// 8-bit overload: forwards to DoResolveRelativeURL.
+bool ResolveRelativeURL(const char* base_url,
+                        const url_parse::Parsed& base_parsed,
+                        bool base_is_file,
+                        const char* relative_url,
+                        const url_parse::Component& relative_component,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* out_parsed) {
+  const bool ok = DoResolveRelativeURL<char>(
+      base_url, base_parsed, base_is_file,
+      relative_url, relative_component,
+      query_converter, output, out_parsed);
+  return ok;
+}
+
+// 16-bit overload: forwards to DoResolveRelativeURL. The base is always
+// 8-bit; only the relative URL is UTF-16.
+bool ResolveRelativeURL(const char* base_url,
+                        const url_parse::Parsed& base_parsed,
+                        bool base_is_file,
+                        const char16* relative_url,
+                        const url_parse::Component& relative_component,
+                        CharsetConverter* query_converter,
+                        CanonOutput* output,
+                        url_parse::Parsed* out_parsed) {
+  const bool ok = DoResolveRelativeURL<char16>(
+      base_url, base_parsed, base_is_file,
+      relative_url, relative_component,
+      query_converter, output, out_parsed);
+  return ok;
+}
+
+} // namespace url_canon
diff --git a/url/url_canon_stdstring.h b/url/url_canon_stdstring.h
new file mode 100644
index 0000000..21272e0
--- /dev/null
+++ b/url/url_canon_stdstring.h
@@ -0,0 +1,134 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This header file defines a canonicalizer output method class for STL
+// strings. Because the canonicalizer tries not to be dependent on the STL,
+// we have segregated it here.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+#define GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+
+#include <string>
+#include "googleurl/src/url_canon.h"
+
+namespace url_canon {
+
+ // Write into a std::string given in the constructor. This object does not own
+ // the string itself, and the user must ensure that the string stays alive
+ // throughout the lifetime of this object.
+ //
+ // The given string will be appended to; any existing data in the string will
+ // be preserved. The caller should reserve() the amount of data in the string
+ // they expect to be written. We will resize if necessary, but that's slow.
+ //
+ // Note that when canonicalization is complete, the string will likely have
+ // unused space at the end because the constructor grows the string to its
+ // full capacity() up front. This ends up being important because resize
+ // operations are slow, and because the base class needs to write directly
+ // into the buffer.
+ //
+ // Therefore, the user should call Complete() before using the string that
+ // this class wrote into.
+ class StdStringCanonOutput : public CanonOutput {
+  public:
+   // |str| must be non-NULL (it is dereferenced immediately). The constructor
+   // is explicit so that a std::string* is never silently converted into an
+   // output object.
+   explicit StdStringCanonOutput(std::string* str)
+       : CanonOutput(),
+         str_(str) {
+     // cur_len_, buffer_ and buffer_len_ are not declared in this class; they
+     // come from the CanonOutput base.
+     cur_len_ = static_cast<int>(str_->size());  // Append to existing data.
+     // Expose all already-allocated storage as writable buffer space.
+     str_->resize(str_->capacity());
+     // Avoid indexing element 0 of an empty string.
+     buffer_ = str_->empty() ? NULL : &(*str_)[0];
+     buffer_len_ = static_cast<int>(str_->size());
+   }
+   virtual ~StdStringCanonOutput() {
+     // Nothing to do, we don't own the string.
+   }
+
+   // Must be called after writing has completed but before the string is used.
+   // Trims the string back down to the number of characters recorded in
+   // cur_len_.
+   void Complete() {
+     str_->resize(cur_len_);
+     buffer_len_ = cur_len_;
+   }
+
+   // Resizes the backing string to |sz| characters and refreshes the buffer
+   // pointer, since resizing may move the string's storage.
+   virtual void Resize(int sz) {
+     str_->resize(sz);
+     buffer_ = str_->empty() ? NULL : &(*str_)[0];
+     buffer_len_ = sz;
+   }
+
+  protected:
+   std::string* str_;  // Not owned.
+ };
+
+ // Convenience layer over Replacements that accepts string objects (anything
+ // with value_type, data() and length()) instead of pointer + Component pairs.
+ //
+ // None of the strings are copied: only their data() pointers are handed to
+ // the base class, so every argument must outlive this object.
+ template<typename STR>
+ class StdStringReplacements :
+     public url_canon::Replacements<typename STR::value_type> {
+  public:
+   void SetSchemeStr(const STR& s) { this->SetScheme(s.data(), WholeOf(s)); }
+   void SetUsernameStr(const STR& s) { this->SetUsername(s.data(), WholeOf(s)); }
+   void SetPasswordStr(const STR& s) { this->SetPassword(s.data(), WholeOf(s)); }
+   void SetHostStr(const STR& s) { this->SetHost(s.data(), WholeOf(s)); }
+   void SetPortStr(const STR& s) { this->SetPort(s.data(), WholeOf(s)); }
+   void SetPathStr(const STR& s) { this->SetPath(s.data(), WholeOf(s)); }
+   void SetQueryStr(const STR& s) { this->SetQuery(s.data(), WholeOf(s)); }
+   void SetRefStr(const STR& s) { this->SetRef(s.data(), WholeOf(s)); }
+
+  private:
+   // Builds the component that spans all of |s|: offset 0, length s.length().
+   static url_parse::Component WholeOf(const STR& s) {
+     return url_parse::Component(0, static_cast<int>(s.length()));
+   }
+ };
+
+} // namespace url_canon
+
+#endif // GOOGLEURL_SRC_URL_CANON_STDSTRING_H__
+
diff --git a/url/url_canon_stdurl.cc b/url/url_canon_stdurl.cc
new file mode 100644
index 0000000..1e21a14
--- /dev/null
+++ b/url/url_canon_stdurl.cc
@@ -0,0 +1,211 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Functions to canonicalize "standard" URLs, which are ones that have an
+// authority section including a host name.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+ // Canonicalizes a "standard" URL piece by piece, appending to |output| in
+ // the order scheme, authority (user info, host, port), path, query, ref, and
+ // recording where each piece landed in |new_parsed|.
+ //
+ // |source| supplies one input buffer per component and |parsed| gives the
+ // range of each component inside its buffer. |query_converter| is passed
+ // through to CanonicalizeQuery untouched.
+ //
+ // Returns false when the scheme, user info, host, port or path canonicalizer
+ // reports failure, when the host is empty, or when there is no authority at
+ // all. The query and ref steps never influence the return value. Output is
+ // still written on failure.
+ //
+ // The UCHAR template parameter is not referenced anywhere in this body.
+ template<typename CHAR, typename UCHAR>
+ bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
+ const url_parse::Parsed& parsed,
+ CharsetConverter* query_converter,
+ CanonOutput* output,
+ url_parse::Parsed* new_parsed) {
+ // Scheme: this will append the colon.
+ bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
+ output, &new_parsed->scheme);
+
+ // Authority (username, password, host, port)
+ // |have_authority| is assigned on both branches below before it is read.
+ bool have_authority;
+ if (parsed.username.is_valid() || parsed.password.is_valid() ||
+ parsed.host.is_nonempty() || parsed.port.is_valid()) {
+ have_authority = true;
+
+ // Only write the authority separators when we have a scheme.
+ if (parsed.scheme.is_valid()) {
+ output->push_back('/');
+ output->push_back('/');
+ }
+
+ // User info: the canonicalizer will handle the : and @.
+ success &= CanonicalizeUserInfo(source.username, parsed.username,
+ source.password, parsed.password,
+ output,
+ &new_parsed->username,
+ &new_parsed->password);
+
+ success &= CanonicalizeHost(source.host, parsed.host,
+ output, &new_parsed->host);
+
+ // Host must not be empty for standard URLs.
+ if (!parsed.host.is_nonempty())
+ success = false;
+
+ // Port: the port canonicalizer will handle the colon.
+ // The default port is looked up from the already-canonicalized scheme
+ // text in |output|, not from the raw input.
+ int default_port = DefaultPortForScheme(
+ &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len);
+ success &= CanonicalizePort(source.port, parsed.port, default_port,
+ output, &new_parsed->port);
+ } else {
+ // No authority, clear the components.
+ have_authority = false;
+ new_parsed->host.reset();
+ new_parsed->username.reset();
+ new_parsed->password.reset();
+ new_parsed->port.reset();
+ success = false; // Standard URLs must have an authority.
+ }
+
+ // Path
+ if (parsed.path.is_valid()) {
+ success &= CanonicalizePath(source.path, parsed.path,
+ output, &new_parsed->path);
+ } else if (have_authority ||
+ parsed.query.is_valid() || parsed.ref.is_valid()) {
+ // When we have an empty path, make up a path when we have an authority
+ // or something following the path. The only time we allow an empty
+ // output path is when there is nothing else.
+ new_parsed->path = url_parse::Component(output->length(), 1);
+ output->push_back('/');
+ } else {
+ // No path at all
+ new_parsed->path.reset();
+ }
+
+ // Query
+ // No result from this call is folded into |success|.
+ CanonicalizeQuery(source.query, parsed.query, query_converter,
+ output, &new_parsed->query);
+
+ // Ref: ignore failure for this, since the page can probably still be loaded.
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
+
+ return success;
+ }
+
+} // namespace
+
+
+ // Maps a canonical scheme to its well-known port. |scheme| need not be
+ // NUL-terminated; exactly |scheme_len| characters are considered. Any scheme
+ // that is not in the table below yields url_parse::PORT_UNSPECIFIED.
+ int DefaultPortForScheme(const char* scheme, int scheme_len) {
+   struct KnownScheme {
+     const char* name;
+     int name_len;
+     int port;
+   };
+   static const KnownScheme kKnownSchemes[] = {
+     {"http", 4, 80},
+     {"https", 5, 443},
+     {"ftp", 3, 21},
+     {"wss", 3, 443},
+     {"gopher", 6, 70},
+     {"ws", 2, 80},
+   };
+   const size_t kNumKnownSchemes =
+       sizeof(kKnownSchemes) / sizeof(kKnownSchemes[0]);
+
+   for (size_t i = 0; i < kNumKnownSchemes; ++i) {
+     const KnownScheme& candidate = kKnownSchemes[i];
+     // Compare lengths first so |scheme| is only read when the length is one
+     // we know about, and never past |scheme_len| characters.
+     if (scheme_len != candidate.name_len)
+       continue;
+     if (strncmp(scheme, candidate.name, scheme_len) == 0)
+       return candidate.port;
+   }
+   return url_parse::PORT_UNSPECIFIED;
+ }
+
+ // 8-bit entry point. Every component is read from the single |spec| buffer.
+ // |spec_len| is accepted but not used by this function.
+ bool CanonicalizeStandardURL(const char* spec,
+                              int spec_len,
+                              const url_parse::Parsed& parsed,
+                              CharsetConverter* query_converter,
+                              CanonOutput* output,
+                              url_parse::Parsed* new_parsed) {
+   const URLComponentSource<char> all_from_spec(spec);
+   return DoCanonicalizeStandardURL<char, unsigned char>(
+       all_from_spec, parsed, query_converter, output, new_parsed);
+ }
+
+ // 16-bit entry point. Same as above with char16 input; |spec_len| is likewise
+ // not used.
+ bool CanonicalizeStandardURL(const char16* spec,
+                              int spec_len,
+                              const url_parse::Parsed& parsed,
+                              CharsetConverter* query_converter,
+                              CanonOutput* output,
+                              url_parse::Parsed* new_parsed) {
+   const URLComponentSource<char16> all_from_spec(spec);
+   return DoCanonicalizeStandardURL<char16, char16>(
+       all_from_spec, parsed, query_converter, output, new_parsed);
+ }
+
+// It might be nice in the future to optimize this so unchanged components don't
+// need to be recanonicalized. This is especially true since the common case for
+// ReplaceComponents is removing things we don't want, like reference fragments
+// and usernames. These cases can become more efficient if we can assume the
+// rest of the URL is OK with these removed (or only the modified parts
+// recanonicalized). This would be much more complex to implement, however.
+//
+// You would also need to update DoReplaceComponents in url_util.cc which
+// relies on this re-checking everything (see the comment there for why).
+ // 8-bit replacements: start from the base URL's buffers and ranges, let
+ // SetupOverrideComponents apply |replacements| to them, then canonicalize the
+ // result from scratch.
+ bool ReplaceStandardURL(const char* base,
+                         const url_parse::Parsed& base_parsed,
+                         const Replacements<char>& replacements,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+   URLComponentSource<char> merged_source(base);
+   url_parse::Parsed merged_parsed(base_parsed);
+   SetupOverrideComponents(base, replacements, &merged_source, &merged_parsed);
+   return DoCanonicalizeStandardURL<char, unsigned char>(
+       merged_source, merged_parsed, query_converter, output, new_parsed);
+ }
+
+ // For 16-bit replacements, we turn all the replacements into UTF-8 so the
+ // regular codepath can be used.
+ bool ReplaceStandardURL(const char* base,
+                         const url_parse::Parsed& base_parsed,
+                         const Replacements<char16>& replacements,
+                         CharsetConverter* query_converter,
+                         CanonOutput* output,
+                         url_parse::Parsed* new_parsed) {
+   // Scratch space handed to SetupUTF16OverrideComponents. It is declared
+   // first and lives until after canonicalization below has finished.
+   // NOTE(review): presumably |merged_source| ends up pointing into this
+   // buffer -- confirm in SetupUTF16OverrideComponents.
+   RawCanonOutput<1024> utf8_scratch;
+   URLComponentSource<char> merged_source(base);
+   url_parse::Parsed merged_parsed(base_parsed);
+   SetupUTF16OverrideComponents(base, replacements, &utf8_scratch,
+                                &merged_source, &merged_parsed);
+   return DoCanonicalizeStandardURL<char, unsigned char>(
+       merged_source, merged_parsed, query_converter, output, new_parsed);
+ }
+
+} // namespace url_canon
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
new file mode 100644
index 0000000..0c57f55
--- /dev/null
+++ b/url/url_canon_unittest.cc
@@ -0,0 +1,2133 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <errno.h>
+#include <unicode/ucnv.h>
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_icu.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_parse.h"
+#include "googleurl/src/url_test_utils.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// Some implementations of base/basictypes.h may define ARRAYSIZE.
+// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro
+// which is in our version of basictypes.h.
+#ifndef ARRAYSIZE
+#define ARRAYSIZE ARRAYSIZE_UNSAFE
+#endif
+
+using url_test_utils::WStringToUTF16;
+using url_test_utils::ConvertUTF8ToUTF16;
+using url_test_utils::ConvertUTF16ToUTF8;
+using url_canon::CanonHostInfo;
+
+namespace {
+
+ // One table-driven test case for a single-component canonicalizer with 8-bit
+ // input. Instances are aggregate-initialized, so field order matters.
+ struct ComponentCase {
+ const char* input;  // Raw text handed to the canonicalizer.
+ const char* expected;  // Exact text the canonicalizer should write.
+ url_parse::Component expected_component;  // Expected begin/len of the output component.
+ bool expected_success;  // Expected return value of the canonicalizer.
+ };
+
+ // ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
+ // treat each input as optional, and will only try processing if non-NULL.
+ // The output is always 8-bit. Instances are aggregate-initialized, so field
+ // order matters.
+ struct DualComponentCase {
+ const char* input8;  // 8-bit input, or NULL to skip the 8-bit pass.
+ const wchar_t* input16;  // Wide input, or NULL to skip the 16-bit pass.
+ const char* expected;  // Expected 8-bit output text.
+ url_parse::Component expected_component;  // Expected output component range.
+ bool expected_success;  // Expected canonicalizer return value.
+ };
+
+ // Test cases for CanonicalizeIPAddress(). The inputs are identical to
+ // DualComponentCase, but the output has extra CanonHostInfo fields.
+ // The Host test also reuses this struct: there, a case is expected to
+ // succeed exactly when expected_family is not CanonHostInfo::BROKEN.
+ struct IPAddressCase {
+ const char* input8;  // 8-bit input, or NULL to skip the narrow pass.
+ const wchar_t* input16;  // Wide input for the 16-bit pass.
+ const char* expected;  // Expected canonical output text.
+ url_parse::Component expected_component;  // Expected output component range.
+
+ // CanonHostInfo fields, for verbose output.
+ CanonHostInfo::Family expected_family;
+ int expected_num_ipv4_components;  // The tables use -1 for non-IPv4 cases.
+ const char* expected_address_hex; // Two hex chars per IP address byte.
+ };
+
+ // Renders the first |length| bytes of |bytes| as hex text, two characters per
+ // byte with the high nibble first, using url_canon::kHexCharLookup for the
+ // digits. A gtest failure is recorded (but conversion still proceeds) when
+ // |length| is not 0, 4 or 16.
+ std::string BytesToHexString(unsigned char bytes[16], int length) {
+   EXPECT_TRUE(length == 0 || length == 4 || length == 16)
+       << "Bad IP address length: " << length;
+   std::string hex;
+   int index = 0;
+   while (index < length) {
+     const unsigned char byte = bytes[index];
+     const int high_nibble = (byte >> 4) & 0xf;
+     const int low_nibble = byte & 0xf;
+     hex.push_back(url_canon::kHexCharLookup[high_nibble]);
+     hex.push_back(url_canon::kHexCharLookup[low_nibble]);
+     ++index;
+   }
+   return hex;
+ }
+
+ // One table-driven case for the component-replacement tests: a base URL, one
+ // replacement string per URL component, and the expected resulting URL.
+ // Instances are aggregate-initialized, so field order matters.
+ // NOTE(review): presumably a NULL replacement leaves the component untouched
+ // and the kDeleteComp string ("|") clears it, as SetupReplComp does -- confirm
+ // against the tests that consume this struct.
+ struct ReplaceCase {
+ const char* base;  // URL the replacements are applied to.
+ const char* scheme;
+ const char* username;
+ const char* password;
+ const char* host;
+ const char* port;
+ const char* path;
+ const char* query;
+ const char* ref;
+ const char* expected;  // Expected URL text after replacement.
+ };
+
+ // Wrapper around a UConverter object that manages creation and destruction.
+ //
+ // The converter is opened in the constructor and closed in the destructor.
+ // Because this class is the sole owner of the handle, copying is disabled:
+ // a copy would make two destructors close the same converter.
+ class UConvScoper {
+  public:
+   // Opens a converter for |charset_name|. The ICU error code is not kept;
+   // callers detect failure by checking converter() for NULL.
+   explicit UConvScoper(const char* charset_name) {
+     UErrorCode err = U_ZERO_ERROR;
+     converter_ = ucnv_open(charset_name, &err);
+   }
+
+   ~UConvScoper() {
+     if (converter_)
+       ucnv_close(converter_);
+   }
+
+   // Returns the converter object, may be NULL.
+   UConverter* converter() const { return converter_; }
+
+  private:
+   // Declared but intentionally not defined, to forbid copy and assignment.
+   UConvScoper(const UConvScoper&);
+   void operator=(const UConvScoper&);
+
+   UConverter* converter_;
+ };
+
+ // Sentinel replacement text: a replacement string that starts with this
+ // character makes SetupReplComp clear the component instead of setting it.
+ const char kDeleteComp[] = "|";
+
+ // Applies one optional replacement to |rep| through the supplied
+ // pointers-to-member:
+ //   - |str| is NULL: nothing happens.
+ //   - |str| begins with kDeleteComp's character: |clear| is invoked.
+ //   - otherwise: |set| is invoked with |str| and a component covering all
+ //     of it.
+ //
+ // Only the 8-bit instantiation works today because the length is taken with
+ // strlen; the function stays a template in case wide replacement tests are
+ // added later.
+ template<typename CHAR>
+ void SetupReplComp(
+     void (url_canon::Replacements<CHAR>::*set)(const CHAR*,
+                                                const url_parse::Component&),
+     void (url_canon::Replacements<CHAR>::*clear)(),
+     url_canon::Replacements<CHAR>* rep,
+     const CHAR* str) {
+   if (!str)
+     return;  // No replacement requested for this component.
+
+   if (str[0] == kDeleteComp[0]) {
+     (rep->*clear)();
+     return;
+   }
+
+   const int str_len = static_cast<int>(strlen(str));
+   (rep->*set)(str, url_parse::Component(0, str_len));
+ }
+
+} // namespace
+
+ // Checks that AppendUTF8Value emits the right byte sequence for code points
+ // of each encoded length (1 to 4 bytes), up to the maximum U+10FFFF.
+ TEST(URLCanonTest, DoAppendUTF8) {
+   struct UTF8Case {
+     unsigned input;
+     const char* output;
+   } utf_cases[] = {
+     // Valid code points.
+     {0x24, "\x24"},
+     {0xA2, "\xC2\xA2"},
+     {0x20AC, "\xE2\x82\xAC"},
+     {0x24B62, "\xF0\xA4\xAD\xA2"},
+     {0x10FFFF, "\xF4\x8F\xBF\xBF"},
+   };
+
+   std::string out_str;
+   const size_t case_count = ARRAYSIZE(utf_cases);
+   for (size_t case_index = 0; case_index < case_count; ++case_index) {
+     const UTF8Case& test_case = utf_cases[case_index];
+
+     // Start every case from an empty string so only this code point's bytes
+     // are present after Complete().
+     out_str.clear();
+     url_canon::StdStringCanonOutput output(&out_str);
+     url_canon::AppendUTF8Value(test_case.input, &output);
+     output.Complete();
+
+     EXPECT_EQ(test_case.output, out_str);
+   }
+ }
+
+// TODO(mattm): Can't run this in debug mode for now, since the DCHECK will
+// cause the Chromium stacktrace dialog to appear and hang the test.
+// See http://crbug.com/49580.
+#if defined(GTEST_HAS_DEATH_TEST) && defined(NDEBUG)
+ // Feeds AppendUTF8Value a value past the last valid code point and checks
+ // that nothing ends up in the output string.
+ // NOTE(review): the enclosing #if only compiles this test when NDEBUG is
+ // defined; gtest's ASSERT_DEBUG_DEATH is documented to just run the statement
+ // in that mode rather than expect a crash -- confirm against the gtest version
+ // in use.
+ TEST(URLCanonTest, DoAppendUTF8Invalid) {
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ // Invalid code point (too large).
+ ASSERT_DEBUG_DEATH({
+ url_canon::AppendUTF8Value(0x110000, &output);
+ output.Complete();
+ EXPECT_EQ("", out_str);
+ }, "");
+ }
+#endif
+
+ // Runs each case through AppendUTF8EscapedChar once per available encoding
+ // (8-bit and/or 16-bit) and checks both the escaped output and the combined
+ // success flag. For cases that are expected to succeed and have both inputs,
+ // it also checks that the two inputs convert to each other.
+ TEST(URLCanonTest, UTF) {
+ // Low-level test that we handle reading, canonicalization, and writing
+ // UTF-8/UTF-16 strings properly.
+ struct UTFCase {
+ const char* input8;
+ const wchar_t* input16;
+ bool expected_success;
+ const char* output;
+ } utf_cases[] = {
+ // Valid canonical input should get passed through & escaped.
+ {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
+ // Test a character that takes > 16 bits (U+10300 = old italic letter A)
+ {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
+ // Non-shortest-form UTF-8 are invalid. The bad char should be replaced
+ // with the invalid character (EF BF BD in UTF-8).
+ {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"},
+ // Invalid UTF-8 sequences should be marked as invalid (the first
+ // sequence is truncated).
+ {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
+ // Character going off the end.
+ {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
+ // ...same with low surrogates with no high surrogate.
+ {"\xed\xb0\x80", L"\xdc00", false, "%EF%BF%BD"},
+ // Test a UTF-8 encoded surrogate value is marked as invalid.
+ // ED A0 80 = U+D800
+ {"\xed\xa0\x80", NULL, false, "%EF%BF%BD"},
+ };
+
+ std::string out_str;
+ for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) {
+ // 8-bit pass (skipped when the case has no 8-bit input).
+ if (utf_cases[i].input8) {
+ out_str.clear();
+ url_canon::StdStringCanonOutput output(&out_str);
+
+ int input_len = static_cast<int>(strlen(utf_cases[i].input8));
+ bool success = true;
+ // |ch| is passed by pointer, so the callee can move it in addition to the
+ // loop's own increment.
+ for (int ch = 0; ch < input_len; ch++) {
+ success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len,
+ &output);
+ }
+ output.Complete();
+ EXPECT_EQ(utf_cases[i].expected_success, success);
+ EXPECT_EQ(std::string(utf_cases[i].output), out_str);
+ }
+ // 16-bit pass (skipped when the case has no wide input).
+ if (utf_cases[i].input16) {
+ out_str.clear();
+ url_canon::StdStringCanonOutput output(&out_str);
+
+ string16 input_str(WStringToUTF16(utf_cases[i].input16));
+ int input_len = static_cast<int>(input_str.length());
+ bool success = true;
+ for (int ch = 0; ch < input_len; ch++) {
+ success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
+ &output);
+ }
+ output.Complete();
+ EXPECT_EQ(utf_cases[i].expected_success, success);
+ EXPECT_EQ(std::string(utf_cases[i].output), out_str);
+ }
+
+ if (utf_cases[i].input8 && utf_cases[i].input16 &&
+ utf_cases[i].expected_success) {
+ // Check that the UTF-8 and UTF-16 inputs are equivalent.
+
+ // UTF-16 -> UTF-8
+ std::string input8_str(utf_cases[i].input8);
+ string16 input16_str(WStringToUTF16(utf_cases[i].input16));
+ EXPECT_EQ(input8_str, ConvertUTF16ToUTF8(input16_str));
+
+ // UTF-8 -> UTF-16
+ EXPECT_EQ(input16_str, ConvertUTF8ToUTF16(input8_str));
+ }
+ }
+ }
+
+ // Exercises ICUCharsetConverter::ConvertFromUTF16 in two ways: a table of
+ // fixed conversions into UTF-8 and Big5, then inputs whose lengths straddle
+ // the inline capacity of a RawCanonOutput to make sure the output grows.
+ TEST(URLCanonTest, ICUCharsetConverter) {
+   struct ICUCase {
+     const wchar_t* input;
+     const char* encoding;
+     const char* expected;
+   } icu_cases[] = {
+     // UTF-8.
+     {L"Hello, world", "utf-8", "Hello, world"},
+     {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+     // Non-BMP UTF-8.
+     {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
+     // Big5
+     {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
+     // Unrepresentable character in the destination set.
+     {L"hello\x4f60\x06de\x597dworld", "big5", "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
+   };
+
+   for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) {
+     // A fresh ICU converter is opened for every case's encoding; the test
+     // aborts if ICU cannot provide it.
+     UConvScoper scoped_conv(icu_cases[i].encoding);
+     ASSERT_TRUE(scoped_conv.converter() != NULL);
+     url_canon::ICUCharsetConverter charset_converter(scoped_conv.converter());
+
+     std::string converted;
+     url_canon::StdStringCanonOutput output(&converted);
+
+     string16 utf16_input(WStringToUTF16(icu_cases[i].input));
+     charset_converter.ConvertFromUTF16(
+         utf16_input.c_str(), static_cast<int>(utf16_input.length()), &output);
+     output.Complete();
+
+     EXPECT_STREQ(icu_cases[i].expected, converted.c_str());
+   }
+
+   // Test string sizes around the resize boundary for the output to make sure
+   // the converter resizes as needed.
+   const int static_size = 16;
+   UConvScoper utf8_conv("utf-8");
+   ASSERT_TRUE(utf8_conv.converter());
+   url_canon::ICUCharsetConverter utf8_converter(utf8_conv.converter());
+   for (int target_len = static_size - 2; target_len <= static_size + 2;
+        target_len++) {
+     // Make a string with the appropriate length.
+     string16 repeated_a;
+     for (int count = 0; count < target_len; count++)
+       repeated_a.push_back('a');
+
+     url_canon::RawCanonOutput<static_size> output;
+     utf8_converter.ConvertFromUTF16(repeated_a.c_str(),
+                                     static_cast<int>(repeated_a.length()),
+                                     &output);
+     EXPECT_EQ(repeated_a.length(), static_cast<size_t>(output.length()));
+   }
+ }
+
+ // Runs every scheme case through both the 8-bit and the 16-bit
+ // CanonicalizeScheme overloads, then checks the "no scheme at all" case.
+ TEST(URLCanonTest, Scheme) {
+   // Here, we're mostly testing that unusual characters are handled properly.
+   // The canonicalizer doesn't do any parsing or whitespace detection. It will
+   // also do its best on error, and will escape funny sequences (these won't be
+   // valid schemes and it will return error).
+   //
+   // Note that the canonicalizer will append a colon to the output to separate
+   // out the rest of the URL, which is not present in the input. We check,
+   // however, that the output range includes everything but the colon.
+   ComponentCase scheme_cases[] = {
+     {"http", "http:", url_parse::Component(0, 4), true},
+     {"HTTP", "http:", url_parse::Component(0, 4), true},
+     {" HTTP ", "%20http%20:", url_parse::Component(0, 10), false},
+     {"htt: ", "htt%3A%20:", url_parse::Component(0, 9), false},
+     {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", url_parse::Component(0, 22), false},
+     // Don't re-escape something already escaped. Note that it will
+     // "canonicalize" the 'A' to 'a', but that's OK.
+     {"ht%3Atp", "ht%3atp:", url_parse::Component(0, 7), false},
+   };
+
+   std::string out_str;
+
+   for (size_t i = 0; i < arraysize(scheme_cases); i++) {
+     const ComponentCase& test_case = scheme_cases[i];
+     const std::string expected_text(test_case.expected);
+
+     url_parse::Component in_comp(
+         0, static_cast<int>(strlen(test_case.input)));
+     url_parse::Component out_comp;
+
+     // 8-bit input.
+     out_str.clear();
+     url_canon::StdStringCanonOutput narrow_output(&out_str);
+     bool success = url_canon::CanonicalizeScheme(
+         test_case.input, in_comp, &narrow_output, &out_comp);
+     narrow_output.Complete();
+
+     EXPECT_EQ(test_case.expected_success, success);
+     EXPECT_EQ(expected_text, out_str);
+     EXPECT_EQ(test_case.expected_component.begin, out_comp.begin);
+     EXPECT_EQ(test_case.expected_component.len, out_comp.len);
+
+     // Same input converted to UTF-16; the expectations do not change.
+     out_str.clear();
+     url_canon::StdStringCanonOutput wide_output(&out_str);
+
+     string16 wide_input(ConvertUTF8ToUTF16(test_case.input));
+     in_comp.len = static_cast<int>(wide_input.length());
+     success = url_canon::CanonicalizeScheme(
+         wide_input.c_str(), in_comp, &wide_output, &out_comp);
+     wide_output.Complete();
+
+     EXPECT_EQ(test_case.expected_success, success);
+     EXPECT_EQ(expected_text, out_str);
+     EXPECT_EQ(test_case.expected_component.begin, out_comp.begin);
+     EXPECT_EQ(test_case.expected_component.len, out_comp.len);
+   }
+
+   // Test the case where the scheme is declared nonexistent (length -1); it
+   // should be converted into an empty scheme followed by the colon.
+   url_parse::Component empty_out_comp;
+   out_str.clear();
+   url_canon::StdStringCanonOutput output(&out_str);
+
+   EXPECT_TRUE(url_canon::CanonicalizeScheme("", url_parse::Component(0, -1),
+                                             &output, &empty_out_comp));
+   output.Complete();
+
+   EXPECT_EQ(std::string(":"), out_str);
+   EXPECT_EQ(0, empty_out_comp.begin);
+   EXPECT_EQ(0, empty_out_comp.len);
+ }
+
+TEST(URLCanonTest, Host) {
+ IPAddressCase host_cases[] = {
+ // Basic canonicalization, uppercase should be converted to lowercase.
+ {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // Spaces and some other characters should be escaped.
+ {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", url_parse::Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
+ // Exciting different types of spaces!
+ {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
+ // Other types of space (no-break, zero-width, zero-width-no-break) are
+ // name-prepped away to nothing.
+ {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
+ // Ideographic full stop (full-width period for Chinese, etc.) should be
+ // treated as a dot.
+ {NULL, L"www.foo\x3002"L"bar.com", "www.foo.bar.com", url_parse::Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
+ // Invalid unicode characters should fail...
+ // ...In wide input, ICU will barf and we'll end up with the input as
+ // escaped UTF-8 (the invalid character should be replaced with the
+ // replacement character).
+ {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ // ...This is the same as previous but with with escaped.
+ {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
+ // Test name prepping, fullwidth input should be converted to ASCII and NOT
+ // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
+ {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", url_parse::Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
+ // Test that fullwidth escaped values are properly name-prepped,
+ // then converted or rejected.
+ // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
+ {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
+ {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
+ {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
+ // UTF-8 (wide case). The output should be equivalent to the true wide
+ // character input above).
+ {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
+ // Invalid escaped characters should fail and the percents should be
+ // escaped.
+ {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", url_parse::Component(0, 10), CanonHostInfo::BROKEN, -1, ""},
+ // If we get an invalid character that has been escaped.
+ {"%25", L"%25", "%25", url_parse::Component(0, 3), CanonHostInfo::BROKEN, -1, ""},
+ {"hello%00", L"hello%00", "hello%00", url_parse::Component(0, 8), CanonHostInfo::BROKEN, -1, ""},
+ // Escaped numbers should be treated like IP addresses if they are.
+ {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ // Invalid escaping should trigger the regular host error handling.
+ {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", url_parse::Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
+ // Something that isn't exactly an IP should get treated as a host and
+ // spaces escaped.
+ {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", url_parse::Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
+ // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
+ // These are "0Xc0.0250.01" in fullwidth.
+ {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+ // Broken IP addresses get marked as such.
+ {"192.168.0.257", L"192.168.0.257", "192.168.0.257", url_parse::Component(0, 13), CanonHostInfo::BROKEN, -1, ""},
+ {"[google.com]", L"[google.com]", "[google.com]", url_parse::Component(0, 12), CanonHostInfo::BROKEN, -1, ""},
+ // Cyrillic letter followed by ( should return punycode for ( escaped before the punycode string was created. I.e.
+ // if ( is escaped after the punycode is created we would get xn--%28-8tb (incorrect).
+ {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", url_parse::Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
+ // Addresses made up entirely of hexadecimal characters whose leading number
+ // is 1<<32 or greater should return NEUTRAL rather than BROKEN if not all
+ // components are numbers.
+ {"12345678912345.de", L"12345678912345.de", "12345678912345.de", url_parse::Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
+ {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de", url_parse::Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
+ {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de", "12345678912345.12345678912345.de", url_parse::Component(0, 32), CanonHostInfo::NEUTRAL, -1, ""},
+ {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de", url_parse::Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
+ {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde", url_parse::Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
+ };
+
+ // CanonicalizeHost() non-verbose.
+ std::string out_str;
+ for (size_t i = 0; i < arraysize(host_cases); i++) {
+ // Narrow version.
+ if (host_cases[i].input8) {
+ int host_len = static_cast<int>(strlen(host_cases[i].input8));
+ url_parse::Component in_comp(0, host_len);
+ url_parse::Component out_comp;
+
+ out_str.clear();
+ url_canon::StdStringCanonOutput output(&out_str);
+
+ bool success = url_canon::CanonicalizeHost(host_cases[i].input8, in_comp,
+ &output, &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+ success);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
+ }
+
+ // Wide version.
+ if (host_cases[i].input16) {
+ string16 input16(WStringToUTF16(host_cases[i].input16));
+ int host_len = static_cast<int>(input16.length());
+ url_parse::Component in_comp(0, host_len);
+ url_parse::Component out_comp;
+
+ out_str.clear();
+ url_canon::StdStringCanonOutput output(&out_str);
+
+ bool success = url_canon::CanonicalizeHost(input16.c_str(), in_comp,
+ &output, &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN,
+ success);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len);
+ }
+ }
+
+ // CanonicalizeHostVerbose()
+ for (size_t i = 0; i < arraysize(host_cases); i++) {
+ // Narrow version.
+ if (host_cases[i].input8) {
+ int host_len = static_cast<int>(strlen(host_cases[i].input8));
+ url_parse::Component in_comp(0, host_len);
+
+ out_str.clear();
+ url_canon::StdStringCanonOutput output(&out_str);
+ CanonHostInfo host_info;
+
+ url_canon::CanonicalizeHostVerbose(host_cases[i].input8, in_comp,
+ &output, &host_info);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin,
+ host_info.out_host.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+ EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+ EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+ host_info.num_ipv4_components);
+ }
+ }
+
+ // Wide version.
+ if (host_cases[i].input16) {
+ string16 input16(WStringToUTF16(host_cases[i].input16));
+ int host_len = static_cast<int>(input16.length());
+ url_parse::Component in_comp(0, host_len);
+
+ out_str.clear();
+ url_canon::StdStringCanonOutput output(&out_str);
+ CanonHostInfo host_info;
+
+ url_canon::CanonicalizeHostVerbose(input16.c_str(), in_comp,
+ &output, &host_info);
+ output.Complete();
+
+ EXPECT_EQ(host_cases[i].expected_family, host_info.family);
+ EXPECT_EQ(std::string(host_cases[i].expected), out_str);
+ EXPECT_EQ(host_cases[i].expected_component.begin,
+ host_info.out_host.begin);
+ EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len);
+ EXPECT_EQ(std::string(host_cases[i].expected_address_hex),
+ BytesToHexString(host_info.address, host_info.AddressLength()));
+ if (host_cases[i].expected_family == CanonHostInfo::IPV4) {
+ EXPECT_EQ(host_cases[i].expected_num_ipv4_components,
+ host_info.num_ipv4_components);
+ }
+ }
+ }
+}
+
+// Exercises url_canon::CanonicalizeIPAddress() on IPv4-style hosts, running
+// every table row through both the 8-bit and the 16-bit overload.
+//
+// Each row holds, in order: the narrow input, the wide input, the expected
+// canonical text, the expected output component, the expected host family,
+// the expected number of IPv4 components, and the expected address bytes as
+// upper-case hex. The canonical text, output component and component count
+// are only compared when the call actually reports IPV4.
+TEST(URLCanonTest, IPv4) {
+  IPAddressCase cases[] = {
+      // Empty is not an IP address.
+    {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+    {".", L".", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Regular IP addresses in different bases.
+    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+      // Non-IP addresses due to invalid characters.
+    {"192.168.9.com", L"192.168.9.com", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Invalid characters for the base should be rejected.
+    {"19a.168.0.1", L"19a.168.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+    {"0308.0250.00.01", L"0308.0250.00.01", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // If there are not enough components, the last one should fill them out.
+    {"192", L"192", "0.0.0.192", url_parse::Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
+    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
+    {"030052000001", L"030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
+    {"000030052000001", L"000030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
+    {"192.168", L"192.168", "192.0.0.168", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
+      // The wide input previously carried an extra leading zero; both
+      // encodings now spell the same host.
+    {"192.0x00A80001", L"192.0x00A80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
+    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
+    {"192.168.1", L"192.168.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
+      // Too many components means not an IP address.
+    {"192.168.0.0.1", L"192.168.0.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // We allow a single trailing dot.
+    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
+    {"192.168.0.1. hello", L"192.168.0.1. hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+    {"192.168.0.1..", L"192.168.0.1..", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Two dots in a row means not an IP address.
+    {"192.168..1", L"192.168..1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Any numerical overflow should be marked as BROKEN.
+    {"0x100.0", L"0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0x100.0.0", L"0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0x100.0.0.0", L"0x100.0.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0.0x100.0.0", L"0.0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0.0.0x100.0", L"0.0.0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0.0.0.0x100", L"0.0.0.0x100", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0.0.0x10000", L"0.0.0x10000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0.0x1000000", L"0.0x1000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0x100000000", L"0x100000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Repeat the previous tests, minus 1, to verify boundaries.
+    {"0xFF.0", L"0xFF.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
+    {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
+    {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
+    {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
+    {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
+    {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
+    {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
+    {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", url_parse::Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
+    {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", url_parse::Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
+      // Old truncation tests. They're all "BROKEN" now.
+    {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"192.168.0.257", L"192.168.0.257", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"192.168.0xa20001", L"192.168.0xa20001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"192.015052000001", L"192.015052000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"0X12C0a80001", L"0X12C0a80001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"276.1.2", L"276.1.2", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Spaces should be rejected.
+    {"192.168.0.1 hello", L"192.168.0.1 hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Very large numbers.
+    {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
+      // BROKEN rows carry no output component (it is never compared for them).
+    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // A number has no length limit, but long numbers can still overflow.
+    {"00000000000000000001", L"00000000000000000001", "0.0.0.1", url_parse::Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
+    {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // If a long component is non-numeric, it's a hostname, *not* a broken IP.
+    {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+    {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Truncation of all zeros should still result in 0.
+    {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", url_parse::Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
+  };
+
+  for (size_t i = 0; i < arraysize(cases); i++) {
+    // 8-bit version.
+    url_parse::Component component8(
+        0, static_cast<int>(strlen(cases[i].input8)));
+
+    std::string out_str8;
+    url_canon::StdStringCanonOutput output8(&out_str8);
+    url_canon::CanonHostInfo host_info8;
+    url_canon::CanonicalizeIPAddress(cases[i].input8, component8, &output8,
+                                     &host_info8);
+    output8.Complete();
+
+    EXPECT_EQ(cases[i].expected_family, host_info8.family)
+        << "iter " << i << " host " << cases[i].input8 << " (8-bit)";
+    EXPECT_EQ(std::string(cases[i].expected_address_hex),
+              BytesToHexString(host_info8.address, host_info8.AddressLength()))
+        << "iter " << i << " host " << cases[i].input8 << " (8-bit)";
+    if (host_info8.family == CanonHostInfo::IPV4) {
+      EXPECT_STREQ(cases[i].expected, out_str8.c_str())
+          << "iter " << i << " host " << cases[i].input8 << " (8-bit)";
+      EXPECT_EQ(cases[i].expected_component.begin, host_info8.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info8.out_host.len);
+      EXPECT_EQ(cases[i].expected_num_ipv4_components,
+                host_info8.num_ipv4_components);
+    }
+
+    // 16-bit version. A separate CanonHostInfo is used on purpose: reusing
+    // the one filled in by the 8-bit call would let stale results hide a
+    // 16-bit overload that fails to write its outputs.
+    string16 input16(WStringToUTF16(cases[i].input16));
+    url_parse::Component component16(0, static_cast<int>(input16.length()));
+
+    std::string out_str16;
+    url_canon::StdStringCanonOutput output16(&out_str16);
+    url_canon::CanonHostInfo host_info16;
+    url_canon::CanonicalizeIPAddress(input16.c_str(), component16, &output16,
+                                     &host_info16);
+    output16.Complete();
+
+    EXPECT_EQ(cases[i].expected_family, host_info16.family)
+        << "iter " << i << " host " << cases[i].input8 << " (16-bit)";
+    EXPECT_EQ(std::string(cases[i].expected_address_hex),
+              BytesToHexString(host_info16.address,
+                               host_info16.AddressLength()))
+        << "iter " << i << " host " << cases[i].input8 << " (16-bit)";
+    if (host_info16.family == CanonHostInfo::IPV4) {
+      EXPECT_STREQ(cases[i].expected, out_str16.c_str())
+          << "iter " << i << " host " << cases[i].input8 << " (16-bit)";
+      EXPECT_EQ(cases[i].expected_component.begin, host_info16.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info16.out_host.len);
+      EXPECT_EQ(cases[i].expected_num_ipv4_components,
+                host_info16.num_ipv4_components);
+    }
+  }
+}
+
+// Exercises url_canon::CanonicalizeIPAddress() on IPv6-style hosts, running
+// every table row through both the 8-bit and the 16-bit overload.
+//
+// Rows use the same layout as the IPv4 table; the IPv4 component count is
+// always -1 here and is never compared. The canonical text and output
+// component are only compared when the call actually reports IPV6.
+TEST(URLCanonTest, IPv6) {
+  IPAddressCase cases[] = {
+      // Empty is not an IP address.
+    {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1, ""},
+      // Non-IPs with [:] characters are marked BROKEN.
+    {":", L":", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[", L"[", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[:", L"[:", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"]", L"]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {":]", L":]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[]", L"[]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[:]", L"[:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Regular IP address is invalid without bounding '[' and ']'.
+    {"2001:db8::1", L"2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[2001:db8::1", L"[2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"2001:db8::1]", L"2001:db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Regular IP addresses.
+    {"[::]", L"[::]", "[::]", url_parse::Component(0, 4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"},
+    {"[::1]", L"[::1]", "[::1]", url_parse::Component(0, 5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"},
+    {"[1::]", L"[1::]", "[1::]", url_parse::Component(0, 5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"},
+
+      // Leading zeros should be stripped.
+    {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", url_parse::Component(0, 17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"},
+
+      // Upper case letters should be lowercased.
+    {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", url_parse::Component(0, 20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"},
+
+      // The same address can be written with different contractions, but should
+      // get canonicalized to the same thing.
+    {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", url_parse::Component(0, 14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
+    {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", url_parse::Component(0, 14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
+
+      // Addresses with embedded IPv4.
+    {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", url_parse::Component(0, 10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
+    {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", url_parse::Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
+    {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", url_parse::Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"},
+    {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", url_parse::Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"},
+    {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // IPv4 with last component missing.
+    {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", url_parse::Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0010002"},
+
+      // IPv4 using hex.
+      // TODO(eroman): Should this format be disallowed?
+    {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", url_parse::Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
+
+      // There may be zeros surrounding the "::" contraction.
+    {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", url_parse::Component(0, 5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},
+
+    {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", url_parse::Component(0, 13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
+
+      // Can only have one "::" contraction in an IPv6 string literal.
+    {"[2001::db8::1]", L"[2001::db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // No more than 2 consecutive ':'s.
+    {"[2001:db8:::1]", L"[2001:db8:::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[:::]", L"[:::]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Non-IP addresses due to invalid characters.
+    {"[2001::.com]", L"[2001::.com]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // If there are not enough components, the last one should fill them out.
+      // ... omitted at this time ...
+      // Too many components means not an IP address. Similarly with too few if
+      // using IPv4 compat or mapped addresses.
+    {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Too many bits (even though 8 components, the last one holds 32 bits).
+    {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // Too many bits specified -- the contraction would have to be zero-length
+      // to not exceed 128 bits.
+    {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // The contraction is for 16 bits of zero.
+    {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", url_parse::Component(0, 17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"},
+
+      // Cannot have a trailing colon.
+    {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // Cannot have negative numbers.
+    {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // Scope IDs (a "%" <scope_id> suffix inside the brackets) are not
+      // accepted: every variant below, including a bare or misplaced "%", is
+      // expected to be BROKEN.
+    {"[1::%1]", L"[1::%1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[1::%eth0]", L"[1::%eth0]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[1::%]", L"[1::%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[%]", L"[%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[::%:]", L"[::%:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // Don't allow leading or trailing colons.
+    {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+    {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+
+      // We allow a single trailing dot.
+      // ... omitted at this time ...
+      // Two dots in a row means not an IP address.
+    {"[::192.168..1]", L"[::192.168..1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+      // Any non-first components get truncated to one byte.
+      // ... omitted at this time ...
+      // Spaces should be rejected.
+    {"[::1 hello]", L"[::1 hello]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1, ""},
+  };
+
+  for (size_t i = 0; i < arraysize(cases); i++) {
+    // 8-bit version.
+    url_parse::Component component8(
+        0, static_cast<int>(strlen(cases[i].input8)));
+
+    std::string out_str8;
+    url_canon::StdStringCanonOutput output8(&out_str8);
+    url_canon::CanonHostInfo host_info8;
+    url_canon::CanonicalizeIPAddress(cases[i].input8, component8, &output8,
+                                     &host_info8);
+    output8.Complete();
+
+    EXPECT_EQ(cases[i].expected_family, host_info8.family)
+        << "iter " << i << " host " << cases[i].input8 << " (8-bit)";
+    EXPECT_EQ(std::string(cases[i].expected_address_hex),
+              BytesToHexString(host_info8.address, host_info8.AddressLength()))
+        << "iter " << i << " host " << cases[i].input8 << " (8-bit)";
+    if (host_info8.family == CanonHostInfo::IPV6) {
+      EXPECT_STREQ(cases[i].expected, out_str8.c_str())
+          << "iter " << i << " host " << cases[i].input8 << " (8-bit)";
+      EXPECT_EQ(cases[i].expected_component.begin, host_info8.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info8.out_host.len);
+    }
+
+    // 16-bit version. A separate CanonHostInfo is used on purpose: reusing
+    // the one filled in by the 8-bit call would let stale results hide a
+    // 16-bit overload that fails to write its outputs.
+    string16 input16(WStringToUTF16(cases[i].input16));
+    url_parse::Component component16(0, static_cast<int>(input16.length()));
+
+    std::string out_str16;
+    url_canon::StdStringCanonOutput output16(&out_str16);
+    url_canon::CanonHostInfo host_info16;
+    url_canon::CanonicalizeIPAddress(input16.c_str(), component16, &output16,
+                                     &host_info16);
+    output16.Complete();
+
+    EXPECT_EQ(cases[i].expected_family, host_info16.family)
+        << "iter " << i << " host " << cases[i].input8 << " (16-bit)";
+    EXPECT_EQ(std::string(cases[i].expected_address_hex),
+              BytesToHexString(host_info16.address,
+                               host_info16.AddressLength()))
+        << "iter " << i << " host " << cases[i].input8 << " (16-bit)";
+    if (host_info16.family == CanonHostInfo::IPV6) {
+      EXPECT_STREQ(cases[i].expected, out_str16.c_str())
+          << "iter " << i << " host " << cases[i].input8 << " (16-bit)";
+      EXPECT_EQ(cases[i].expected_component.begin, host_info16.out_host.begin);
+      EXPECT_EQ(cases[i].expected_component.len, host_info16.out_host.len);
+    }
+  }
+}
+
+// Verifies that CanonicalizeIPAddress() does not report an IP address when
+// the host component it is given is empty, even though the buffer it points
+// into holds a valid IPv4 literal.
+TEST(URLCanonTest, IPEmpty) {
+  const char spec[] = "192.168.0.1";
+
+  // Each call gets its own output and CanonHostInfo so that the second
+  // assertion cannot pass merely because of state left by the first call.
+
+  // Default-constructed component.
+  {
+    std::string out_str;
+    url_canon::StdStringCanonOutput output(&out_str);
+    url_canon::CanonHostInfo host_info;
+    url_canon::CanonicalizeIPAddress(spec, url_parse::Component(),
+                                     &output, &host_info);
+    EXPECT_FALSE(host_info.IsIPAddress());
+  }
+
+  // Explicit zero-length component at offset 0.
+  {
+    std::string out_str;
+    url_canon::StdStringCanonOutput output(&out_str);
+    url_canon::CanonHostInfo host_info;
+    url_canon::CanonicalizeIPAddress(spec, url_parse::Component(0, 0),
+                                     &output, &host_info);
+    EXPECT_FALSE(host_info.IsIPAddress());
+  }
+}
+
+// Exercises url_canon::CanonicalizeUserInfo() through both the 8-bit and the
+// 16-bit overload.
+//
+// The canonicalizer is expected to escape the username/password and to treat
+// empty components as not being there. Each row gives a full URL, the
+// expected canonical user-info text, the expected username and password
+// components within that text, and the expected return value.
+TEST(URLCanonTest, UserInfo) {
+  // We actually parse a full input URL so we can get the initial components.
+  struct UserComponentCase {
+    const char* input;
+    const char* expected;
+    url_parse::Component expected_username;
+    url_parse::Component expected_password;
+    bool expected_success;
+  } user_info_cases[] = {
+    {"http://user:pass@host.com/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true},
+    {"http://@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
+    {"http://:@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
+    {"http://foo:@host.com/", "foo@", url_parse::Component(0, 3), url_parse::Component(0, -1), true},
+    {"http://:foo@host.com/", ":foo@", url_parse::Component(0, 0), url_parse::Component(1, 3), true},
+    {"http://^ :$\t@host.com/", "%5E%20:$%09@", url_parse::Component(0, 6), url_parse::Component(7, 4), true},
+    {"http://user:pass@/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true},
+    {"http://%2540:bar@domain.com/", "%2540:bar@", url_parse::Component(0, 5), url_parse::Component(6, 3), true},
+
+      // IE7 compatibility: old versions allowed backslashes in usernames, but
+      // IE7 does not. We disallow it as well.
+    {"ftp://me\\mydomain:pass@foo.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true},
+  };
+
+  // ARRAYSIZE (rather than arraysize) is kept here as in the original;
+  // presumably because UserComponentCase is a function-local type.
+  for (size_t i = 0; i < ARRAYSIZE(user_info_cases); i++) {
+    const char* input = user_info_cases[i].input;
+    int url_len = static_cast<int>(strlen(input));
+    url_parse::Parsed parsed;
+    url_parse::ParseStandardURL(input, url_len, &parsed);
+
+    // 8-bit version.
+    url_parse::Component out_user8, out_pass8;
+    std::string out_str8;
+    url_canon::StdStringCanonOutput output8(&out_str8);
+
+    bool success8 = url_canon::CanonicalizeUserInfo(input, parsed.username,
+                                                    input, parsed.password,
+                                                    &output8, &out_user8,
+                                                    &out_pass8);
+    output8.Complete();
+
+    EXPECT_EQ(user_info_cases[i].expected_success, success8)
+        << "iter " << i << " url " << input << " (8-bit)";
+    EXPECT_EQ(std::string(user_info_cases[i].expected), out_str8)
+        << "iter " << i << " url " << input << " (8-bit)";
+    EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user8.begin);
+    EXPECT_EQ(user_info_cases[i].expected_username.len, out_user8.len);
+    EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass8.begin);
+    EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass8.len);
+
+    // Now try the wide version. The components parsed from the 8-bit input
+    // are reused for the converted string (every input in the table is plain
+    // ASCII). Fresh output components are used so that values left over from
+    // the 8-bit call cannot satisfy the expectations below.
+    url_parse::Component out_user16, out_pass16;
+    std::string out_str16;
+    url_canon::StdStringCanonOutput output16(&out_str16);
+    string16 wide_input(ConvertUTF8ToUTF16(input));
+
+    bool success16 = url_canon::CanonicalizeUserInfo(wide_input.c_str(),
+                                                     parsed.username,
+                                                     wide_input.c_str(),
+                                                     parsed.password,
+                                                     &output16, &out_user16,
+                                                     &out_pass16);
+    output16.Complete();
+
+    EXPECT_EQ(user_info_cases[i].expected_success, success16)
+        << "iter " << i << " url " << input << " (16-bit)";
+    EXPECT_EQ(std::string(user_info_cases[i].expected), out_str16)
+        << "iter " << i << " url " << input << " (16-bit)";
+    EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user16.begin);
+    EXPECT_EQ(user_info_cases[i].expected_username.len, out_user16.len);
+    EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass16.begin);
+    EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass16.len);
+  }
+}
+
+// Exercises url_canon::CanonicalizePort() through both the 8-bit and the
+// 16-bit overload.
+//
+// We only need to test that the number gets properly put into the output
+// buffer. The parser unit tests will test scanning the number correctly.
+//
+// Note that whenever CanonicalizePort emits a port it prepends a colon to
+// separate it from whatever precedes it in the output, which is why the
+// expected components below start at offset 1. When the port matches the
+// default port nothing is emitted at all.
+TEST(URLCanonTest, Port) {
+  struct PortCase {
+    const char* input;
+    int default_port;
+    const char* expected;
+    url_parse::Component expected_component;
+    bool expected_success;
+  } port_cases[] = {
+      // Invalid input should be copied w/ failure.
+    {"as df", 80, ":as%20df", url_parse::Component(1, 7), false},
+    {"-2", 80, ":-2", url_parse::Component(1, 2), false},
+      // Default port should be omitted.
+    {"80", 80, "", url_parse::Component(0, -1), true},
+    {"8080", 80, ":8080", url_parse::Component(1, 4), true},
+      // PORT_UNSPECIFIED should mean always keep the port.
+    {"80", url_parse::PORT_UNSPECIFIED, ":80", url_parse::Component(1, 2), true},
+  };
+
+  // ARRAYSIZE (rather than arraysize) is kept here as in the original;
+  // presumably because PortCase is a function-local type.
+  for (size_t i = 0; i < ARRAYSIZE(port_cases); i++) {
+    const char* input = port_cases[i].input;
+    int url_len = static_cast<int>(strlen(input));
+    url_parse::Component in_comp(0, url_len);
+
+    // 8-bit version.
+    url_parse::Component out_comp8;
+    std::string out_str8;
+    url_canon::StdStringCanonOutput output8(&out_str8);
+    bool success8 = url_canon::CanonicalizePort(input, in_comp,
+                                                port_cases[i].default_port,
+                                                &output8, &out_comp8);
+    output8.Complete();
+
+    EXPECT_EQ(port_cases[i].expected_success, success8)
+        << "iter " << i << " port " << input << " (8-bit)";
+    EXPECT_EQ(std::string(port_cases[i].expected), out_str8)
+        << "iter " << i << " port " << input << " (8-bit)";
+    EXPECT_EQ(port_cases[i].expected_component.begin, out_comp8.begin);
+    EXPECT_EQ(port_cases[i].expected_component.len, out_comp8.len);
+
+    // Now try the wide version. The input component is reused for the
+    // converted string (every input in the table is plain ASCII). A fresh
+    // output component is used so that the value left over from the 8-bit
+    // call cannot satisfy the expectations below.
+    url_parse::Component out_comp16;
+    std::string out_str16;
+    url_canon::StdStringCanonOutput output16(&out_str16);
+    string16 wide_input(ConvertUTF8ToUTF16(input));
+    bool success16 = url_canon::CanonicalizePort(wide_input.c_str(), in_comp,
+                                                 port_cases[i].default_port,
+                                                 &output16, &out_comp16);
+    output16.Complete();
+
+    EXPECT_EQ(port_cases[i].expected_success, success16)
+        << "iter " << i << " port " << input << " (16-bit)";
+    EXPECT_EQ(std::string(port_cases[i].expected), out_str16)
+        << "iter " << i << " port " << input << " (16-bit)";
+    EXPECT_EQ(port_cases[i].expected_component.begin, out_comp16.begin);
+    EXPECT_EQ(port_cases[i].expected_component.len, out_comp16.len);
+  }
+}
+
+TEST(URLCanonTest, Path) {
+ DualComponentCase path_cases[] = {
+ // ----- path collapsing tests -----
+ {"/././foo", L"/././foo", "/foo", url_parse::Component(0, 4), true},
+ {"/./.foo", L"/./.foo", "/.foo", url_parse::Component(0, 5), true},
+ {"/foo/.", L"/foo/.", "/foo/", url_parse::Component(0, 5), true},
+ {"/foo/./", L"/foo/./", "/foo/", url_parse::Component(0, 5), true},
+ // double dots followed by a slash or the end of the string count
+ {"/foo/bar/..", L"/foo/bar/..", "/foo/", url_parse::Component(0, 5), true},
+ {"/foo/bar/../", L"/foo/bar/../", "/foo/", url_parse::Component(0, 5), true},
+ // don't count double dots when they aren't followed by a slash
+ {"/foo/..bar", L"/foo/..bar", "/foo/..bar", url_parse::Component(0, 10), true},
+ // some in the middle
+ {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", url_parse::Component(0, 8), true},
+ {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", url_parse::Component(0, 2), true},
+ // we should not be able to go above the root
+ {"/foo/../../..", L"/foo/../../..", "/", url_parse::Component(0, 1), true},
+ {"/foo/../../../ton", L"/foo/../../../ton", "/ton", url_parse::Component(0, 4), true},
+ // escaped dots should be unescaped and treated the same as dots
+ {"/foo/%2e", L"/foo/%2e", "/foo/", url_parse::Component(0, 5), true},
+ {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", url_parse::Component(0, 8), true},
+ {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", url_parse::Component(0, 6), true},
+ // Multiple slashes in a row should be preserved and treated like empty
+ // directory names.
+ {"////../..", L"////../..", "//", url_parse::Component(0, 2), true},
+
+ // ----- escaping tests -----
+ {"/foo", L"/foo", "/foo", url_parse::Component(0, 4), true},
+ // Valid escape sequence
+ {"/%20foo", L"/%20foo", "/%20foo", url_parse::Component(0, 7), true},
+ // Invalid escape sequence we should pass through unchanged.
+ {"/foo%", L"/foo%", "/foo%", url_parse::Component(0, 5), true},
+ {"/foo%2", L"/foo%2", "/foo%2", url_parse::Component(0, 6), true},
+ // Invalid escape sequence: bad characters should be treated the same as
+ // the surrounding text, not as escaped (in this case, UTF-8).
+ {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", url_parse::Component(0, 10), true},
+ {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", url_parse::Component(0, 16), true},
+ {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", url_parse::Component(0, 22), true},
+ // Regular characters that are escaped should be unescaped
+ {"/foo%41%7a", L"/foo%41%7a", "/fooAz", url_parse::Component(0, 6), true},
+ // Funny characters that are unescaped should be escaped
+ {"/foo\x09\x91%91", NULL, "/foo%09%91%91", url_parse::Component(0, 13), true},
+ {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", url_parse::Component(0, 16), true},
+ // Invalid characters that are escaped should cause a failure.
+ {"/foo%00%51", L"/foo%00%51", "/foo%00Q", url_parse::Component(0, 8), false},
+ // Some characters should be passed through unchanged regardless of esc.
+ {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", url_parse::Component(0, 13), true},
+ // Characters that are properly escaped should not have the case changed
+ // of hex letters.
+ {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", url_parse::Component(0, 13), true},
+ // Funny characters that are unescaped should be escaped
+ {"/foo\tbar", L"/foo\tbar", "/foo%09bar", url_parse::Component(0, 10), true},
+ // Backslashes should get converted to forward slashes
+ {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", url_parse::Component(0, 8), true},
+ // Hashes found in paths (possibly only when the caller explicitly sets
+ // the path on an already-parsed URL) should be escaped.
+ {"/foo#bar", L"/foo#bar", "/foo%23bar", url_parse::Component(0, 10), true},
+ // %7f should be allowed and %3D should not be unescaped (these were wrong
+ // in a previous version).
+ {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true},
+ // @ should be passed through unchanged (escaped or unescaped).
+ {"/@asdf%40", L"/@asdf%40", "/@asdf%40", url_parse::Component(0, 9), true},
+
+ // ----- encoding tests -----
+ // Basic conversions
+ {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", url_parse::Component(0, 37), true},
+ // Invalid unicode characters should fail. We only do validation on
+ // UTF-16 input, so this doesn't happen on 8-bit.
+ {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", url_parse::Component(0, 13), true},
+ {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", url_parse::Component(0, 13), false},
+ };
+
+ for (size_t i = 0; i < arraysize(path_cases); i++) {
+ if (path_cases[i].input8) {
+ int len = static_cast<int>(strlen(path_cases[i].input8));
+ url_parse::Component in_comp(0, len);
+ url_parse::Component out_comp;
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ bool success = url_canon::CanonicalizePath(path_cases[i].input8, in_comp,
+ &output, &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(path_cases[i].expected_success, success);
+ EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
+ EXPECT_EQ(path_cases[i].expected, out_str);
+ }
+
+ if (path_cases[i].input16) {
+ string16 input16(WStringToUTF16(path_cases[i].input16));
+ int len = static_cast<int>(input16.length());
+ url_parse::Component in_comp(0, len);
+ url_parse::Component out_comp;
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+
+ bool success = url_canon::CanonicalizePath(input16.c_str(), in_comp,
+ &output, &out_comp);
+ output.Complete();
+
+ EXPECT_EQ(path_cases[i].expected_success, success);
+ EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
+ EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
+ EXPECT_EQ(path_cases[i].expected, out_str);
+ }
+ }
+
+ // Manual test: embedded NULLs should be escaped and the URL should be marked
+ // as invalid.
+ const char path_with_null[] = "/ab\0c";
+ url_parse::Component in_comp(0, 5);
+ url_parse::Component out_comp;
+
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ bool success = url_canon::CanonicalizePath(path_with_null, in_comp,
+ &output, &out_comp);
+ output.Complete();
+ EXPECT_FALSE(success);
+ EXPECT_EQ("/ab%00c", out_str);
+}
+
+ // Runs CanonicalizeQuery over a table of 8-bit and 16-bit inputs, with and
+ // without a charset converter, and then over one input that contains an
+ // embedded NULL. Only the canonical output string is compared.
+ TEST(URLCanonTest, Query) {
+   struct QueryCase {
+     const char* input8;
+     const wchar_t* input16;
+     const char* encoding;  // NULL: a NULL converter is passed to the call.
+     const char* expected;
+   } query_cases[] = {
+     // Plain ASCII, with no converter and with several encodings.
+     {"foo=bar", L"foo=bar", NULL, "?foo=bar"},
+     {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
+     {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
+     {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
+     // Question marks inside the query are kept unescaped.
+     {"as?df", L"as?df", NULL, "?as?df"},
+     // '#' is always escaped since it would otherwise mark the ref.
+     {"as#df", L"as#df", NULL, "?as%23df"},
+     // Questionable 8-bit characters get escaped; nothing is ever unescaped.
+     {"\x02hello\x7f bye", L"\x02hello\x7f bye", NULL, "?%02hello%7F%20bye"},
+     {"%40%41123", L"%40%41123", NULL, "?%40%41123"},
+     // Chinese input, emitted in the requested encoding.
+     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", NULL, "?q=%E4%BD%A0%E5%A5%BD"},
+     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", "?q=%C4%E3%BA%C3"},
+     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
+     // A character the destination character set cannot encode is written as
+     // an escaped numeric entity. The expected string below unescapes to
+     // "?q=Chinese&#65319;" (U+FF27).
+     {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", "?q=Chinese%26%2365319%3B"},
+     // Invalid UTF-8/16 input comes out as escaped U+FFFD (%EF%BF%BD).
+     {"q=\xed\xed", L"q=\xd800\xd800", NULL, "?q=%EF%BF%BD%EF%BF%BD"},
+     // < and > are escaped because they are sometimes used for XSS when the
+     // URL is echoed in content. Firefox does this, IE doesn't.
+     {"q=<asdf>", L"q=<asdf>", NULL, "?q=%3Casdf%3E"},
+     // Double quotes in the query are escaped.
+     {"q=\"asdf\"", L"q=\"asdf\"", NULL, "?q=%22asdf%22"},
+   };
+
+   for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) {
+     const QueryCase& cur = query_cases[i];
+     url_parse::Component out_comp;
+
+     // When an encoding is named, its converter must have been created.
+     UConvScoper conv(cur.encoding);
+     ASSERT_TRUE(!cur.encoding || conv.converter());
+     url_canon::ICUCharsetConverter converter(conv.converter());
+
+     // A NULL encoding maps to a NULL converter pointer.
+     url_canon::ICUCharsetConverter* conv_pointer =
+         cur.encoding ? &converter : NULL;
+
+     // 8-bit input.
+     if (cur.input8) {
+       url_parse::Component in_comp(0, static_cast<int>(strlen(cur.input8)));
+       std::string canonical;
+
+       url_canon::StdStringCanonOutput output(&canonical);
+       url_canon::CanonicalizeQuery(cur.input8, in_comp, conv_pointer, &output,
+                                    &out_comp);
+       output.Complete();
+
+       EXPECT_EQ(cur.expected, canonical);
+     }
+
+     // 16-bit input.
+     if (cur.input16) {
+       string16 wide_input(WStringToUTF16(cur.input16));
+       url_parse::Component in_comp(0, static_cast<int>(wide_input.length()));
+       std::string canonical;
+
+       url_canon::StdStringCanonOutput output(&canonical);
+       url_canon::CanonicalizeQuery(wide_input.c_str(), in_comp, conv_pointer,
+                                    &output, &out_comp);
+       output.Complete();
+
+       EXPECT_EQ(cur.expected, canonical);
+     }
+   }
+
+   // Extra test: an embedded NULL cannot go through the strlen()-based table
+   // loop, so the component length (5) is given explicitly.
+   std::string null_result;
+   url_canon::StdStringCanonOutput null_output(&null_result);
+   url_parse::Component null_out_comp;
+   url_canon::CanonicalizeQuery("a \x00z\x01", url_parse::Component(0, 5), NULL,
+                                &null_output, &null_out_comp);
+   null_output.Complete();
+   EXPECT_EQ("?a%20%00z%01", null_result);
+ }
+
+ // Runs CanonicalizeRef over 8-bit and 16-bit inputs and compares the output
+ // string and output component. The expected_success column is not consulted
+ // by this test.
+ TEST(URLCanonTest, Ref) {
+   // Refs are trivial, it just checks the encoding.
+   DualComponentCase ref_cases[] = {
+     // Regular one, we shouldn't escape spaces, et al.
+     {"hello, world", L"hello, world", "#hello, world", url_parse::Component(1, 12), true},
+     // UTF-8/wide input should be preserved
+     {"\xc2\xa9", L"\xa9", "#\xc2\xa9", url_parse::Component(1, 2), true},
+     // Test a character that takes > 16 bits (U+10300 = old italic letter A)
+     {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#\xF0\x90\x8C\x80ss", url_parse::Component(1, 6), true},
+     // Escaping should be preserved unchanged, even invalid ones
+     {"%41%a", L"%41%a", "#%41%a", url_parse::Component(1, 5), true},
+     // Invalid UTF-8/16 input should be flagged and the input made valid
+     {"\xc2", NULL, "#\xef\xbf\xbd", url_parse::Component(1, 3), true},
+     {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", url_parse::Component(1, 6), true},
+     // Test a Unicode invalid character.
+     {"a\xef\xb7\x90", L"a\xfdd0", "#a\xef\xbf\xbd", url_parse::Component(1, 4), true},
+     // Refs can have # signs and we should preserve them.
+     {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", url_parse::Component(1, 9), true},
+     {"#asdf", L"#asdf", "##asdf", url_parse::Component(1, 5), true},
+   };
+
+   for (size_t i = 0; i < arraysize(ref_cases); i++) {
+     const DualComponentCase& cur = ref_cases[i];
+
+     // 8-bit input
+     if (cur.input8) {
+       url_parse::Component in_comp(0, static_cast<int>(strlen(cur.input8)));
+       url_parse::Component result_comp;
+
+       std::string canonical;
+       url_canon::StdStringCanonOutput output(&canonical);
+       url_canon::CanonicalizeRef(cur.input8, in_comp, &output, &result_comp);
+       output.Complete();
+
+       EXPECT_EQ(cur.expected_component.begin, result_comp.begin);
+       EXPECT_EQ(cur.expected_component.len, result_comp.len);
+       EXPECT_EQ(cur.expected, canonical);
+     }
+
+     // 16-bit input
+     if (cur.input16) {
+       string16 wide_input(WStringToUTF16(cur.input16));
+       url_parse::Component in_comp(0, static_cast<int>(wide_input.length()));
+       url_parse::Component result_comp;
+
+       std::string canonical;
+       url_canon::StdStringCanonOutput output(&canonical);
+       url_canon::CanonicalizeRef(wide_input.c_str(), in_comp, &output,
+                                  &result_comp);
+       output.Complete();
+
+       EXPECT_EQ(cur.expected_component.begin, result_comp.begin);
+       EXPECT_EQ(cur.expected_component.len, result_comp.len);
+       EXPECT_EQ(cur.expected, canonical);
+     }
+   }
+
+   // Try one with an embedded NULL. It should be stripped. The component
+   // length (4) is given explicitly because strlen() would stop at the NULL.
+   const char null_input[5] = "ab\x00z";
+   url_parse::Component null_input_component(0, 4);
+   url_parse::Component null_result_comp;
+
+   std::string null_result;
+   url_canon::StdStringCanonOutput null_output(&null_result);
+   url_canon::CanonicalizeRef(null_input, null_input_component, &null_output,
+                              &null_result_comp);
+   null_output.Complete();
+
+   EXPECT_EQ(1, null_result_comp.begin);
+   EXPECT_EQ(3, null_result_comp.len);
+   EXPECT_EQ("#abz", null_result);
+ }
+
+ // Parses each input with ParseStandardURL, canonicalizes it with
+ // CanonicalizeStandardURL (NULL query converter) and checks the success flag
+ // and the canonical spec.
+ TEST(URLCanonTest, CanonicalizeStandardURL) {
+   // The individual component canonicalize tests should have caught the cases
+   // for each of those components. Here, we just need to test that the various
+   // parts are included or excluded properly, and have the correct separators.
+   struct URLCase {
+     const char* input;
+     const char* expected;
+     bool expected_success;
+   } cases[] = {
+     {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#", true},
+     {"http://[www.google.com]/", "http://[www.google.com]/", false},
+     {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", false},
+     {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", true},
+     {"www.google.com", ":www.google.com/", true},
+     {"http://192.0x00A80001", "http://192.168.0.1/", true},
+     {"http://www/foo%2Ehtml", "http://www/foo.html", true},
+     {"http://user:pass@/", "http://user:pass@/", false},
+     {"http://%25DOMAIN:foobar@foodomain.com/", "http://%25DOMAIN:foobar@foodomain.com/", true},
+
+     // Backslashes should get converted to forward slashes.
+     {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},
+
+     // Busted refs shouldn't make the whole thing fail.
+     {"http://www.google.com/asdf#\xc2", "http://www.google.com/asdf#\xef\xbf\xbd", true},
+
+     // Basic port tests.
+     {"http://foo:80/", "http://foo/", true},
+     {"http://foo:81/", "http://foo:81/", true},
+     {"httpa://foo:80/", "httpa://foo:80/", true},
+     {"http://foo:-80/", "http://foo:-80/", false},
+
+     {"https://foo:443/", "https://foo/", true},
+     {"https://foo:80/", "https://foo:80/", true},
+     {"ftp://foo:21/", "ftp://foo/", true},
+     {"ftp://foo:80/", "ftp://foo:80/", true},
+     {"gopher://foo:70/", "gopher://foo/", true},
+     {"gopher://foo:443/", "gopher://foo:443/", true},
+     {"ws://foo:80/", "ws://foo/", true},
+     {"ws://foo:81/", "ws://foo:81/", true},
+     {"ws://foo:443/", "ws://foo:443/", true},
+     {"ws://foo:815/", "ws://foo:815/", true},
+     {"wss://foo:80/", "wss://foo:80/", true},
+     {"wss://foo:81/", "wss://foo:81/", true},
+     {"wss://foo:443/", "wss://foo/", true},
+     {"wss://foo:815/", "wss://foo:815/", true},
+   };
+
+   for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+     // Attach the input to every failure reported in this iteration. Without
+     // it, a mismatch on the success flag only prints true/false and gives no
+     // hint which table row is responsible.
+     SCOPED_TRACE(cases[i].input);
+
+     int url_len = static_cast<int>(strlen(cases[i].input));
+     url_parse::Parsed parsed;
+     url_parse::ParseStandardURL(cases[i].input, url_len, &parsed);
+
+     url_parse::Parsed out_parsed;
+     std::string out_str;
+     url_canon::StdStringCanonOutput output(&out_str);
+     bool success = url_canon::CanonicalizeStandardURL(
+         cases[i].input, url_len, parsed, NULL, &output, &out_parsed);
+     output.Complete();
+
+     EXPECT_EQ(cases[i].expected_success, success);
+     EXPECT_EQ(cases[i].expected, out_str);
+   }
+ }
+
+ // The codepath here is the same as for regular canonicalization, so we just
+ // need to test that things are replaced or not correctly.
+ TEST(URLCanonTest, ReplaceStandardURL) {
+ // Each row is the base URL, then one entry per component (scheme, username,
+ // password, host, port, path, query, ref), then the expected output. In the
+ // rows below a NULL entry leaves that component as it was in the base, and a
+ // kDeleteComp entry removes it from the expected output.
+ // NOTE(review): the column order is read off the rows and the cur.* uses in
+ // the loop; ReplaceCase itself is declared outside this test.
+ ReplaceCase replace_cases[] = {
+ // Common case of truncating the path.
+ {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"},
+ // Replace everything
+ {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"},
+ // Replace nothing
+ {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"},
+ // Replace scheme with filesystem. The result is garbage, but you asked
+ // for it.
+ {"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b@google.com:22/foo?baz@cat"},
+ };
+
+ for (size_t i = 0; i < arraysize(replace_cases); i++) {
+ const ReplaceCase& cur = replace_cases[i];
+ int base_len = static_cast<int>(strlen(cur.base));
+ url_parse::Parsed parsed;
+ url_parse::ParseStandardURL(cur.base, base_len, &parsed);
+
+ url_canon::Replacements<char> r;
+ typedef url_canon::Replacements<char> R; // Clean up syntax.
+
+ // Note that for the scheme we pass in a different clear function since
+ // there is no function to clear the scheme.
+ SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
+ SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
+ SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
+ SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
+ SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
+ SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
+ SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
+ SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
+
+ // Apply the replacements to the parsed base (NULL query converter) and
+ // compare only the resulting spec string.
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ url_parse::Parsed out_parsed;
+ url_canon::ReplaceStandardURL(replace_cases[i].base, parsed,
+ r, NULL, &output, &out_parsed);
+ output.Complete();
+
+ EXPECT_EQ(replace_cases[i].expected, out_str);
+ }
+
+ // The path pointer should be ignored if the address is invalid.
+ {
+ const char src[] = "http://www.google.com/here_is_the_path";
+ int src_len = static_cast<int>(strlen(src));
+
+ url_parse::Parsed parsed;
+ url_parse::ParseStandardURL(src, src_len, &parsed);
+
+ // Replace the path to 0 length string. By using 1 as the string address,
+ // the test should get an access violation if it tries to dereference it.
+ url_canon::Replacements<char> r;
+ r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component(0, 0));
+ std::string out_str1;
+ url_canon::StdStringCanonOutput output1(&out_str1);
+ url_parse::Parsed new_parsed;
+ url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output1, &new_parsed);
+ output1.Complete();
+ EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
+
+ // Same with an "invalid" path.
+ // The same r and new_parsed objects are reused; only the path component
+ // passed to SetPath changes (default-constructed instead of (0, 0)).
+ r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component());
+ std::string out_str2;
+ url_canon::StdStringCanonOutput output2(&out_str2);
+ url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output2, &new_parsed);
+ output2.Complete();
+ EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
+ }
+ }
+
+ // Applies component replacements to file: URLs through ReplaceFileURL and
+ // compares the resulting spec with the expected string.
+ TEST(URLCanonTest, ReplaceFileURL) {
+   ReplaceCase replace_cases[] = {
+     // Replace everything
+     {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"},
+     // Replace nothing
+     {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"},
+     // Clear non-path components (common)
+     {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
+     // Replace path with something that doesn't begin with a slash and make
+     // sure it gets added properly.
+     {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
+     {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"},
+     {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"},
+     {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"},
+     {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"},
+     // Replace scheme -- shouldn't do anything.
+     {"file:///C:/gaba?query#ref", "http", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"},
+   };
+
+   typedef url_canon::Replacements<char> Repl;  // Shorthand for the setters.
+
+   for (size_t i = 0; i < arraysize(replace_cases); i++) {
+     const ReplaceCase& cur = replace_cases[i];
+
+     url_parse::Parsed parsed;
+     url_parse::ParseFileURL(cur.base, static_cast<int>(strlen(cur.base)),
+                             &parsed);
+
+     // ClearRef is passed as the scheme's "clear" function because no
+     // ClearScheme member is used anywhere in these tests.
+     Repl repl;
+     SetupReplComp(&Repl::SetScheme, &Repl::ClearRef, &repl, cur.scheme);
+     SetupReplComp(&Repl::SetUsername, &Repl::ClearUsername, &repl,
+                   cur.username);
+     SetupReplComp(&Repl::SetPassword, &Repl::ClearPassword, &repl,
+                   cur.password);
+     SetupReplComp(&Repl::SetHost, &Repl::ClearHost, &repl, cur.host);
+     SetupReplComp(&Repl::SetPort, &Repl::ClearPort, &repl, cur.port);
+     SetupReplComp(&Repl::SetPath, &Repl::ClearPath, &repl, cur.path);
+     SetupReplComp(&Repl::SetQuery, &Repl::ClearQuery, &repl, cur.query);
+     SetupReplComp(&Repl::SetRef, &Repl::ClearRef, &repl, cur.ref);
+
+     std::string result;
+     url_canon::StdStringCanonOutput output(&result);
+     url_parse::Parsed result_parsed;
+     url_canon::ReplaceFileURL(cur.base, parsed, repl, NULL, &output,
+                               &result_parsed);
+     output.Complete();
+
+     EXPECT_EQ(cur.expected, result);
+   }
+ }
+
+ // Applies component replacements to filesystem: URLs through
+ // ReplaceFileSystemURL and compares the resulting spec with the expected
+ // string.
+ TEST(URLCanonTest, ReplaceFileSystemURL) {
+   ReplaceCase replace_cases[] = {
+     // Replace everything in the outer URL.
+     {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, "/foo", "b", "c", "filesystem:file:///temporary/foo?b#c"},
+     // Replace nothing
+     {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem:file:///temporary/gaba?query#ref"},
+     // Clear non-path components (common)
+     {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "filesystem:file:///temporary/gaba"},
+     // Replace path with something that doesn't begin with a slash and make
+     // sure it gets added properly.
+     {"filesystem:file:///temporary/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "filesystem:file:///temporary/interesting/?query#ref"},
+     // Replace scheme -- shouldn't do anything.
+     {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"},
+     // Replace username -- shouldn't do anything.
+     {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, "u2", NULL, NULL, NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"},
+     // Replace password -- shouldn't do anything.
+     {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, NULL, "pw2", NULL, NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"},
+     // Replace host -- shouldn't do anything.
+     {"filesystem:http://u:p@bar.com/t/gaba?query#ref", NULL, NULL, NULL, "foo.com", NULL, NULL, NULL, NULL, "filesystem:http://u:p@bar.com/t/gaba?query#ref"},
+     // Replace port -- shouldn't do anything.
+     {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", NULL, NULL, NULL, NULL, "41", NULL, NULL, NULL, "filesystem:http://u:p@bar.com:40/t/gaba?query#ref"},
+   };
+
+   typedef url_canon::Replacements<char> Repl;  // Shorthand for the setters.
+
+   for (size_t i = 0; i < arraysize(replace_cases); i++) {
+     const ReplaceCase& cur = replace_cases[i];
+
+     url_parse::Parsed parsed;
+     url_parse::ParseFileSystemURL(cur.base, static_cast<int>(strlen(cur.base)),
+                                   &parsed);
+
+     // ClearRef is passed as the scheme's "clear" function because no
+     // ClearScheme member is used anywhere in these tests.
+     Repl repl;
+     SetupReplComp(&Repl::SetScheme, &Repl::ClearRef, &repl, cur.scheme);
+     SetupReplComp(&Repl::SetUsername, &Repl::ClearUsername, &repl,
+                   cur.username);
+     SetupReplComp(&Repl::SetPassword, &Repl::ClearPassword, &repl,
+                   cur.password);
+     SetupReplComp(&Repl::SetHost, &Repl::ClearHost, &repl, cur.host);
+     SetupReplComp(&Repl::SetPort, &Repl::ClearPort, &repl, cur.port);
+     SetupReplComp(&Repl::SetPath, &Repl::ClearPath, &repl, cur.path);
+     SetupReplComp(&Repl::SetQuery, &Repl::ClearQuery, &repl, cur.query);
+     SetupReplComp(&Repl::SetRef, &Repl::ClearRef, &repl, cur.ref);
+
+     std::string result;
+     url_canon::StdStringCanonOutput output(&result);
+     url_parse::Parsed result_parsed;
+     url_canon::ReplaceFileSystemURL(cur.base, parsed, repl, NULL, &output,
+                                     &result_parsed);
+     output.Complete();
+
+     EXPECT_EQ(cur.expected, result);
+   }
+ }
+
+ // Applies component replacements to path URLs through ReplacePathURL and
+ // compares the resulting spec with the expected string.
+ TEST(URLCanonTest, ReplacePathURL) {
+   ReplaceCase replace_cases[] = {
+     // Replace everything
+     {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"},
+     // Replace nothing
+     {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"},
+     // Replace one or the other
+     {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"},
+     {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"},
+     {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"},
+   };
+
+   typedef url_canon::Replacements<char> Repl;  // Shorthand for the setters.
+
+   for (size_t i = 0; i < arraysize(replace_cases); i++) {
+     const ReplaceCase& cur = replace_cases[i];
+
+     url_parse::Parsed parsed;
+     url_parse::ParsePathURL(cur.base, static_cast<int>(strlen(cur.base)),
+                             &parsed);
+
+     // ClearRef is passed as the scheme's "clear" function because no
+     // ClearScheme member is used anywhere in these tests.
+     Repl repl;
+     SetupReplComp(&Repl::SetScheme, &Repl::ClearRef, &repl, cur.scheme);
+     SetupReplComp(&Repl::SetUsername, &Repl::ClearUsername, &repl,
+                   cur.username);
+     SetupReplComp(&Repl::SetPassword, &Repl::ClearPassword, &repl,
+                   cur.password);
+     SetupReplComp(&Repl::SetHost, &Repl::ClearHost, &repl, cur.host);
+     SetupReplComp(&Repl::SetPort, &Repl::ClearPort, &repl, cur.port);
+     SetupReplComp(&Repl::SetPath, &Repl::ClearPath, &repl, cur.path);
+     SetupReplComp(&Repl::SetQuery, &Repl::ClearQuery, &repl, cur.query);
+     SetupReplComp(&Repl::SetRef, &Repl::ClearRef, &repl, cur.ref);
+
+     // Unlike the standard/file variants, this call takes no converter
+     // argument.
+     std::string result;
+     url_canon::StdStringCanonOutput output(&result);
+     url_parse::Parsed result_parsed;
+     url_canon::ReplacePathURL(cur.base, parsed, repl, &output, &result_parsed);
+     output.Complete();
+
+     EXPECT_EQ(cur.expected, result);
+   }
+ }
+
+ // Applies component replacements to mailto: URLs through ReplaceMailtoURL and
+ // compares the resulting spec with the expected string.
+ // NOTE(review): the "|" entries below are presumably the kDeleteComp marker
+ // written out literally; the expected outputs show the component removed.
+ TEST(URLCanonTest, ReplaceMailtoURL) {
+   ReplaceCase replace_cases[] = {
+     // Replace everything
+     {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"},
+     // Replace nothing
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"},
+     // Replace the path
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"},
+     // Replace the query
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"},
+     // Replace the path and query
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"},
+     // Set the query to empty (should leave trailing question mark)
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"},
+     // Clear the query
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"},
+     // Clear the path
+     {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"},
+     // Clear the path + query
+     {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"},
+     // Setting the ref should have no effect
+     {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"},
+   };
+
+   typedef url_canon::Replacements<char> Repl;  // Shorthand for the setters.
+
+   for (size_t i = 0; i < arraysize(replace_cases); i++) {
+     const ReplaceCase& cur = replace_cases[i];
+
+     url_parse::Parsed parsed;
+     url_parse::ParseMailtoURL(cur.base, static_cast<int>(strlen(cur.base)),
+                               &parsed);
+
+     // ClearRef is passed as the scheme's "clear" function because no
+     // ClearScheme member is used anywhere in these tests.
+     Repl repl;
+     SetupReplComp(&Repl::SetScheme, &Repl::ClearRef, &repl, cur.scheme);
+     SetupReplComp(&Repl::SetUsername, &Repl::ClearUsername, &repl,
+                   cur.username);
+     SetupReplComp(&Repl::SetPassword, &Repl::ClearPassword, &repl,
+                   cur.password);
+     SetupReplComp(&Repl::SetHost, &Repl::ClearHost, &repl, cur.host);
+     SetupReplComp(&Repl::SetPort, &Repl::ClearPort, &repl, cur.port);
+     SetupReplComp(&Repl::SetPath, &Repl::ClearPath, &repl, cur.path);
+     SetupReplComp(&Repl::SetQuery, &Repl::ClearQuery, &repl, cur.query);
+     SetupReplComp(&Repl::SetRef, &Repl::ClearRef, &repl, cur.ref);
+
+     std::string result;
+     url_canon::StdStringCanonOutput output(&result);
+     url_parse::Parsed result_parsed;
+     url_canon::ReplaceMailtoURL(cur.base, parsed, repl, &output,
+                                 &result_parsed);
+     output.Complete();
+
+     EXPECT_EQ(cur.expected, result);
+   }
+ }
+
+ // Parses each input with ParseFileURL, canonicalizes it with
+ // CanonicalizeFileURL (NULL query converter) and checks the success flag, the
+ // canonical spec, and the scheme, host and path components of the output
+ // Parsed. The table is selected at compile time: one set of rows when _WIN32
+ // is defined, another otherwise.
+ TEST(URLCanonTest, CanonicalizeFileURL) {
+ struct URLCase {
+ const char* input;
+ const char* expected;
+ bool expected_success;
+ url_parse::Component expected_host;
+ url_parse::Component expected_path;
+ } cases[] = {
+ #ifdef _WIN32
+ // Windows-style paths
+ {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
+ {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
+ {"file:", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
+ {"file:UNChost/path", "file://unchost/path", true, url_parse::Component(7, 7), url_parse::Component(14, 5)},
+ // CanonicalizeFileURL supports absolute Windows style paths for IE
+ // compatibility. Note that the caller must decide that this is a file
+ // URL itself so it can call the file canonicalizer. This is usually
+ // done automatically as part of relative URL resolving.
+ {"c:\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+ {"C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+ {"/C|\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+ {"//C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)},
+ {"//server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
+ {"\\\\server\\file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
+ {"/\\server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)},
+ // We should preserve the number of slashes after the colon for IE
+ // compatibility, except when there is none, in which case we should
+ // add one.
+ {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
+ {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
+ // Three slashes should be non-UNC, even if there is no drive spec (IE
+ // does this, which makes the resulting request invalid).
+ {"file:///foo/bar.txt", "file:///foo/bar.txt", true, url_parse::Component(), url_parse::Component(7, 12)},
+ // TODO(brettw) we should probably fail for invalid host names, which
+ // would change the expected result on this test. We also currently allow
+ // colon even though it's probably invalid, because it's currently the
+ // "natural" result of the way the canonicalizer is written. There doesn't
+ // seem to be a strong argument for why allowing it here would be bad, so
+ // we just tolerate it and the load will fail later.
+ {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, url_parse::Component(7, 2), url_parse::Component(9, 16)},
+ {"file:filer/home\\me", "file://filer/home/me", true, url_parse::Component(7, 5), url_parse::Component(12, 8)},
+ // Make sure relative paths can't go above the "C:"
+ {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, url_parse::Component(), url_parse::Component(7, 12)},
+ // Busted refs shouldn't make the whole thing fail.
+ {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, url_parse::Component(), url_parse::Component(7, 8)},
+ #else
+ // Unix-style paths
+ {"file:///home/me", "file:///home/me", true, url_parse::Component(), url_parse::Component(7, 8)},
+ // Windowsy ones should get still treated as Unix-style.
+ {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)},
+ {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)},
+ // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
+ {"//", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
+ {"///", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)},
+ {"///test", "file:///test", true, url_parse::Component(), url_parse::Component(7, 5)},
+ {"file://test", "file://test/", true, url_parse::Component(7, 4), url_parse::Component(11, 1)},
+ {"file://localhost", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)},
+ {"file://localhost/", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)},
+ {"file://localhost/test", "file://localhost/test", true, url_parse::Component(7, 9), url_parse::Component(16, 5)},
+ #endif // _WIN32
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+ int url_len = static_cast<int>(strlen(cases[i].input));
+ url_parse::Parsed parsed;
+ url_parse::ParseFileURL(cases[i].input, url_len, &parsed);
+
+ url_parse::Parsed out_parsed;
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ bool success = url_canon::CanonicalizeFileURL(cases[i].input, url_len,
+ parsed, NULL, &output,
+ &out_parsed);
+ output.Complete();
+
+ EXPECT_EQ(cases[i].expected_success, success);
+ EXPECT_EQ(cases[i].expected, out_str);
+
+ // Make sure the spec was properly identified, the file canonicalizer has
+ // different code for writing the spec.
+ // Every expected output starts with "file", hence scheme (0, 4).
+ EXPECT_EQ(0, out_parsed.scheme.begin);
+ EXPECT_EQ(4, out_parsed.scheme.len);
+
+ EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin);
+ EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len);
+
+ EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
+ EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
+ }
+ }
+
+ // Parses each input with ParseFileSystemURL, canonicalizes it with
+ // CanonicalizeFileSystemURL (NULL query converter) and checks the success
+ // flag, the canonical spec, and the outer scheme component.
+ TEST(URLCanonTest, CanonicalizeFileSystemURL) {
+   struct URLCase {
+     const char* input;
+     const char* expected;
+     bool expected_success;
+   } cases[] = {
+     {"Filesystem:htTp://www.Foo.com:80/tempoRary", "filesystem:http://www.foo.com/tempoRary/", true},
+     {"filesystem:httpS://www.foo.com/temporary/", "filesystem:https://www.foo.com/temporary/", true},
+     {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//", false},
+     {"filesystem:http://www.foo.com/persistent/bob?query#ref", "filesystem:http://www.foo.com/persistent/bob?query#ref", true},
+     {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true},
+     {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true},
+     {"filesystem:File:///temporary/Bob?qUery#reF", "filesystem:file:///temporary/Bob?qUery#reF", true},
+   };
+
+   for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+     const URLCase& cur = cases[i];
+     const int input_len = static_cast<int>(strlen(cur.input));
+
+     url_parse::Parsed parsed;
+     url_parse::ParseFileSystemURL(cur.input, input_len, &parsed);
+
+     url_parse::Parsed result_parsed;
+     std::string canonical;
+     url_canon::StdStringCanonOutput output(&canonical);
+     bool success = url_canon::CanonicalizeFileSystemURL(
+         cur.input, input_len, parsed, NULL, &output, &result_parsed);
+     output.Complete();
+
+     EXPECT_EQ(cur.expected_success, success);
+     EXPECT_EQ(cur.expected, canonical);
+
+     // Make sure the spec was properly identified, the filesystem canonicalizer
+     // has different code for writing the spec. "filesystem" is 10 characters.
+     EXPECT_EQ(0, result_parsed.scheme.begin);
+     EXPECT_EQ(10, result_parsed.scheme.len);
+
+     // A successfully canonicalized URL must carry a non-empty path.
+     if (success) {
+       EXPECT_GT(result_parsed.path.len, 0);
+     }
+   }
+ }
+
+TEST(URLCanonTest, CanonicalizePathURL) {
+ // Path URLs should get canonicalized schemes but nothing else.
+ struct PathCase {
+ const char* input;
+ const char* expected;
+ } path_cases[] = {
+ {"javascript:", "javascript:"},
+ {"JavaScript:Foo", "javascript:Foo"},
+ {":\":This /is interesting;?#", ":\":This /is interesting;?#"},
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) {
+ int url_len = static_cast<int>(strlen(path_cases[i].input));
+ url_parse::Parsed parsed;
+ url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed);
+
+ url_parse::Parsed out_parsed;
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ bool success = url_canon::CanonicalizePathURL(path_cases[i].input, url_len,
+ parsed, &output,
+ &out_parsed);
+ output.Complete();
+
+ EXPECT_TRUE(success);
+ EXPECT_EQ(path_cases[i].expected, out_str);
+
+ EXPECT_EQ(0, out_parsed.host.begin);
+ EXPECT_EQ(-1, out_parsed.host.len);
+
+ // When the input ends with a colon, there should be no path.
+ if (path_cases[i].input[url_len - 1] == ':') {
+ EXPECT_EQ(0, out_parsed.path.begin);
+ EXPECT_EQ(-1, out_parsed.path.len);
+ }
+ }
+}
+
+TEST(URLCanonTest, CanonicalizeMailtoURL) {
+ struct URLCase {
+ const char* input;
+ const char* expected;
+ bool expected_success;
+ url_parse::Component expected_path;
+ url_parse::Component expected_query;
+ } cases[] = {
+ {"mailto:addr1", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()},
+ {"mailto:addr1@foo.com", "mailto:addr1@foo.com", true, url_parse::Component(7, 13), url_parse::Component()},
+ // Trailing whitespace is stripped.
+ {"MaIlTo:addr1 \t ", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()},
+ {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, url_parse::Component(7, 5), url_parse::Component(13,6)},
+ {"mailto:addr1,addr2", "mailto:addr1,addr2", true, url_parse::Component(7, 11), url_parse::Component()},
+ {"mailto:addr1, addr2", "mailto:addr1, addr2", true, url_parse::Component(7, 12), url_parse::Component()},
+ {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, url_parse::Component(7, 13), url_parse::Component()},
+ {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true, url_parse::Component(7, 12), url_parse::Component()},
+ // Null character should be escaped to %00
+ {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true, url_parse::Component(7, 13), url_parse::Component(21, 3)},
+ // Invalid -- UTF-8 encoded surrogate value.
+ {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD", false, url_parse::Component(7, 9), url_parse::Component()},
+ {"mailto:addr1?", "mailto:addr1?", true, url_parse::Component(7, 5), url_parse::Component(13, 0)},
+ };
+
+ // Define outside of loop to catch bugs where components aren't reset
+ url_parse::Parsed parsed;
+ url_parse::Parsed out_parsed;
+
+ for (size_t i = 0; i < ARRAYSIZE(cases); i++) {
+ int url_len = static_cast<int>(strlen(cases[i].input));
+ if (i == 8) {
+ // The 9th test case purposely has a '\0' in it -- don't count it
+ // as the string terminator.
+ url_len = 22;
+ }
+ url_parse::ParseMailtoURL(cases[i].input, url_len, &parsed);
+
+ std::string out_str;
+ url_canon::StdStringCanonOutput output(&out_str);
+ bool success = url_canon::CanonicalizeMailtoURL(cases[i].input, url_len,
+ parsed, &output,
+ &out_parsed);
+ output.Complete();
+
+ EXPECT_EQ(cases[i].expected_success, success);
+ EXPECT_EQ(cases[i].expected, out_str);
+
+ // Make sure the spec was properly identified
+ EXPECT_EQ(0, out_parsed.scheme.begin);
+ EXPECT_EQ(6, out_parsed.scheme.len);
+
+ EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
+ EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
+
+ EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
+ EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
+ }
+}
+
+#ifndef WIN32
+
+TEST(URLCanonTest, _itoa_s) {
+ // We fill the buffer with 0xff to ensure that it's getting properly
+ // null-terminated. We also allocate one byte more than what we tell
+ // _itoa_s about, and ensure that the extra byte is untouched.
+ char buf[6];
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itoa_s(12, buf, sizeof(buf) - 1, 10));
+ EXPECT_STREQ("12", buf);
+ EXPECT_EQ('\xFF', buf[3]);
+
+ // Test the edge cases - exactly the buffer size and one over
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 10));
+ EXPECT_STREQ("1234", buf);
+ EXPECT_EQ('\xFF', buf[5]);
+
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(EINVAL, url_canon::_itoa_s(12345, buf, sizeof(buf) - 1, 10));
+ EXPECT_EQ('\xFF', buf[5]); // should never write to this location
+
+ // Test the template overload (note that this will see the full buffer)
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itoa_s(12, buf, 10));
+ EXPECT_STREQ("12", buf);
+ EXPECT_EQ('\xFF', buf[3]);
+
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itoa_s(12345, buf, 10));
+ EXPECT_STREQ("12345", buf);
+
+ EXPECT_EQ(EINVAL, url_canon::_itoa_s(123456, buf, 10));
+
+ // Test that radix 16 is supported.
+ memset(buf, 0xff, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 16));
+ EXPECT_STREQ("4d2", buf);
+ EXPECT_EQ('\xFF', buf[5]);
+}
+
+TEST(URLCanonTest, _itow_s) {
+ // We fill the buffer with 0xff to ensure that it's getting properly
+ // null-terminated. We also allocate one character more than what we tell
+ // _itow_s about, and ensure that the extra character is untouched.
+ char16 buf[6];
+ const char fill_mem = 0xff;
+ const char16 fill_char = 0xffff;
+ memset(buf, fill_mem, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itow_s(12, buf, sizeof(buf) / 2 - 1, 10));
+ EXPECT_EQ(WStringToUTF16(L"12"), string16(buf));
+ EXPECT_EQ(fill_char, buf[3]);
+
+ // Test the edge cases - exactly the buffer size and one over
+ EXPECT_EQ(0, url_canon::_itow_s(1234, buf, sizeof(buf) / 2 - 1, 10));
+ EXPECT_EQ(WStringToUTF16(L"1234"), string16(buf));
+ EXPECT_EQ(fill_char, buf[5]);
+
+ memset(buf, fill_mem, sizeof(buf));
+ EXPECT_EQ(EINVAL, url_canon::_itow_s(12345, buf, sizeof(buf) / 2 - 1, 10));
+ EXPECT_EQ(fill_char, buf[5]); // should never write to this location
+
+ // Test the template overload (note that this will see the full buffer)
+ memset(buf, fill_mem, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itow_s(12, buf, 10));
+ EXPECT_EQ(WStringToUTF16(L"12"), string16(buf));
+ EXPECT_EQ(fill_char, buf[3]);
+
+ memset(buf, fill_mem, sizeof(buf));
+ EXPECT_EQ(0, url_canon::_itow_s(12345, buf, 10));
+ EXPECT_EQ(WStringToUTF16(L"12345"), string16(buf));
+
+ EXPECT_EQ(EINVAL, url_canon::_itow_s(123456, buf, 10));
+}
+
+#endif // !WIN32
+
+// Returns true if the given two structures are the same.
+static bool ParsedIsEqual(const url_parse::Parsed& a,
+ const url_parse::Parsed& b) {
+ return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len &&
+ a.username.begin == b.username.begin && a.username.len == b.username.len &&
+ a.password.begin == b.password.begin && a.password.len == b.password.len &&
+ a.host.begin == b.host.begin && a.host.len == b.host.len &&
+ a.port.begin == b.port.begin && a.port.len == b.port.len &&
+ a.path.begin == b.path.begin && a.path.len == b.path.len &&
+ a.query.begin == b.query.begin && a.query.len == b.query.len &&
+ a.ref.begin == b.ref.begin && a.ref.len == b.ref.len;
+}
+
+TEST(URLCanonTest, ResolveRelativeURL) {
+ struct RelativeCase {
+ const char* base; // Input base URL: MUST BE CANONICAL
+ bool is_base_hier; // Is the base URL hierarchical
+ bool is_base_file; // Tells us if the base is a file URL.
+ const char* test; // Input URL to test against.
+ bool succeed_relative; // Whether we expect IsRelativeURL to succeed
+ bool is_rel; // Whether we expect |test| to be relative or not.
+ bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed.
+ const char* resolved; // What we expect in the result when resolving.
+ } rel_cases[] = {
+ // Basic absolute input.
+ {"http://host/a", true, false, "http://another/", true, false, false, NULL},
+ {"http://host/a", true, false, "http:////another/", true, false, false, NULL},
+ // Empty relative URLs should only remove the ref part of the URL,
+ // leaving the rest unchanged.
+ {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
+ {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"},
+ {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
+ // Spaces at the ends of the relative path should be ignored.
+ {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"},
+ {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"},
+ {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"},
+ // Matching schemes without two slashes are treated as relative.
+ {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"},
+ {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"},
+ {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"},
+ {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"},
+ // Nonmatching schemes are absolute.
+ {"http://host/a", true, false, "https:host2", true, false, false, NULL},
+ {"http://host/a", true, false, "htto:/host2", true, false, false, NULL},
+ // Absolute path input
+ {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"},
+ {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"},
+ {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"},
+ {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"},
+ {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"},
+ {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"},
+ // Relative path input
+ {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
+ {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"},
+ {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"},
+ {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
+ {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
+ {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
+ {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
+ {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"},
+ {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"},
+ // Query input
+ {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"},
+ {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"},
+ {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"},
+ // Ref input
+ {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"},
+ {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
+ {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"},
+ // Non-hierarchical base: no relative handling. Relative input should
+ // error, and if a scheme is present, it should be treated as absolute.
+ {"data:foobar", false, false, "baz.html", false, false, false, NULL},
+ {"data:foobar", false, false, "data:baz", true, false, false, NULL},
+ {"data:foobar", false, false, "data:/base", true, false, false, NULL},
+ // Non-hierarchical base: absolute input should succeed.
+ {"data:foobar", false, false, "http://host/", true, false, false, NULL},
+ {"data:foobar", false, false, "http:host", true, false, false, NULL},
+ // Invalid schemes should be treated as relative.
+ {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"},
+ {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"},
+ {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"},
+ {"data:asdf", false, false, ":foo", false, false, false, NULL},
+ // We should treat semicolons like any other character in URL resolving
+ {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"},
+ {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"},
+ {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"},
+ // Relative URLs can also be written as "//foo/bar" which is relative to
+ // the scheme. In this case, it would take the old scheme, so for http
+ // the example would resolve to "http://foo/bar".
+ {"http://host/a", true, false, "//another", true, true, true, "http://another/"},
+ {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"},
+ {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"},
+ {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"},
+ {"http://host/a", true, false, "//", true, true, false, "http:"},
+ // IE will also allow one or the other to be a backslash to get the same
+ // behavior.
+ {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"},
+ {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"},
+#ifdef WIN32
+ // Resolving against Windows file base URLs.
+ {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL},
+ {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
+ {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"},
+ {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"},
+ // But two backslashes on Windows should be UNC so should be treated
+ // as absolute.
+ {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL},
+ // IE doesn't support drive specs starting with two slashes. It fails
+ // immediately and doesn't even try to load. We fix it up to either
+ // an absolute path or UNC depending on what it looks like.
+ {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"},
+ {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"},
+ // Windows drive specs should be allowed and treated as absolute.
+ {"file:///C:/foo", true, true, "c:", true, false, false, NULL},
+ {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL},
+ {"http://host/a", true, false, "c:\\foo", true, false, false, NULL},
+ // Relative paths with drive letters should be allowed when the base is
+ // also a file.
+ {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"},
+ // Treat absolute paths as being off of the drive.
+ {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"},
+ {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"},
+ {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"},
+ // On Windows, two slashes without a drive letter when the base is a file
+ // means that the path is UNC.
+ {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"},
+ {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"},
+#else
+ // On Unix we fall back to relative behavior since there's nothing else
+ // reasonable to do.
+ {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"},
+#endif
+ // Even on Windows, we don't allow relative drive specs when the base
+ // is not file.
+ {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"},
+ {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"},
+ // Filesystem URL tests; filesystem URLs are only valid and relative if
+ // they have no scheme, e.g. "./index.html". There's no valid equivalent
+ // to http:index.html.
+ {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL},
+ {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL},
+ {"filesystem:http://host/t/path", true, false, "http://host/t/path2", true, false, false, NULL},
+ {"http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL},
+ {"filesystem:http://host/t/path", true, false, "./path2", true, true, true, "filesystem:http://host/t/path2"},
+ {"filesystem:http://host/t/path/", true, false, "path2", true, true, true, "filesystem:http://host/t/path/path2"},
+ {"filesystem:http://host/t/path", true, false, "filesystem:http:path2", true, false, false, NULL},
+ // Absolute URLs are still not relative to a non-standard base URL.
+ {"about:blank", false, false, "http://X/A", true, false, true, ""},
+ {"about:blank", false, false, "content://content.Provider/", true, false, true, ""},
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE(rel_cases); i++) {
+ const RelativeCase& cur_case = rel_cases[i];
+
+ url_parse::Parsed parsed;
+ int base_len = static_cast<int>(strlen(cur_case.base));
+ if (cur_case.is_base_file)
+ url_parse::ParseFileURL(cur_case.base, base_len, &parsed);
+ else if (cur_case.is_base_hier)
+ url_parse::ParseStandardURL(cur_case.base, base_len, &parsed);
+ else
+ url_parse::ParsePathURL(cur_case.base, base_len, &parsed);
+
+ // First see if it is relative.
+ int test_len = static_cast<int>(strlen(cur_case.test));
+ bool is_relative;
+ url_parse::Component relative_component;
+ bool succeed_is_rel = url_canon::IsRelativeURL(
+ cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
+ &is_relative, &relative_component);
+
+ EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
+ "succeed is rel failure on " << cur_case.test;
+ EXPECT_EQ(cur_case.is_rel, is_relative) <<
+ "is rel failure on " << cur_case.test;
+ // Now resolve it.
+ if (succeed_is_rel && is_relative && cur_case.is_rel) {
+ std::string resolved;
+ url_canon::StdStringCanonOutput output(&resolved);
+ url_parse::Parsed resolved_parsed;
+
+ bool succeed_resolve = url_canon::ResolveRelativeURL(
+ cur_case.base, parsed, cur_case.is_base_file,
+ cur_case.test, relative_component, NULL, &output, &resolved_parsed);
+ output.Complete();
+
+ EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
+ EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;
+
+ // Verify that the output parsed structure is the same as the one obtained
+ // by parsing the resolved URL freshly.
+ url_parse::Parsed ref_parsed;
+ int resolved_len = static_cast<int>(resolved.size());
+ if (cur_case.is_base_file)
+ url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed);
+ else if (cur_case.is_base_hier)
+ url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed);
+ else
+ url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed);
+ EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
+ }
+ }
+}
+
+// It used to be that when we did a replacement with a long buffer of UTF-16
+// characters, we would get invalid data in the URL. This was because the
+// buffer used to hold the UTF-8 data was resized while some pointers still
+// referred to the old buffer that had been removed.
+TEST(URLCanonTest, ReplacementOverflow) {
+ const char src[] = "file:///C:/foo/bar";
+ int src_len = static_cast<int>(strlen(src));
+ url_parse::Parsed parsed;
+ url_parse::ParseFileURL(src, src_len, &parsed);
+
+ // Override two components, the path with something short, and the query with
+ // something long enough to trigger the bug.
+ url_canon::Replacements<char16> repl;
+ string16 new_query;
+ for (int i = 0; i < 4800; i++)
+ new_query.push_back('a');
+
+ string16 new_path(WStringToUTF16(L"/foo"));
+ repl.SetPath(new_path.c_str(), url_parse::Component(0, 4));
+ repl.SetQuery(new_query.c_str(),
+ url_parse::Component(0, static_cast<int>(new_query.length())));
+
+ // Call ReplaceComponents on the string. It doesn't matter if we call it for
+ // standard URLs, file URLs, etc, since they will go to the same replacement
+ // function that was buggy.
+ url_parse::Parsed repl_parsed;
+ std::string repl_str;
+ url_canon::StdStringCanonOutput repl_output(&repl_str);
+ url_canon::ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed);
+ repl_output.Complete();
+
+ // Generate the expected string and check.
+ std::string expected("file:///foo?");
+ for (size_t i = 0; i < new_query.length(); i++)
+ expected.push_back('a');
+ EXPECT_TRUE(expected == repl_str);
+}
diff --git a/url/url_common.h b/url/url_common.h
new file mode 100644
index 0000000..ac045a8
--- /dev/null
+++ b/url/url_common.h
@@ -0,0 +1,54 @@
+// Copyright 2010, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_COMMON_H__
+#define GOOGLEURL_SRC_URL_COMMON_H__
+
+#if !defined(GURL_IMPLEMENTATION)
+#define GURL_IMPLEMENTATION 0
+#endif
+
+#if defined(GURL_DLL)
+#if defined(WIN32)
+#if GURL_IMPLEMENTATION
+#define GURL_API __declspec(dllexport)
+#else
+#define GURL_API __declspec(dllimport)
+#endif
+#else
+// Non-Windows DLLs.
+#define GURL_API __attribute__((visibility("default")))
+#endif
+#else
+// Not a DLL.
+#define GURL_API
+#endif
+
+#endif // GOOGLEURL_SRC_URL_COMMON_H__
+
diff --git a/url/url_file.h b/url/url_file.h
new file mode 100644
index 0000000..c1b8ac9
--- /dev/null
+++ b/url/url_file.h
@@ -0,0 +1,108 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Provides shared functions used by the internals of the parser and
+// canonicalizer for file URLs. Do not use outside of these modules.
+
+#ifndef GOOGLEURL_SRC_URL_FILE_H__
+#define GOOGLEURL_SRC_URL_FILE_H__
+
+#include "googleurl/src/url_parse_internal.h"
+
+namespace url_parse {
+
+#ifdef WIN32
+
+// We allow both "c:" and "c|" as drive identifiers.
+inline bool IsWindowsDriveSeparator(char16 ch) {
+ return ch == ':' || ch == '|';
+}
+inline bool IsWindowsDriveLetter(char16 ch) {
+ return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
+}
+
+#endif // WIN32
+
+// Returns the index of the first slash in the input at or after begin_index,
+// or spec_len if the end of the input is reached without finding one.
+template<typename CHAR>
+inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) {
+ int idx = begin_index;
+ while (idx < spec_len && !IsURLSlash(spec[idx]))
+ idx++;
+ return idx;
+}
+
+#ifdef WIN32
+
+// Returns true if the start_offset in the given spec looks like it begins a
+// drive spec, for example "c:". This function explicitly handles start_offset
+// values that are equal to or larger than the spec_len to simplify callers.
+//
+// If this returns true, the spec is guaranteed to have a valid drive letter
+// plus a drive separator (':' or '|') starting at |start_offset|.
+template<typename CHAR>
+inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset,
+ int spec_len) {
+ int remaining_len = spec_len - start_offset;
+ if (remaining_len < 2)
+ return false; // Not enough room.
+ if (!IsWindowsDriveLetter(spec[start_offset]))
+ return false; // Doesn't start with a valid drive letter.
+ if (!IsWindowsDriveSeparator(spec[start_offset + 1]))
+ return false; // Isn't followed with a drive separator.
+ return true;
+}
+
+// Returns true if the start_offset in the given text looks like it begins a
+// UNC path, for example "\\". This function explicitly handles start_offset
+// values that are equal to or larger than the spec_len to simplify callers.
+//
+// When strict_slashes is set, this function will only accept backslashes as is
+// standard for Windows. Otherwise, it will accept forward slashes as well
+// which we use for a lot of URL handling.
+template<typename CHAR>
+inline bool DoesBeginUNCPath(const CHAR* text,
+ int start_offset,
+ int len,
+ bool strict_slashes) {
+ int remaining_len = len - start_offset;
+ if (remaining_len < 2)
+ return false;
+
+ if (strict_slashes)
+ return text[start_offset] == '\\' && text[start_offset + 1] == '\\';
+ return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]);
+}
+
+#endif // WIN32
+
+} // namespace url_parse
+
+#endif // GOOGLEURL_SRC_URL_FILE_H__
diff --git a/url/url_parse.cc b/url/url_parse.cc
new file mode 100644
index 0000000..b06f4bb
--- /dev/null
+++ b/url/url_parse.cc
@@ -0,0 +1,923 @@
+/* Based on nsURLParsers.cc from Mozilla
+ * -------------------------------------
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ * Darin Fisher (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "googleurl/src/url_parse.h"
+
+#include <stdlib.h>
+
+#include "base/logging.h"
+#include "googleurl/src/url_parse_internal.h"
+#include "googleurl/src/url_util.h"
+#include "googleurl/src/url_util_internal.h"
+
+namespace url_parse {
+
+namespace {
+
+// A port may only contain the ASCII decimal digits '0' through '9'; this
+// reports whether |ch| is one of them.
+inline bool IsPortDigit(char16 ch) {
+  const bool below_zero = ch < '0';
+  const bool above_nine = ch > '9';
+  return !below_zero && !above_nine;
+}
+
+// Scans |spec| forward from |start_offset| and returns the index of the first
+// character accepted by IsAuthorityTerminator(). When there is no such
+// character before |spec_len|, |spec_len| itself is returned.
+template<typename CHAR>
+int FindNextAuthorityTerminator(const CHAR* spec,
+                                int start_offset,
+                                int spec_len) {
+  int pos = start_offset;
+  while (pos < spec_len && !IsAuthorityTerminator(spec[pos]))
+    ++pos;
+
+  // |pos| can only be past the end if the caller started there; report
+  // "not found" as |spec_len| in every such case.
+  return pos < spec_len ? pos : spec_len;
+}
+
+// Splits the user-info section |user| of |spec| at its first ':' into
+// |username| (before the colon) and |password| (after it). Without a colon the
+// whole section becomes the username and the password is left invalid.
+template<typename CHAR>
+void ParseUserInfo(const CHAR* spec,
+                   const Component& user,
+                   Component* username,
+                   Component* password) {
+  const int user_end = user.begin + user.len;
+  for (int pos = user.begin; pos < user_end; ++pos) {
+    if (spec[pos] == ':') {
+      // Found separator: <username>:<password>
+      *username = Component(user.begin, pos - user.begin);
+      *password = MakeRange(pos + 1, user_end);
+      return;
+    }
+  }
+
+  // No separator, so everything is the username.
+  *username = user;
+  *password = Component();
+}
+
+// Splits the server section |serverinfo| of |spec| into |hostname| and
+// |port_num|.
+//
+// The port is whatever follows the last ':' in the section, provided that
+// colon lies beyond the end of any IPv6 literal. An empty section resets both
+// outputs. A host that turns out to be empty (e.g. ":80") is reset rather than
+// reported as a zero-length component.
+template<typename CHAR>
+void ParseServerInfo(const CHAR* spec,
+                     const Component& serverinfo,
+                     Component* hostname,
+                     Component* port_num) {
+  if (serverinfo.len == 0) {
+    // No server info at all: there is neither a host nor a port.
+    hostname->reset();
+    port_num->reset();
+    return;
+  }
+
+  const int info_end = serverinfo.end();
+
+  // A leading '[' makes us assume the whole host is an IPv6 literal until a
+  // ']' says otherwise; without it we assume no part of the host is one. The
+  // IPv6 canonicalizer needs both brackets, but being able to locate an
+  // unterminated literal is still useful.
+  int ipv6_limit = -1;
+  if (spec[serverinfo.begin] == '[')
+    ipv6_limit = info_end;
+
+  // Remember the last ']' and the last ':' in the section.
+  int last_colon = -1;
+  for (int pos = serverinfo.begin; pos < info_end; ++pos) {
+    const CHAR ch = spec[pos];
+    if (ch == ']')
+      ipv6_limit = pos;
+    else if (ch == ':')
+      last_colon = pos;
+  }
+
+  if (last_colon <= ipv6_limit) {
+    // No port: <hostname>
+    *hostname = serverinfo;
+    port_num->reset();
+    return;
+  }
+
+  // Found a port number: <hostname>:<port>
+  *hostname = MakeRange(serverinfo.begin, last_colon);
+  if (hostname->len == 0)
+    hostname->reset();
+  *port_num = MakeRange(last_colon + 1, info_end);
+}
+
+// Breaks an already-identified authority section |auth| of |spec| into its
+// constituent components: |username|, |password|, |hostname| and |port_num|.
+// All four outputs are Components (ranges into |spec|); the port is not
+// converted to an integer here. An empty authority resets all of them.
+//
+// The user info and the server info are separated at the last '@' found in
+// the authority.
+template<typename CHAR>
+void DoParseAuthority(const CHAR* spec,
+                      const Component& auth,
+                      Component* username,
+                      Component* password,
+                      Component* hostname,
+                      Component* port_num) {
+  DCHECK(auth.is_valid()) << "We should always get an authority";
+  if (auth.len == 0) {
+    username->reset();
+    password->reset();
+    hostname->reset();
+    port_num->reset();
+    return;
+  }
+
+  // Walk backwards looking for the '@' that ends the user info. The walk stops
+  // on the first character of the authority at the latest, so that character
+  // is examined by the check below rather than by the loop.
+  const int auth_end = auth.begin + auth.len;
+  int at_pos = auth_end - 1;
+  for (; at_pos > auth.begin; --at_pos) {
+    if (spec[at_pos] == '@')
+      break;
+  }
+
+  if (spec[at_pos] != '@') {
+    // No user info, everything is server info.
+    username->reset();
+    password->reset();
+    ParseServerInfo(spec, auth, hostname, port_num);
+    return;
+  }
+
+  // Found user info: <user-info>@<server-info>
+  ParseUserInfo(spec, Component(auth.begin, at_pos - auth.begin),
+                username, password);
+  ParseServerInfo(spec, MakeRange(at_pos + 1, auth_end),
+                  hostname, port_num);
+}
+
+// Splits the full path |path| of |spec| into |filepath|, |query| and |ref|:
+//
+//   path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref>
+//
+// The ref starts after the first '#'. The query starts after the first '?'
+// that precedes that '#'. An invalid |path| (len == -1) resets all three
+// outputs, and an empty file path is reported as reset rather than as a
+// zero-length component.
+template<typename CHAR>
+void ParsePath(const CHAR* spec,
+               const Component& path,
+               Component* filepath,
+               Component* query,
+               Component* ref) {
+  // Special case when there is no path.
+  if (path.len == -1) {
+    filepath->reset();
+    query->reset();
+    ref->reset();
+    return;
+  }
+  DCHECK(path.len > 0) << "We should never have 0 length paths";
+
+  const int path_end = path.begin + path.len;
+
+  // Locate the separators. Nothing after the first '#' can matter, so the scan
+  // stops there; only the first '?' seen before it is kept.
+  int query_separator = -1;  // Index of the '?'
+  int ref_separator = -1;    // Index of the '#'
+  for (int pos = path.begin; pos < path_end; ++pos) {
+    if (spec[pos] == '#') {
+      ref_separator = pos;
+      break;
+    }
+    if (spec[pos] == '?' && query_separator < 0)
+      query_separator = pos;
+  }
+
+  // The components are resolved from the end back to the beginning. Each
+  // |*_end| value is the index one past the corresponding component.
+
+  // Ref fragment: from the '#' to the end of the path.
+  const int query_end = ref_separator >= 0 ? ref_separator : path_end;
+  if (ref_separator >= 0)
+    *ref = MakeRange(ref_separator + 1, path_end);
+  else
+    ref->reset();
+
+  // Query fragment: from the '?' to the next boundary (the ref or the end).
+  int file_end = query_end;
+  if (query_separator >= 0) {
+    *query = MakeRange(query_separator + 1, query_end);
+    file_end = query_separator;
+  } else {
+    query->reset();
+  }
+
+  // File path: treat an empty file path as no file path.
+  if (file_end == path.begin)
+    filepath->reset();
+  else
+    *filepath = MakeRange(path.begin, file_end);
+}
+
+// Locates the scheme of |url|: the range from the first character that
+// ShouldTrimFromURL() does not reject up to (not including) the first ':'.
+// Returns true and fills |scheme| on success. Returns false, leaving |scheme|
+// untouched, when the input is empty, entirely trimmable, or has no colon.
+template<typename CHAR>
+bool DoExtractScheme(const CHAR* url,
+                     int url_len,
+                     Component* scheme) {
+  // Skip leading whitespace and control characters.
+  int first = 0;
+  for (; first < url_len; ++first) {
+    if (!ShouldTrimFromURL(url[first]))
+      break;
+  }
+  if (first == url_len)
+    return false;  // Input is empty or all whitespace.
+
+  // Advance to the first colon, if there is one.
+  int colon = first;
+  while (colon < url_len && url[colon] != ':')
+    ++colon;
+  if (colon >= url_len)
+    return false;  // No colon found: no scheme
+
+  *scheme = MakeRange(first, colon);
+  return true;
+}
+
+// Fills in every member of |parsed| except the scheme.
+//
+// |spec| is the full spec being parsed and |spec_len| its length.
+// |after_scheme| is the index of the character right after the scheme's colon,
+// which is where parsing starts.
+//
+// Any number of slashes following the scheme is skipped, and whatever comes
+// next is treated as the authority. Compatibility data points, listing the
+// "host", "path" extracted:
+//   Input               IE6             Firefox                Us
+//   -----               --------------  --------------         --------------
+//   http://foo.com/     "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
+//   http:foo.com/       "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
+//   http:/foo.com/      fail(*)         "foo.com", "/"         "foo.com", "/"
+//   http:\foo.com/      fail(*)         "\foo.com", "/"(fail)  "foo.com", "/"
+//   http:////foo.com/   "foo.com", "/"  "foo.com", "/"         "foo.com", "/"
+//
+// (*) Interestingly, although IE fails to load these URLs, its history
+//     canonicalizer handles them, meaning if you've been to the corresponding
+//     "http://foo.com/" link, it will be colored.
+template <typename CHAR>
+void DoParseAfterScheme(const CHAR* spec,
+                        int spec_len,
+                        int after_scheme,
+                        Parsed* parsed) {
+  // The authority begins after the run of slashes, however long that run is,
+  // and extends to the next authority terminator (or the end of the spec).
+  const int authority_begin =
+      after_scheme + CountConsecutiveSlashes(spec, after_scheme, spec_len);
+  const int authority_end =
+      FindNextAuthorityTerminator(spec, authority_begin, spec_len);
+
+  // First split into the two main parts: the authority (username, password,
+  // host and port) and the full path (path, query and reference). With no
+  // terminator there is no path at all; otherwise the path runs from the
+  // terminator to the end of the spec.
+  const Component authority(authority_begin, authority_end - authority_begin);
+  const Component full_path =
+      authority_end == spec_len
+          ? Component()
+          : Component(authority_end, spec_len - authority_end);
+
+  // Now parse those two sub-parts.
+  DoParseAuthority(spec, authority, &parsed->username, &parsed->password,
+                   &parsed->host, &parsed->port);
+  ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Main parser for standard URLs, i.e. those made of a scheme, host, path and
+// so on. Extracts the scheme, then hands the remainder to
+// DoParseAfterScheme().
+template<typename CHAR>
+void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Without a colon we report "no scheme" and parse from the trimmed start.
+  // Calling everything the scheme would be equally invalid, but this choice
+  // seems less wrong in more cases.
+  int after_scheme = begin;
+  if (DoExtractScheme(spec, spec_len, &parsed->scheme))
+    after_scheme = parsed->scheme.end() + 1;  // Skip past the colon.
+  else
+    parsed->scheme.reset();
+
+  DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
+}
+
+// Parses a filesystem URL, which wraps an inner URL after its own scheme, for
+// example "filesystem:http://host/temporary/dir/file?query#ref".
+//
+// On return:
+//  - parsed->scheme is the outer scheme, or reset if none was found.
+//  - parsed->username, password, host and port are always reset.
+//  - parsed->inner_parsed() holds the inner URL's components, with offsets
+//    relative to |spec|. Its path keeps only the leading "/<type>" segment,
+//    and its query and ref are reset.
+//  - parsed->path is the remainder of the inner path, and parsed->query and
+//    parsed->ref are the inner URL's query and ref.
+// Parsing stops early, leaving the remaining components reset, when the outer
+// or inner scheme is missing, nothing follows a scheme's colon, or the inner
+// scheme is "filesystem" itself or is not a standard scheme.
+template<typename CHAR>
+void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Get the unused parts of the URL out of the way.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->path.reset();          // May use this; reset for convenience.
+  parsed->ref.reset();           // May use this; reset for convenience.
+  parsed->query.reset();         // May use this; reset for convenience.
+  parsed->clear_inner_parsed();  // May use this; reset for convenience.
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Handle empty specs or ones that contain only whitespace or control chars.
+  if (begin == spec_len) {
+    parsed->scheme.reset();
+    return;
+  }
+
+  int inner_start = -1;
+
+  // Extract the scheme. We also handle the case where there is no scheme.
+  if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    parsed->scheme.begin += begin;
+
+    // Nothing follows the colon, so there is no inner URL to parse.
+    if (parsed->scheme.end() == spec_len - 1)
+      return;
+
+    inner_start = parsed->scheme.end() + 1;
+  } else {
+    // No scheme found; that's not valid for filesystem URLs.
+    parsed->scheme.reset();
+    return;
+  }
+
+  url_parse::Component inner_scheme;
+  const CHAR* inner_spec = &spec[inner_start];
+  int inner_spec_len = spec_len - inner_start;
+
+  if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) {
+    // Offset the results since we gave ExtractScheme a substring.
+    inner_scheme.begin += inner_start;
+
+    // Nothing follows the inner scheme's colon.
+    if (inner_scheme.end() == spec_len - 1)
+      return;
+  } else {
+    // No scheme found; that's not valid for filesystem URLs.
+    // The best we can do is return "filesystem://".
+    return;
+  }
+
+  Parsed inner_parsed;
+
+  if (url_util::CompareSchemeComponent(
+          spec, inner_scheme, url_util::kFileScheme)) {
+    // File URLs are special.
+    ParseFileURL(inner_spec, inner_spec_len, &inner_parsed);
+  } else if (url_util::CompareSchemeComponent(spec, inner_scheme,
+                                              url_util::kFileSystemScheme)) {
+    // Filesystem URLs don't nest.
+    return;
+  } else if (url_util::IsStandard(spec, inner_scheme)) {
+    // All "normal" URLs.
+    DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed);
+  } else {
+    return;
+  }
+
+  // All members of inner_parsed need to be offset by inner_start.
+  // If we had any scheme that supported nesting more than one level deep,
+  // we'd have to recurse into the inner_parsed's inner_parsed when
+  // adjusting by inner_start.
+  inner_parsed.scheme.begin += inner_start;
+  inner_parsed.username.begin += inner_start;
+  inner_parsed.password.begin += inner_start;
+  inner_parsed.host.begin += inner_start;
+  inner_parsed.port.begin += inner_start;
+  inner_parsed.query.begin += inner_start;
+  inner_parsed.ref.begin += inner_start;
+  inner_parsed.path.begin += inner_start;
+
+  // Query and ref move from inner_parsed to parsed.
+  parsed->query = inner_parsed.query;
+  inner_parsed.query.reset();
+  parsed->ref = inner_parsed.ref;
+  inner_parsed.ref.reset();
+
+  parsed->set_inner_parsed(inner_parsed);
+  if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() ||
+      inner_parsed.inner_parsed()) {
+    return;
+  }
+
+  // The path in inner_parsed should start with a slash, then have a filesystem
+  // type followed by a slash. From the first slash up to but excluding the
+  // second should be what it keeps; the rest goes to parsed. If the path ends
+  // before the second slash, it's still pretty clear what the user meant, so
+  // we'll let that through.
+  if (!IsURLSlash(spec[inner_parsed.path.begin])) {
+    return;
+  }
+
+  // Only look for the second slash inside the inner path itself. Scanning all
+  // the way to |spec_len| would run into the query or ref when the path has no
+  // second slash (e.g. "filesystem:http://h/t?a/b"), which would make the
+  // inner path swallow the '?' and give the outer path a negative length.
+  int inner_path_limit = inner_parsed.path.end();
+  if (inner_path_limit > spec_len)
+    inner_path_limit = spec_len;
+
+  int inner_path_end = inner_parsed.path.begin + 1;  // skip the leading slash
+  while (inner_path_end < inner_path_limit &&
+         !IsURLSlash(spec[inner_path_end]))
+    ++inner_path_end;
+  parsed->path.begin = inner_path_end;
+  int new_inner_path_length = inner_path_end - inner_parsed.path.begin;
+  parsed->path.len = inner_parsed.path.len - new_inner_path_length;
+  parsed->inner_parsed()->path.len = new_inner_path_length;
+}
+
+// Parses a "path URL": nothing more than a scheme followed by a path, such as
+// "about:foo" or "javascript:alert('bar');". Only |parsed->scheme| and
+// |parsed->path| can end up valid; every other component is reset.
+template<typename CHAR>
+void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  // Clear everything that is neither scheme nor path; those parts are never
+  // used for this kind of URL.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->query.reset();
+  parsed->ref.reset();
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Handle empty specs or ones that contain only whitespace or control chars.
+  if (begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  if (!ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+    // No scheme found, just path.
+    parsed->scheme.reset();
+    parsed->path = MakeRange(begin, spec_len);
+    return;
+  }
+
+  // Offset the results since we gave ExtractScheme a substring.
+  parsed->scheme.begin += begin;
+
+  // The path is everything after the colon. For compatibility with the
+  // standard URL parser, "no path" is reported as -1 (reset) rather than as a
+  // zero-length component, even though it matters less for these URLs.
+  const int after_colon = parsed->scheme.end() + 1;
+  if (after_colon == spec_len)
+    parsed->path.reset();
+  else
+    parsed->path = MakeRange(after_colon, spec_len);
+}
+
+// Parses a mailto URL: a scheme, a path, and optionally a query introduced by
+// the first '?' in the path. Username, password, host, port and ref are always
+// reset. An empty path is reported as reset rather than zero-length.
+template<typename CHAR>
+void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+  DCHECK(spec_len >= 0);
+
+  // Clear the parts that are never used for this kind of URL.
+  parsed->username.reset();
+  parsed->password.reset();
+  parsed->host.reset();
+  parsed->port.reset();
+  parsed->ref.reset();
+  parsed->query.reset();  // May use this; reset for convenience.
+
+  // Strip leading & trailing spaces and control characters.
+  int begin = 0;
+  TrimURL(spec, &begin, &spec_len);
+
+  // Handle empty specs or ones that contain only whitespace or control chars.
+  if (begin == spec_len) {
+    parsed->scheme.reset();
+    parsed->path.reset();
+    return;
+  }
+
+  // [path_begin, path_end) is everything after the scheme. Both stay at -1
+  // when a scheme exists but nothing follows its colon.
+  int path_begin = -1;
+  int path_end = -1;
+
+  const bool has_scheme =
+      ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme);
+  if (!has_scheme) {
+    // No scheme found, just path.
+    parsed->scheme.reset();
+    path_begin = begin;
+    path_end = spec_len;
+  } else {
+    // Offset the results since we gave ExtractScheme a substring.
+    parsed->scheme.begin += begin;
+
+    const int after_colon = parsed->scheme.end() + 1;
+    if (after_colon != spec_len) {
+      path_begin = after_colon;
+      path_end = spec_len;
+    }
+  }
+
+  // Split [path_begin, path_end) into a path + query at the first '?'.
+  int question = path_begin;
+  while (question < path_end && spec[question] != '?')
+    ++question;
+  if (question < path_end) {
+    parsed->query = MakeRange(question + 1, path_end);
+    path_end = question;
+  }
+
+  // For compatibility with the standard URL parser, treat no path as -1
+  // rather than as a component of length 0.
+  if (path_begin == path_end)
+    parsed->path.reset();
+  else
+    parsed->path = MakeRange(path_begin, path_end);
+}
+
+// Converts the port |component| of |spec| to an integer.
+//
+// Returns PORT_UNSPECIFIED when the component is empty or invalid, 0 when it
+// consists solely of '0' characters, and PORT_INVALID when it contains a
+// non-digit, has more than kMaxDigits significant digits, or exceeds 65535.
+// The input is not NULL-terminated, so the digits are accumulated by hand
+// instead of being passed to a C string conversion routine.
+template<typename CHAR>
+int DoParsePort(const CHAR* spec, const Component& component) {
+  const int kMaxDigits = 5;
+
+  // Easy success case when there is no port.
+  if (!component.is_nonempty())
+    return PORT_UNSPECIFIED;
+
+  // Skip over any leading 0s.
+  const int port_end = component.end();
+  int first_significant = component.begin;
+  while (first_significant < port_end && spec[first_significant] == '0')
+    ++first_significant;
+
+  const int digit_count = port_end - first_significant;
+  if (digit_count == 0)
+    return 0;  // All digits were 0.
+
+  // Too many significant digits can never be a valid port. Rejecting them here
+  // also keeps the accumulation below far away from integer overflow.
+  if (digit_count > kMaxDigits)
+    return PORT_INVALID;
+
+  int port = 0;
+  for (int pos = first_significant; pos < port_end; ++pos) {
+    const CHAR ch = spec[pos];
+    if (!IsPortDigit(ch))
+      return PORT_INVALID;  // Invalid port digit, fail.
+    port = port * 10 + static_cast<int>(ch - '0');
+  }
+
+  if (port > 65535)
+    return PORT_INVALID;  // Out of range.
+  return port;
+}
+
+// Finds the file name inside |path| of |spec| and stores its range in
+// |file_name|: the text after the last URL slash, excluding any trailing
+// ";<param>" section. An empty or invalid path has no file name, so
+// |file_name| is reset in that case.
+template<typename CHAR>
+void DoExtractFileName(const CHAR* spec,
+                       const Component& path,
+                       Component* file_name) {
+  // Handle empty paths: they have no file names.
+  if (!path.is_nonempty()) {
+    file_name->reset();
+    return;
+  }
+
+  // A parameter is a normally unused URL field introduced by a semicolon. It
+  // is parsed as part of the path but must not count towards the file name,
+  // so the name ends at the last semicolon. The first character of the path is
+  // expected to be a slash and is therefore not examined.
+  const int path_end = path.end();
+  int file_end = path_end;
+  int semicolon = path_end - 1;
+  while (semicolon > path.begin && spec[semicolon] != ';')
+    --semicolon;
+  if (semicolon > path.begin)
+    file_end = semicolon;
+
+  // Walk back from the end of the name until the character just before it is
+  // a slash. If the start of the path is reached first, the input was
+  // degenerate (paths generally start with a slash) and the whole thing is
+  // called the file name.
+  int name_begin = file_end;
+  while (name_begin > path.begin && !IsURLSlash(spec[name_begin - 1]))
+    --name_begin;
+
+  *file_name = MakeRange(name_begin, file_end);
+}
+
+// Pulls the next "key=value" pair off the front of |query| in |spec|.
+//
+// Returns false without touching the outputs when |query| is empty or
+// invalid. Otherwise fills |key| and |value| (either may be zero-length),
+// advances |query| past the pair and its trailing '&', and returns true, so
+// the function can be called repeatedly to walk the whole query.
+template<typename CHAR>
+bool DoExtractQueryKeyValue(const CHAR* spec,
+                            Component* query,
+                            Component* key,
+                            Component* value) {
+  if (!query->is_nonempty())
+    return false;
+
+  const int end = query->end();
+  int pos = query->begin;
+
+  // The key starts at the beginning of the input and runs up to the first
+  // '&' or '='.
+  key->begin = pos;
+  for (; pos < end; ++pos) {
+    if (spec[pos] == '&' || spec[pos] == '=')
+      break;
+  }
+  key->len = pos - key->begin;
+
+  // Skip the separator after the key (if any).
+  if (pos < end && spec[pos] == '=')
+    ++pos;
+
+  // The value runs up to the next '&'.
+  value->begin = pos;
+  for (; pos < end; ++pos) {
+    if (spec[pos] == '&')
+      break;
+  }
+  value->len = pos - value->begin;
+
+  // Finally skip the next separator if any
+  if (pos < end && spec[pos] == '&')
+    ++pos;
+
+  // Save the new query
+  *query = url_parse::MakeRange(pos, end);
+  return true;
+}
+
+} // namespace
+
+// Default constructor: starts with no nested Parsed attached (inner_parsed_ is
+// NULL). The Component members are left to their own default construction.
+Parsed::Parsed() : inner_parsed_(NULL) {
+}
+
+// Copy constructor: copies every component from |other|. inner_parsed_ starts
+// out NULL and, when |other| has a nested Parsed, that object is handed to
+// set_inner_parsed() instead of copying the raw pointer.
+Parsed::Parsed(const Parsed& other) :
+ scheme(other.scheme),
+ username(other.username),
+ password(other.password),
+ host(other.host),
+ port(other.port),
+ path(other.path),
+ query(other.query),
+ ref(other.ref),
+ inner_parsed_(NULL) {
+ if (other.inner_parsed_)
+ set_inner_parsed(*other.inner_parsed_);
+}
+
+// Copy assignment: takes over every component of |other|. The nested Parsed is
+// mirrored as well, set from |other|'s when it has one and cleared otherwise.
+// Self-assignment is a no-op.
+Parsed& Parsed::operator=(const Parsed& other) {
+  if (this == &other)
+    return *this;
+
+  scheme = other.scheme;
+  username = other.username;
+  password = other.password;
+  host = other.host;
+  port = other.port;
+  path = other.path;
+  query = other.query;
+  ref = other.ref;
+
+  if (!other.inner_parsed_)
+    clear_inner_parsed();
+  else
+    set_inner_parsed(*other.inner_parsed_);
+
+  return *this;
+}
+
+// Destructor: deletes the nested Parsed held in inner_parsed_, if any
+// (deleting a NULL pointer is a no-op).
+Parsed::~Parsed() {
+ delete inner_parsed_;
+}
+
+// Returns the length of the parsed spec: the end of the ref when there is one,
+// otherwise the number of characters that precede where a ref would start.
+int Parsed::Length() const {
+  return ref.is_valid() ? ref.end() : CountCharactersBefore(REF, false);
+}
+
+// Returns the number of characters in the spec that come before the component
+// |type|, i.e. the offset at which that component begins. If the requested
+// component is not present, the result is the beginning of the next valid
+// component after it, or the end of the last valid component before it when
+// nothing follows.
+//
+// |include_delimiter| only affects PORT, QUERY and REF. When it is true and
+// that component is present, the returned offset is that of the delimiter
+// character immediately before the component (begin - 1) rather than of the
+// component's first character.
+int Parsed::CountCharactersBefore(ComponentType type,
+ bool include_delimiter) const {
+ if (type == SCHEME)
+ return scheme.begin;
+
+ // There will be some characters after the scheme like "://" and we don't
+ // know how many. Search forwards for the next thing until we find one.
+ // |cur| tracks the end of the last valid component seen so far and is the
+ // answer when nothing at or after |type| exists.
+ int cur = 0;
+ if (scheme.is_valid())
+ cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme.
+
+ if (username.is_valid()) {
+ if (type <= USERNAME)
+ return username.begin;
+ cur = username.end() + 1; // Advance over the '@' or ':' at the end.
+ }
+
+ if (password.is_valid()) {
+ if (type <= PASSWORD)
+ return password.begin;
+ cur = password.end() + 1; // Advance over the '@' at the end.
+ }
+
+ if (host.is_valid()) {
+ if (type <= HOST)
+ return host.begin;
+ cur = host.end();
+ }
+
+ if (port.is_valid()) {
+ if (type < PORT || (type == PORT && include_delimiter))
+ return port.begin - 1; // Back over delimiter.
+ if (type == PORT)
+ return port.begin; // Don't want delimiter counted.
+ cur = port.end();
+ }
+
+ if (path.is_valid()) {
+ if (type <= PATH)
+ return path.begin;
+ cur = path.end();
+ }
+
+ if (query.is_valid()) {
+ if (type < QUERY || (type == QUERY && include_delimiter))
+ return query.begin - 1; // Back over delimiter.
+ if (type == QUERY)
+ return query.begin; // Don't want delimiter counted.
+ cur = query.end();
+ }
+
+ if (ref.is_valid()) {
+ if (type == REF && !include_delimiter)
+ return ref.begin; // Don't want delimiter counted.
+
+ // When there is a ref and we get here, either the ref was requested with
+ // its delimiter, or the component we wanted was before this and not found.
+ // In both cases the position of the '#' is the right answer.
+ return ref.begin - 1; // Back over delimiter.
+ }
+
+ return cur;
+}
+
+// Public 8-bit and 16-bit entry points for scheme extraction. Both forward to
+// the DoExtractScheme template and return its result: true when a scheme was
+// found and stored in |scheme|, false otherwise.
+bool ExtractScheme(const char* url, int url_len, Component* scheme) {
+ return DoExtractScheme(url, url_len, scheme);
+}
+
+bool ExtractScheme(const char16* url, int url_len, Component* scheme) {
+ return DoExtractScheme(url, url_len, scheme);
+}
+
+// Returns true when |ch| ends the authority section of a URL: a URL slash as
+// decided by IsURLSlash() (which, per the original note here, includes
+// backslash), '?' or '#'.
+// NOTE(review): the original comment pointed to DoParseAfterScheme for special
+// backslash handling, but that function treats every terminator alike through
+// FindNextAuthorityTerminator; confirm whether that pointer is stale.
+bool IsAuthorityTerminator(char16 ch) {
+ return IsURLSlash(ch) || ch == '?' || ch == '#';
+}
+
+// Public 8-bit and 16-bit entry points for locating the file name within
+// |path|. Both forward to the DoExtractFileName template, which stores the
+// resulting range in |file_name|.
+void ExtractFileName(const char* url,
+ const Component& path,
+ Component* file_name) {
+ DoExtractFileName(url, path, file_name);
+}
+
+void ExtractFileName(const char16* url,
+ const Component& path,
+ Component* file_name) {
+ DoExtractFileName(url, path, file_name);
+}
+
+// Public 8-bit and 16-bit entry points for walking a query string one
+// key/value pair at a time. Both forward to the DoExtractQueryKeyValue
+// template, which fills |key| and |value|, advances |query| past the pair,
+// and returns false once |query| is empty.
+bool ExtractQueryKeyValue(const char* url,
+ Component* query,
+ Component* key,
+ Component* value) {
+ return DoExtractQueryKeyValue(url, query, key, value);
+}
+
+bool ExtractQueryKeyValue(const char16* url,
+ Component* query,
+ Component* key,
+ Component* value) {
+ return DoExtractQueryKeyValue(url, query, key, value);
+}
+
+// Public 8-bit and 16-bit entry points for splitting the authority section
+// |auth| of |spec| into username, password, host name and port components.
+// Both forward to the DoParseAuthority template.
+void ParseAuthority(const char* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num) {
+ DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+void ParseAuthority(const char16* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num) {
+ DoParseAuthority(spec, auth, username, password, hostname, port_num);
+}
+
+// Public 8-bit and 16-bit entry points for converting the |port| component of
+// |url| to an integer. Both forward to the DoParsePort template, which yields
+// the port number, PORT_UNSPECIFIED for an empty component, or PORT_INVALID
+// for a malformed or out-of-range one.
+int ParsePort(const char* url, const Component& port) {
+ return DoParsePort(url, port);
+}
+
+int ParsePort(const char16* url, const Component& port) {
+ return DoParsePort(url, port);
+}
+
+// Public 8-bit and 16-bit entry points for parsing a standard URL (scheme,
+// authority, path, query, ref) into |parsed|. Both forward to the
+// DoParseStandardURL template.
+void ParseStandardURL(const char* url, int url_len, Parsed* parsed) {
+ DoParseStandardURL(url, url_len, parsed);
+}
+
+void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) {
+ DoParseStandardURL(url, url_len, parsed);
+}
+
+// Public 8-bit and 16-bit entry points for parsing a path URL (a scheme
+// followed only by a path, e.g. "about:foo") into |parsed|. Both forward to
+// the DoParsePathURL template.
+void ParsePathURL(const char* url, int url_len, Parsed* parsed) {
+ DoParsePathURL(url, url_len, parsed);
+}
+
+void ParsePathURL(const char16* url, int url_len, Parsed* parsed) {
+ DoParsePathURL(url, url_len, parsed);
+}
+
+// Public 8-bit and 16-bit entry points for parsing a filesystem URL, which
+// wraps an inner URL, into |parsed|. Both forward to the DoParseFileSystemURL
+// template.
+void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) {
+ DoParseFileSystemURL(url, url_len, parsed);
+}
+
+void ParseFileSystemURL(const char16* url, int url_len, Parsed* parsed) {
+ DoParseFileSystemURL(url, url_len, parsed);
+}
+
+// Public 8-bit and 16-bit entry points for parsing a mailto URL (scheme, path
+// and optional query) into |parsed|. Both forward to the DoParseMailtoURL
+// template.
+void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) {
+ DoParseMailtoURL(url, url_len, parsed);
+}
+
+void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) {
+ DoParseMailtoURL(url, url_len, parsed);
+}
+
+// 8-bit and 16-bit entry points that expose the file-local ParsePath template
+// to code outside this file. They split the full path |path| of |spec| into
+// |filepath|, |query| and |ref|.
+void ParsePathInternal(const char* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref) {
+ ParsePath(spec, path, filepath, query, ref);
+}
+
+void ParsePathInternal(const char16* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref) {
+ ParsePath(spec, path, filepath, query, ref);
+}
+
+// 8-bit and 16-bit entry points that expose the file-local DoParseAfterScheme
+// template. They fill in every member of |parsed| except the scheme, starting
+// at index |after_scheme| (the character following the scheme's colon).
+void ParseAfterScheme(const char* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed) {
+ DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
+}
+
+void ParseAfterScheme(const char16* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed) {
+ DoParseAfterScheme(spec, spec_len, after_scheme, parsed);
+}
+
+} // namespace url_parse
diff --git a/url/url_parse.h b/url/url_parse.h
new file mode 100644
index 0000000..3dbe98a
--- /dev/null
+++ b/url/url_parse.h
@@ -0,0 +1,373 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_PARSE_H__
+#define GOOGLEURL_SRC_URL_PARSE_H__
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
+
+namespace url_parse {
+
+// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and
+// KURLGoogle.cpp still rely on this type.
+typedef char16 UTF16Char;
+
+// Component ------------------------------------------------------------------
+
+// Represents a substring of a URL as a (begin, len) pair; a len of -1 means unspecified.
+struct Component {
+ Component() : begin(0), len(-1) {}  // Default: an unspecified component.
+
+ // Normal constructor: takes an offset and a length.
+ Component(int b, int l) : begin(b), len(l) {}
+
+ int end() const {  // Offset one past the last character of the component.
+ return begin + len;  // For an unspecified component (len == -1) this is begin - 1.
+ }
+
+ // Returns true if this component is valid, meaning the length is given. Even
+ // valid components may be empty to record the fact that they exist.
+ bool is_valid() const {
+ return (len != -1);
+ }
+
+ // Returns true if this component is specified and non-empty. If false, the
+ // component is either empty or invalid.
+ bool is_nonempty() const {
+ return (len > 0);
+ }
+
+ void reset() {  // Restores the default-constructed (unspecified) state.
+ begin = 0;
+ len = -1;
+ }
+
+ bool operator==(const Component& other) const {  // Equal iff both begin and len match.
+ return begin == other.begin && len == other.len;
+ }
+
+ int begin; // Index into the URL string (in units of its character type) where this component starts.
+ int len; // Will be -1 if the component is unspecified.
+};
+
+// Helper that returns a component created from the given begin and ending
+// points. The ending point is non-inclusive, so the length is end - begin.
+inline Component MakeRange(int begin, int end) {
+ return Component(begin, end - begin);
+}
+
+// Parsed ---------------------------------------------------------------------
+
+// A structure that holds the identified parts of an input URL. This structure
+// does NOT store the URL itself. The caller will have to store the URL text
+// and its corresponding Parsed structure separately.
+//
+// Typical usage would be:
+//
+//    url_parse::Parsed parsed;
+//    url_parse::Component scheme;
+//    if (!url_parse::ExtractScheme(url, url_len, &scheme))
+//      return I_CAN_NOT_FIND_THE_SCHEME_DUDE;
+//
+//    if (IsStandardScheme(url, scheme))  // Not provided by this component
+//      url_parse::ParseStandardURL(url, url_len, &parsed);
+//    else if (IsFileURL(url, scheme))    // Not provided by this component
+//      url_parse::ParseFileURL(url, url_len, &parsed);
+//    else
+//      url_parse::ParsePathURL(url, url_len, &parsed);
+//
+struct Parsed {
+ // Identifies different components.
+ enum ComponentType {
+ SCHEME,
+ USERNAME,
+ PASSWORD,
+ HOST,
+ PORT,
+ PATH,
+ QUERY,
+ REF,
+ };
+
+ // The default constructor is sufficient for the components, but inner_parsed_
+ // requires special handling.
+ GURL_API Parsed();
+ GURL_API Parsed(const Parsed&);
+ GURL_API Parsed& operator=(const Parsed&);
+ GURL_API ~Parsed();
+
+ // Returns the length of the URL (the end of the last component).
+ //
+ // Note that for some invalid, non-canonical URLs, this may not be the length
+ // of the string. For example "http://": the parsed structure will only
+ // contain an entry for the four-character scheme, and it doesn't know about
+ // the "://". For all other last-components, it will return the real length.
+ GURL_API int Length() const;
+
+ // Returns the number of characters before the given component if it exists,
+ // or where the component would be if it did exist. This will return the
+ // string length if the component would be appended to the end.
+ //
+ // Note that this can get a little funny for the port, query, and ref
+ // components which have a delimiter that is not counted as part of the
+ // component. The |include_delimiter| flag controls if you want this counted
+ // as part of the component or not when the component exists.
+ //
+ // This example shows the difference between the two flags for two of these
+ // delimited components that are present (the port and query) and one that
+ // isn't (the reference). The components that this flag affects are marked
+ // with a *.
+ //                 0         1         2
+ //                 012345678901234567890
+ //  Example input: http://foo:80/?query
+ //              include_delim=true,  ...=false  ("<-" indicates different)
+ //      SCHEME: 0                    0
+ //    USERNAME: 5                    5
+ //    PASSWORD: 5                    5
+ //        HOST: 7                    7
+ //       *PORT: 10                   11 <-
+ //        PATH: 13                   13
+ //      *QUERY: 14                   15 <-
+ //        *REF: 20                   20
+ //
+ GURL_API int CountCharactersBefore(ComponentType type,
+ bool include_delimiter) const;
+
+ // Scheme without the colon: "http://foo/" would have a scheme of "http".
+ // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there
+ // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed
+ // to start at the beginning of the string if there are preceding whitespace
+ // or control characters.
+ Component scheme;
+
+ // Username. Specified in URLs with an @ sign before the host. See |password|.
+ Component username;
+
+ // Password. The length will be -1 if unspecified, 0 if specified but empty.
+ // Not all URLs with a username have a password, as in "http://me@host/".
+ // The password is separated from the username with a colon, as in
+ // "http://me:secret@host/"
+ Component password;
+
+ // Host name.
+ Component host;
+
+ // Port number.
+ Component port;
+
+ // Path, this is everything following the host name. Length will be -1 if
+ // unspecified. This includes the preceding slash, so the path on
+ // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to
+ // have a 0 length path, it will be -1 in cases like "http://host?foo".
+ // Note that we treat backslashes the same as slashes.
+ Component path;
+
+ // Stuff between the ? and the # after the path. This does not include the
+ // preceding ? character. Length will be -1 if unspecified, 0 if there is
+ // a question mark but no query string.
+ Component query;
+
+ // Indicated by a #, this is everything following the hash sign (not
+ // including it). If there are multiple hash signs, we'll use the last one.
+ // Length will be -1 if there is no hash sign, or 0 if there is one but
+ // nothing follows it.
+ Component ref;
+
+ // This is used for nested URL types, currently only filesystem. If you
+ // parse a filesystem URL, the resulting Parsed will have a nested
+ // inner_parsed_ to hold the parsed inner URL's component information.
+ // For all other url types [including the inner URL], it will be NULL.
+ Parsed* inner_parsed() const {
+ return inner_parsed_;
+ }
+
+ void set_inner_parsed(const Parsed& inner_parsed) {  // Stores a copy of |inner_parsed|.
+ if (!inner_parsed_)
+ inner_parsed_ = new Parsed(inner_parsed);  // Nothing stored yet: allocate the owned copy.
+ else
+ *inner_parsed_ = inner_parsed;  // Otherwise assign into the existing object.
+ }
+
+ void clear_inner_parsed() {  // Deletes the nested Parsed, if any, and nulls the pointer.
+ if (inner_parsed_) {
+ delete inner_parsed_;
+ inner_parsed_ = NULL;
+ }
+ }
+
+ private:
+ Parsed* inner_parsed_; // This object is owned and managed by this struct.
+};
+
+// Initialization functions ---------------------------------------------------
+//
+// These functions parse the given URL, filling in all of the structure's
+// components. These functions can not fail, they will always do their best
+// at interpreting the input given.
+//
+// The string length of the URL MUST be specified, we do not check for NULLs
+// at any point in the process, and will actually handle embedded NULLs.
+//
+// IMPORTANT: These functions do NOT hang on to the given pointer or copy it
+// in any way. See the comment above the struct.
+//
+// The 8-bit versions require UTF-8 encoding.
+
+// StandardURL is for when the scheme is known to be one that has an
+// authority (host) like "http". This function will not handle weird ones
+// like "about:" and "javascript:", or do the right thing for "file:" URLs.
+GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);
+
+// PathURL is for schemes that are known not to have an authority (host)
+// section and that aren't file URLs either. The scheme is parsed, and
+// everything after the scheme is considered as the path. This is used for
+// things like "about:" and "javascript:"
+GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);
+
+// FileURL is for file URLs. There are some special rules for interpreting
+// these.
+GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);
+
+// Filesystem URLs are structured differently than other URLs.
+GURL_API void ParseFileSystemURL(const char* url,
+ int url_len,
+ Parsed* parsed);
+GURL_API void ParseFileSystemURL(const char16* url,
+ int url_len,
+ Parsed* parsed);
+
+// MailtoURL is for mailto: urls. They are made up of a scheme, path, and query.
+GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
+GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);
+
+// Helper functions -----------------------------------------------------------
+
+// Locates the scheme according to the URL parser's rules. This function is
+// designed so the caller can find the scheme and call the correct Parse*
+// function according to their known scheme types.
+//
+// It also does not perform any validation on the scheme.
+//
+// This function will return true if the scheme is found and will put the
+// scheme's range into *scheme. False means no scheme could be found. Note
+// that a URL beginning with a colon has a scheme, but it is empty, so this
+// function will return true but *scheme will = (0,0).
+//
+// The scheme is found by skipping spaces and control characters at the
+// beginning, and taking everything from there to the first colon to be the
+// scheme. The character at scheme.end() will be the colon (we may enhance
+// this to handle full width colons or something, so don't count on the
+// actual character value). The character at scheme.end()+1 will be the
+// beginning of the rest of the URL, be it the authority or the path (or the
+// end of the string).
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);
+GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);
+
+// Returns true if ch is a character that terminates the authority segment
+// of a URL.
+GURL_API bool IsAuthorityTerminator(char16 ch);
+
+// Does a best effort parse of input |spec|, in range |auth|. If a particular
+// component is not found, it will be set to invalid.
+GURL_API void ParseAuthority(const char* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
+GURL_API void ParseAuthority(const char16* spec,
+ const Component& auth,
+ Component* username,
+ Component* password,
+ Component* hostname,
+ Component* port_num);
+
+// Computes the integer port value from the given port component. The port
+// component should have been identified by one of the parse functions that
+// fill in a |Parsed| for the given input url.
+//
+// The return value will be a non-negative integer between 0 and 64K, or one
+// of the two special values below.
+enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
+GURL_API int ParsePort(const char* url, const Component& port);
+GURL_API int ParsePort(const char16* url, const Component& port);
+
+// Extracts the range of the file name in the given url. The path must
+// already have been computed by the parse function, and the matching URL
+// and extracted path are provided to this function. The filename is
+// defined as being everything from the last slash/backslash of the path
+// to the end of the path.
+//
+// The file name will be empty if the path is empty or there is nothing
+// following the last slash.
+//
+// The 8-bit version requires UTF-8 encoding.
+GURL_API void ExtractFileName(const char* url,
+ const Component& path,
+ Component* file_name);
+GURL_API void ExtractFileName(const char16* url,
+ const Component& path,
+ Component* file_name);
+
+// Extract the first key/value from the range defined by |*query|. Updates
+// |*query| to start at the end of the extracted key/value pair. This is
+// designed for use in a loop: you can keep calling it with the same query
+// object and it will iterate over all items in the query.
+//
+// Some key/value pairs may have the key, the value, or both be empty (for
+// example, the query string "?&"). These will be returned. Note that an empty
+// last parameter "foo.com?" or "foo.com?a&" will not be returned, this case
+// is the same as "done."
+//
+// The initial query component should not include the '?' (this is the default
+// for parsed URLs).
+//
+// If no key/value are found |*key| and |*value| will be unchanged and it will
+// return false.
+GURL_API bool ExtractQueryKeyValue(const char* url,
+ Component* query,
+ Component* key,
+ Component* value);
+GURL_API bool ExtractQueryKeyValue(const char16* url,
+ Component* query,
+ Component* key,
+ Component* value);
+
+} // namespace url_parse
+
+#endif // GOOGLEURL_SRC_URL_PARSE_H__
diff --git a/url/url_parse_file.cc b/url/url_parse_file.cc
new file mode 100644
index 0000000..2e8429f
--- /dev/null
+++ b/url/url_parse_file.cc
@@ -0,0 +1,243 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/logging.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_parse.h"
+#include "googleurl/src/url_parse_internal.h"
+
+// Interesting IE file:isms...
+//
+// INPUT OUTPUT
+// ========================= ==============================
+// file:/foo/bar file:///foo/bar
+// The result here seems totally invalid!?!? This isn't UNC.
+//
+// file:/
+// file:// or any other number of slashes
+// IE6 doesn't do anything at all if you click on this link. No error:
+// nothing. IE6's history system seems to always color this link, so I'm
+// guessing that it maps internally to the empty URL.
+//
+// C:\ file:///C:/
+// When on a file: URL source page, this link will work. When over HTTP,
+// the file: URL will appear in the status bar but the link will not work
+// (security restriction for all file URLs).
+//
+// file:foo/ file:foo/ (invalid?!?!?)
+// file:/foo/ file:///foo/ (invalid?!?!?)
+// file://foo/ file://foo/ (UNC to server "foo")
+// file:///foo/ file:///foo/ (invalid, seems to be a file)
+// file:////foo/ file://foo/ (UNC to server "foo")
+// Any more than four slashes is also treated as UNC.
+//
+// file:C:/ file://C:/
+// file:/C:/ file://C:/
+// The number of slashes after "file:" doesn't matter if the thing following
+// it looks like an absolute drive path. Also, slashes and backslashes are
+// equally valid here.
+
+namespace url_parse {
+
+namespace {
+
+// A subcomponent of DoParseFileURL, the input of this function should be a UNC
+// path name, with the index of the first character after the slashes following
+// the scheme given in |after_slashes|. This will initialize the host, path,
+// query, and ref, and leave the other output components untouched
+// (DoParseFileURL handles these for us).
+template<typename CHAR>
+void DoParseUNC(const CHAR* spec,
+ int after_slashes,
+ int spec_len,
+ Parsed* parsed) {
+ int next_slash = FindNextSlash(spec, after_slashes, spec_len);
+ if (next_slash == spec_len) {
+ // No additional slash found, as in "file://foo", treat the text as the
+ // host with no path (this will end up being UNC to server "foo").
+ int host_len = spec_len - after_slashes;
+ if (host_len)  // An empty host is recorded as unspecified rather than 0-length.
+ parsed->host = Component(after_slashes, host_len);
+ else
+ parsed->host.reset();
+ parsed->path.reset();
+ return;  // Query and ref are not written on this path.
+ }
+
+#ifdef WIN32
+ // See if we have something that looks like a path following the first
+ // component. As in "file://localhost/c:/", we get "c:/" out. We want to
+ // treat this as having no host but the path given. Works on Windows only.
+ if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) {
+ parsed->host.reset();
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+ &parsed->path, &parsed->query, &parsed->ref);
+ return;
+ }
+#endif
+
+ // Otherwise, everything up until that first slash we found is the host name,
+ // which will end up being the UNC host. For example "file://foo/bar.txt"
+ // will get a server name of "foo" and a path of "/bar.txt". Later, on Windows,
+ // this should be treated as the filename "\\foo\bar.txt" in proper UNC
+ // notation.
+ int host_len = next_slash - after_slashes;
+ if (host_len)  // An empty host is recorded as unspecified rather than 0-length.
+ parsed->host = MakeRange(after_slashes, next_slash);
+ else
+ parsed->host.reset();
+ if (next_slash < spec_len) {  // NOTE(review): the == spec_len case returned above, so this looks always true unless FindNextSlash can exceed spec_len -- confirm.
+ ParsePathInternal(spec, MakeRange(next_slash, spec_len),
+ &parsed->path, &parsed->query, &parsed->ref);
+ } else {
+ parsed->path.reset();
+ }
+}
+
+// A subcomponent of DoParseFileURL, the input should be a local file, with the
+// beginning of the path indicated by the index in |path_begin|. This will
+// reset the host and initialize the path, query, and ref, and leave the other
+// output components untouched (DoParseFileURL handles these for us).
+template<typename CHAR>
+void DoParseLocalFile(const CHAR* spec,
+ int path_begin,
+ int spec_len,
+ Parsed* parsed) {
+ parsed->host.reset();  // Local files carry no host: mark it unspecified.
+ ParsePathInternal(spec, MakeRange(path_begin, spec_len),  // Everything from |path_begin| to the end.
+ &parsed->path, &parsed->query, &parsed->ref);
+}
+
+// Backend for the external functions that operates on either char type.
+// |spec| is the whole URL: the scheme (if any) is located here, and whatever
+// follows it need not start with a slash; we allow paths like "file:c:\foo".
+template<typename CHAR>
+void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) {
+ DCHECK(spec_len >= 0);
+
+ // Get the parts we never use for file URLs out of the way.
+ parsed->username.reset();
+ parsed->password.reset();
+ parsed->port.reset();
+
+ // Many of the code paths don't set these, so it's convenient to just clear
+ // them. We'll write them in those cases we need them.
+ parsed->query.reset();
+ parsed->ref.reset();
+
+ // Strip leading & trailing spaces and control characters.
+ int begin = 0;
+ TrimURL(spec, &begin, &spec_len);  // After this, |spec_len| is the end index of the trimmed text.
+
+ // Find the scheme.
+ int num_slashes;
+ int after_scheme;
+ int after_slashes;
+#ifdef WIN32
+ // See how many slashes there are. We want to handle cases like UNC but also
+ // "/c:/foo". This is when there is no scheme, so we can allow pages to do
+ // links like "c:/foo/bar" or "//foo/bar". This is also called by the
+ // relative URL resolver when it determines there is an absolute URL, which
+ // may give us input like "/c:/foo".
+ num_slashes = CountConsecutiveSlashes(spec, begin, spec_len);
+ after_slashes = begin + num_slashes;
+ if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) {
+ // Windows path, don't try to extract the scheme (for example, "c:\foo").
+ parsed->scheme.reset();
+ after_scheme = after_slashes;
+ } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) {
+ // Windows UNC path: don't try to extract the scheme, but keep the slashes.
+ parsed->scheme.reset();
+ after_scheme = begin;
+ } else
+#endif
+ {
+ if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) {
+ // Offset the results since we gave ExtractScheme a substring.
+ parsed->scheme.begin += begin;
+ after_scheme = parsed->scheme.end() + 1;  // Skip the colon that follows the scheme.
+ } else {
+ // No scheme found, remember that.
+ parsed->scheme.reset();
+ after_scheme = begin;
+ }
+ }
+
+ // Handle empty specs, ones that contain only whitespace or control chars,
+ // or ones that are just the scheme (for example "file:").
+ if (after_scheme == spec_len) {
+ parsed->host.reset();
+ parsed->path.reset();
+ return;
+ }
+
+ num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len);
+
+ after_slashes = after_scheme + num_slashes;
+#ifdef WIN32
+ // Check whether the input is a drive again. We checked above for windows
+ // drive specs, but that's only at the very beginning to see if we have a
+ // scheme at all. This test will be duplicated in that case, but will
+ // additionally handle all cases with a real scheme such as "file:///C:/".
+ if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) &&
+ num_slashes != 3) {
+ // Anything not beginning with a drive spec ("c:\") on Windows is treated
+ // as UNC, with the exception of three slashes which always means a file.
+ // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails.
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
+ return;
+ }
+#else
+ // file: URL with exactly 2 slashes is considered to have a host component.
+ if (num_slashes == 2) {
+ DoParseUNC(spec, after_slashes, spec_len, parsed);
+ return;
+ }
+#endif // WIN32
+
+ // Easy and common case, the full path immediately follows the scheme
+ // (modulo slashes), as in "file://c:/foo". Just treat everything from
+ // there to the end as the path. DoParseLocalFile resets the host (len -1).
+ // We include the last slash as part of the path if there is one.
+ DoParseLocalFile(spec,
+ num_slashes > 0 ? after_scheme + num_slashes - 1 : after_scheme,
+ spec_len, parsed);
+}
+
+} // namespace
+
+void ParseFileURL(const char* url, int url_len, Parsed* parsed) {  // 8-bit overload.
+ DoParseFileURL(url, url_len, parsed);  // Forwards unchanged to the shared template.
+}
+
+void ParseFileURL(const char16* url, int url_len, Parsed* parsed) {  // 16-bit overload.
+ DoParseFileURL(url, url_len, parsed);  // Forwards unchanged to the shared template.
+}
+
+} // namespace url_parse
diff --git a/url/url_parse_internal.h b/url/url_parse_internal.h
new file mode 100644
index 0000000..61bd068
--- /dev/null
+++ b/url/url_parse_internal.h
@@ -0,0 +1,112 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Contains common inline helper functions used by the URL parsing routines.
+
+#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
+#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
+
+#include "googleurl/src/url_parse.h"
+
+namespace url_parse {
+
+// We treat slashes and backslashes the same for IE compatibility.
+inline bool IsURLSlash(char16 ch) {
+ return ch == '/' || ch == '\\';
+}
+
+// Returns true if we should trim this character from the URL because it is a
+// space or a control character (that is, any value at or below ' ').
+inline bool ShouldTrimFromURL(char16 ch) {
+ return ch <= ' ';
+}
+
+// Given an already-initialized begin index and length, this shrinks the range
+// to eliminate "should-be-trimmed" characters. Note that the length does *not*
+// indicate the length of untrimmed data from |*begin|, but rather the position
+// in the input string (so the string starts at character |*begin| in the spec,
+// and goes until |*len|).
+template<typename CHAR>
+inline void TrimURL(const CHAR* spec, int* begin, int* len) {
+ // Strip leading whitespace and control characters.
+ while (*begin < *len && ShouldTrimFromURL(spec[*begin]))
+ (*begin)++;
+
+ // Strip trailing whitespace and control characters. We need the > *begin test
+ // for when the input string is all blanks; we don't want to back past the input.
+ while (*len > *begin && ShouldTrimFromURL(spec[*len - 1]))
+ (*len)--;
+}
+
+// Counts the number of consecutive slashes (as judged by IsURLSlash) starting
+// at |begin_offset| in |str|, without reading at or beyond index |str_len|.
+template<typename CHAR>
+inline int CountConsecutiveSlashes(const CHAR *str,
+ int begin_offset, int str_len) {
+ int count = 0;
+ while (begin_offset + count < str_len &&  // Bounds check comes first so str[] is never read past the end.
+ IsURLSlash(str[begin_offset + count]))
+ ++count;
+ return count;
+}
+
+// Internal functions in url_parse.cc that parse the path, that is, everything
+// following the authority section. The input is the range of everything
+// following the authority section, and the output is the identified ranges.
+//
+// This is designed for the file URL parser or other consumers who may do
+// special stuff at the beginning, but want regular path parsing, it just
+// maps to the internal parsing function for paths.
+void ParsePathInternal(const char* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref);
+void ParsePathInternal(const char16* spec,
+ const Component& path,
+ Component* filepath,
+ Component* query,
+ Component* ref);
+
+
+// Given a spec and the index of the character after the colon following the
+// scheme, this parses it and fills in the structure. Every item in the parsed
+// structure is filled EXCEPT for the scheme, which is untouched.
+void ParseAfterScheme(const char* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed);
+void ParseAfterScheme(const char16* spec,
+ int spec_len,
+ int after_scheme,
+ Parsed* parsed);
+
+} // namespace url_parse
+
+#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__
diff --git a/url/url_parse_unittest.cc b/url/url_parse_unittest.cc
new file mode 100644
index 0000000..cc3eb1b
--- /dev/null
+++ b/url/url_parse_unittest.cc
@@ -0,0 +1,649 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "base/basictypes.h"
+#include "googleurl/src/url_parse.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+// Some implementations of base/basictypes.h may define ARRAYSIZE.
+// When it is not defined, alias it to the ARRAYSIZE_UNSAFE macro,
+// which our version of basictypes.h provides.
+#ifndef ARRAYSIZE
+#define ARRAYSIZE ARRAYSIZE_UNSAFE
+#endif
+
+// Interesting IE file:isms...
+//
+// file:/foo/bar file:///foo/bar
+// The result here seems totally invalid!?!? This isn't UNC.
+//
+// file:/
+// file:// or any other number of slashes
+// IE6 doesn't do anything at all if you click on this link. No error:
+// nothing. IE6's history system seems to always color this link, so I'm
+// guessing that it maps internally to the empty URL.
+//
+// C:\ file:///C:/
+// / file:///C:/
+// /foo file:///C:/foo
+// Interestingly, IE treats "/" as an alias for "c:\", which makes sense,
+// but is weird to think about on Windows.
+//
+// file:foo/ file:foo/ (invalid?!?!?)
+// file:/foo/ file:///foo/ (invalid?!?!?)
+// file://foo/ file://foo/ (UNC to server "foo")
+// file:///foo/ file:///foo/ (invalid)
+// file:////foo/ file://foo/ (UNC to server "foo")
+// Any more than four slashes is also treated as UNC.
+//
+// file:C:/ file://C:/
+// file:/C:/ file://C:/
+// The number of slashes after "file:" doesn't matter if the thing following
+// it looks like an absolute drive path. Also, slashes and backslashes are
+// equally valid here.
+
+namespace {
+
+// Used for regular URL parse cases. A NULL string means "component absent".
+struct URLParseCase {
+ const char* input;  // URL text handed to the parser.
+
+ const char* scheme;
+ const char* username;
+ const char* password;
+ const char* host;
+ int port;  // Compared against the value returned by url_parse::ParsePort().
+ const char* path;
+ const char* query;
+ const char* ref;
+};
+
+// Simpler version of URLParseCase for testing path URLs (scheme + path only).
+struct PathURLParseCase {
+ const char* input;  // URL text handed to the parser.
+
+ const char* scheme;  // NULL when no scheme component is expected.
+ const char* path;  // NULL when no path component is expected.
+};
+
+// Simpler version of URLParseCase for testing mailto URLs.
+struct MailtoURLParseCase {
+ const char* input;  // URL text handed to the parser.
+
+ const char* scheme;
+ const char* path;  // NULL when no path component is expected.
+ const char* query;  // NULL when no query component is expected.
+};
+
+// More complicated version of URLParseCase for testing filesystem URLs.
+struct FileSystemURLParseCase {
+ const char* input;  // URL text handed to the parser.
+
+ const char* inner_scheme;  // inner_* are checked against Parsed::inner_parsed().
+ const char* inner_username;
+ const char* inner_password;
+ const char* inner_host;
+ int inner_port;  // Compared against url_parse::ParsePort() on the inner port.
+ const char* inner_path;  // NOTE(review): the FileSystemURL test never compares this field.
+ const char* path;  // path/query/ref are checked against the outer Parsed.
+ const char* query;
+ const char* ref;
+};
+
+bool ComponentMatches(const char* input,
+                      const char* reference,
+                      const url_parse::Component& component) {
+  // Returns true when |component| of |input| spells exactly |reference|; a
+  // NULL |reference| instead requires the component to be absent (len == -1).
+  // Two sanity checks are recorded as test failures first: the length must be
+  // non-negative or exactly -1, and the start offset must not be negative.
+  EXPECT_TRUE(component.len >= 0 || component.len == -1);
+  EXPECT_LE(0, component.begin);
+
+  if (reference == NULL)
+    return component.len == -1;
+
+  // A non-NULL reference needs a present component of identical length...
+  const bool same_length = component.len >= 0 &&
+      strlen(reference) == static_cast<size_t>(component.len);
+
+  // ...that also holds identical characters.
+  return same_length &&
+      strncmp(reference, &input[component.begin], component.len) == 0;
+}
+
+void ExpectInvalidComponent(const url_parse::Component& component) {
+ EXPECT_EQ(0, component.begin);  // An unused component must start at offset 0...
+ EXPECT_EQ(-1, component.len);  // ...and carry the "nonexistent" length of -1.
+}
+
+} // namespace
+
+// Parsed ----------------------------------------------------------------------
+
+TEST(URLParser, Length) {
+  // Parsed::Length() is expected to equal the length of the spec just parsed.
+  const char* const specs[] = {
+    // One with everything in it.
+    "http://user:pass@host:99/foo?bar#baz",
+    // One with nothing in it.
+    "",
+    // Working backwards, let's start taking off stuff from the full one.
+    "http://user:pass@host:99/foo?bar#",
+    "http://user:pass@host:99/foo?bar",
+    "http://user:pass@host:99/foo?",
+    "http://user:pass@host:99/foo",
+    "http://user:pass@host:99/",
+    "http://user:pass@host:99",
+    "http://user:pass@host:",
+    "http://user:pass@host",
+    "http://host",
+    "http://user@",
+    "http:",
+  };
+  for (size_t idx = 0; idx < arraysize(specs); ++idx) {
+    const char* spec = specs[idx];
+    const int spec_len = static_cast<int>(strlen(spec));
+    url_parse::Parsed parsed;
+    url_parse::ParseStandardURL(spec, spec_len, &parsed);
+    EXPECT_EQ(spec_len, parsed.Length());
+  }
+}
+
+TEST(URLParser, CountCharactersBefore) {
+  using namespace url_parse;
+  struct CountCase {
+    const char* url;
+    Parsed::ComponentType component;
+    bool include_delimiter;
+    int expected_count;
+  } count_cases[] = {
+    // Test each possibility in the case where all components are present.
+//    0         1         2
+//    0123456789012345678901
+    {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0},
+    {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0},
+    {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7},
+    {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7},
+    {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9},
+    {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9},
+    {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11},
+    {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11},
+    {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12},
+    {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13},
+    {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14},
+    {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14},
+    {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16},
+    {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17},
+    {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18},
+    {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19},
+    // Now test when the requested component is missing.
+    {"http://u:p@h:8/p?", Parsed::REF, true, 17},
+    {"http://u:p@h:8/p?q", Parsed::REF, true, 18},
+    {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16},
+    {"http://u:p@h:8#r", Parsed::PATH, true, 14},
+    {"http://u:p@h/", Parsed::PORT, true, 12},
+    {"http://u:p@/", Parsed::HOST, true, 11},
+    // This case is a little weird. It will report that the password would
+    // start where the host begins. This is arguably correct, although you
+    // could also argue that it should start at the '@' sign. Doing it
+    // starting with the '@' sign is actually harder, so we don't bother.
+    {"http://u@h/", Parsed::PASSWORD, true, 9},
+    {"http://h/", Parsed::USERNAME, true, 7},
+    {"http:", Parsed::USERNAME, true, 5},
+    {"", Parsed::SCHEME, true, 0},
+    // Make sure a random component still works when there's nothing there.
+    {"", Parsed::REF, true, 0},
+    // File URLs are special with no host, so we test those.
+    {"file:///c:/foo", Parsed::USERNAME, true, 7},
+    {"file:///c:/foo", Parsed::PASSWORD, true, 7},
+    {"file:///c:/foo", Parsed::HOST, true, 7},
+    {"file:///c:/foo", Parsed::PATH, true, 7},
+  };
+  for (size_t i = 0; i < ARRAYSIZE(count_cases); i++) {
+    const CountCase& test_case = count_cases[i];
+    const int url_len = static_cast<int>(strlen(test_case.url));
+
+    // File URLs go through the file parser; a leading 'f' identifies them here.
+    Parsed parsed;
+    if (url_len > 0 && test_case.url[0] == 'f')
+      ParseFileURL(test_case.url, url_len, &parsed);
+    else
+      ParseStandardURL(test_case.url, url_len, &parsed);
+    const int chars_before = parsed.CountCharactersBefore(
+        test_case.component, test_case.include_delimiter);
+    EXPECT_EQ(test_case.expected_count, chars_before);
+  }
+}
+
+// Standard --------------------------------------------------------------------
+
+// Input Scheme Usrname Passwd Host Port Path Query Ref
+// ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ -----
+static URLParseCase cases[] = {
+ // Regular URL with all the parts
+{"http://user:pass@foo:21/bar;par?b#c", "http", "user", "pass", "foo", 21, "/bar;par","b", "c"},
+
+ // Known schemes should lean towards authority identification
+{"http:foo.com", "http", NULL, NULL, "foo.com", -1, NULL, NULL, NULL},
+
+ // Spaces!
+{"\t :foo.com \n", "", NULL, NULL, "foo.com", -1, NULL, NULL, NULL},
+{" foo.com ", NULL, NULL, NULL, "foo.com", -1, NULL, NULL, NULL},
+{"a:\t foo.com", "a", NULL, NULL, "\t foo.com", -1, NULL, NULL, NULL},
+{"http://f:21/ b ? d # e ", "http", NULL, NULL, "f", 21, "/ b ", " d ", " e"},
+
+ // Invalid port numbers should be identified and turned into -2; empty port
+ // numbers should be -1. Spaces aren't allowed in port numbers.
+{"http://f:/c", "http", NULL, NULL, "f", -1, "/c", NULL, NULL},
+{"http://f:0/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL},
+{"http://f:00000000000000/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL},
+{"http://f:00000000000000000000080/c", "http", NULL, NULL, "f", 80, "/c", NULL, NULL},
+{"http://f:b/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f: /c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f:\n/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f:fifty-two/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f:999999/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL},
+{"http://f: 21 / b ? d # e ", "http", NULL, NULL, "f", -2, "/ b ", " d ", " e"},
+
+ // Creative URLs missing key elements
+{"", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{" \t", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":foo.com/", "", NULL, NULL, "foo.com", -1, "/", NULL, NULL},
+{":foo.com\\", "", NULL, NULL, "foo.com", -1, "\\", NULL, NULL},
+{":", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":a", "", NULL, NULL, "a", -1, NULL, NULL, NULL},
+{":/", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":\\", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":#", "", NULL, NULL, NULL, -1, NULL, NULL, ""},
+{"#", NULL, NULL, NULL, NULL, -1, NULL, NULL, ""},
+{"#/", NULL, NULL, NULL, NULL, -1, NULL, NULL, "/"},
+{"#\\", NULL, NULL, NULL, NULL, -1, NULL, NULL, "\\"},
+{"#;?", NULL, NULL, NULL, NULL, -1, NULL, NULL, ";?"},
+{"?", NULL, NULL, NULL, NULL, -1, NULL, "", NULL},
+{"/", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{":23", "", NULL, NULL, "23", -1, NULL, NULL, NULL},
+{"/:23", "/", NULL, NULL, "23", -1, NULL, NULL, NULL},
+{"//", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"::", "", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"::23", "", NULL, NULL, NULL, 23, NULL, NULL, NULL},
+{"foo://", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+
+ // Username/passwords and things that look like them
+{"http://a:b@c:29/d", "http", "a", "b", "c", 29, "/d", NULL, NULL},
+{"http::@c:29", "http", "", "", "c", 29, NULL, NULL, NULL},
+ // ... "]" in the password field isn't allowed, but we tolerate it here...
+{"http://&a:foo(b]c@d:2/", "http", "&a", "foo(b]c", "d", 2, "/", NULL, NULL},
+{"http://::@c@d:2", "http", "", ":@c", "d", 2, NULL, NULL, NULL},
+{"http://foo.com:b@d/", "http", "foo.com", "b", "d", -1, "/", NULL, NULL},
+
+{"http://foo.com/\\@", "http", NULL, NULL, "foo.com", -1, "/\\@", NULL, NULL},
+{"http:\\\\foo.com\\", "http", NULL, NULL, "foo.com", -1, "\\", NULL, NULL},
+{"http:\\\\a\\b:c\\d@foo.com\\", "http", NULL, NULL, "a", -1, "\\b:c\\d@foo.com\\", NULL, NULL},
+
+ // Tolerate different numbers of slashes.
+{"foo:/", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"foo:/bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL},
+{"foo://///////", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"foo://///////bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL},
+{"foo:////://///", "foo", NULL, NULL, NULL, -1, "/////", NULL, NULL},
+
+ // Raw file paths on Windows aren't handled by the parser.
+{"c:/foo", "c", NULL, NULL, "foo", -1, NULL, NULL, NULL},
+{"//foo/bar", NULL, NULL, NULL, "foo", -1, "/bar", NULL, NULL},
+
+ // The first '?' begins the query and the first '#' begins the ref.
+{"http://foo/path;a??e#f#g", "http", NULL, NULL, "foo", -1, "/path;a", "?e", "f#g"},
+{"http://foo/abcd?efgh?ijkl", "http", NULL, NULL, "foo", -1, "/abcd", "efgh?ijkl", NULL},
+{"http://foo/abcd#foo?bar", "http", NULL, NULL, "foo", -1, "/abcd", NULL, "foo?bar"},
+
+ // IPv6, check also interesting uses of colons.
+{"[61:24:74]:98", "[61", NULL, NULL, "24:74]", 98, NULL, NULL, NULL},
+{"http://[61:27]:98", "http", NULL, NULL, "[61:27]", 98, NULL, NULL, NULL},
+{"http:[61:27]/:foo", "http", NULL, NULL, "[61:27]", -1, "/:foo", NULL, NULL},
+{"http://[1::2]:3:4", "http", NULL, NULL, "[1::2]:3", 4, NULL, NULL, NULL},
+
+ // Partially-complete IPv6 literals, and related cases.
+{"http://2001::1", "http", NULL, NULL, "2001:", 1, NULL, NULL, NULL},
+{"http://[2001::1", "http", NULL, NULL, "[2001::1", -1, NULL, NULL, NULL},
+{"http://2001::1]", "http", NULL, NULL, "2001::1]", -1, NULL, NULL, NULL},
+{"http://2001::1]:80", "http", NULL, NULL, "2001::1]", 80, NULL, NULL, NULL},
+{"http://[2001::1]", "http", NULL, NULL, "[2001::1]", -1, NULL, NULL, NULL},
+{"http://[2001::1]:80", "http", NULL, NULL, "[2001::1]", 80, NULL, NULL, NULL},
+{"http://[[::]]", "http", NULL, NULL, "[[::]]", -1, NULL, NULL, NULL},
+
+};
+
+TEST(URLParser, Standard) {
+  // One Parsed is reused for every case on purpose: state that a parse fails
+  // to reset (and that a fresh constructor would mask) then becomes visible.
+  url_parse::Parsed parsed;
+  for (size_t i = 0; i < arraysize(cases); i++) {
+    const URLParseCase& expected = cases[i];
+    const char* url = expected.input;
+    url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
+    const int port = url_parse::ParsePort(url, parsed.port);
+    EXPECT_TRUE(ComponentMatches(url, expected.scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, expected.username, parsed.username));
+    EXPECT_TRUE(ComponentMatches(url, expected.password, parsed.password));
+    EXPECT_TRUE(ComponentMatches(url, expected.host, parsed.host));
+    EXPECT_EQ(expected.port, port);
+    EXPECT_TRUE(ComponentMatches(url, expected.path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, expected.query, parsed.query));
+    EXPECT_TRUE(ComponentMatches(url, expected.ref, parsed.ref));
+  }
+}
+
+// PathURL --------------------------------------------------------------------
+
+// Various incarnations of path URLs. Columns: input, scheme, path (NULL = absent).
+static PathURLParseCase path_cases[] = {
+{"", NULL, NULL},
+{":", "", NULL},
+{":/", "", "/"},
+{"/", NULL, "/"},
+{" This is \\interesting// \t", NULL, "This is \\interesting//"},
+{"about:", "about", NULL},
+{"about:blank", "about", "blank"},
+{" about: blank ", "about", " blank"},
+{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\");"},
+};
+
+TEST(URLParser, PathURL) {
+  // One Parsed is reused for every case on purpose: state that a parse fails
+  // to reset (and that a fresh constructor would mask) then becomes visible.
+  url_parse::Parsed parsed;
+  for (size_t i = 0; i < arraysize(path_cases); i++) {
+    const PathURLParseCase& expected = path_cases[i];
+    const char* url = expected.input;
+    url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed);
+
+    EXPECT_TRUE(ComponentMatches(url, expected.scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, expected.path, parsed.path));
+    // The remaining components are never used for path urls.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.host);
+    ExpectInvalidComponent(parsed.port);
+    ExpectInvalidComponent(parsed.query);
+    ExpectInvalidComponent(parsed.ref);
+  }
+}
+
+#ifdef WIN32
+
+// WindowsFile ----------------------------------------------------------------
+
+// Various incarnations of file URLs. These are for Windows only.
+static URLParseCase file_cases[] = {
+{"file:server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL},
+{" file: server \t", "file", NULL, NULL, " server",-1, NULL, NULL, NULL},
+{"FiLe:c|", "FiLe", NULL, NULL, NULL, -1, "c|", NULL, NULL},
+{"FILE:/\\\\/server/file", "FILE", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL},
+{"file://localhost/c:/", "file", NULL, NULL, NULL, -1, "/c:/", NULL, NULL},
+{"file://127.0.0.1/c|\\", "file", NULL, NULL, NULL, -1, "/c|\\", NULL, NULL},
+{"file:/", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+{"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL},
+ // If there is a Windows drive letter, treat any number of slashes as the
+ // path part.
+{"file:c:\\fo\\b", "file", NULL, NULL, NULL, -1, "c:\\fo\\b", NULL, NULL},
+{"file:/c:\\foo/bar", "file", NULL, NULL, NULL, -1, "/c:\\foo/bar",NULL, NULL},
+{"file://c:/f\\b", "file", NULL, NULL, NULL, -1, "/c:/f\\b", NULL, NULL},
+{"file:///C:/foo", "file", NULL, NULL, NULL, -1, "/C:/foo", NULL, NULL},
+{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL, -1, "/c:\\f\\b", NULL, NULL},
+ // If there is not a drive letter, we should treat it as UNC EXCEPT for
+ // three slashes, which we treat as a Unix style path.
+{"file:server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file:/server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file://server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+{"file:///server/file", "file", NULL, NULL, NULL, -1, "/server/file",NULL, NULL},
+{"file://\\server/file", "file", NULL, NULL, NULL, -1, "\\server/file",NULL, NULL},
+{"file:////server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL},
+ // Queries and refs are valid for file URLs as well.
+{"file:///C:/foo.html?#", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "", ""},
+{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"},
+};
+
+TEST(URLParser, WindowsFile) {
+  // Declared outside for loop to try to catch cases in init() where we forget
+  // to reset something that is reset by the constructor.
+  url_parse::Parsed parsed;
+  // size_t index, like the other table-driven tests here, so the comparison
+  // with arraysize() is not a signed/unsigned mix.
+  for (size_t i = 0; i < arraysize(file_cases); i++) {
+    const char* url = file_cases[i].input;
+    url_parse::ParseFileURL(url, static_cast<int>(strlen(url)), &parsed);
+    int port = url_parse::ParsePort(url, parsed.port);
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].username, parsed.username));
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].password, parsed.password));
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].host, parsed.host));
+    EXPECT_EQ(file_cases[i].port, port);
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].query, parsed.query));
+    EXPECT_TRUE(ComponentMatches(url, file_cases[i].ref, parsed.ref));
+  }
+}
+
+#endif // WIN32
+
+TEST(URLParser, ExtractFileName) {
+  // |expected| is the file-name text; NULL means no file-name component.
+  struct FileCase {
+    const char* input;
+    const char* expected;
+  } name_cases[] = {
+    {"http://www.google.com", NULL},
+    {"http://www.google.com/", ""},
+    {"http://www.google.com/search", "search"},
+    {"http://www.google.com/search/", ""},
+    {"http://www.google.com/foo/bar.html?baz=22", "bar.html"},
+    {"http://www.google.com/foo/bar.html#ref", "bar.html"},
+    {"http://www.google.com/search/;param", ""},
+    {"http://www.google.com/foo/bar.html;param#ref", "bar.html"},
+    {"http://www.google.com/foo/bar.html;foo;param#ref", "bar.html;foo"},
+    {"http://www.google.com/foo/bar.html?query#ref", "bar.html"},
+  };
+  for (size_t i = 0; i < ARRAYSIZE(name_cases); i++) {
+    const FileCase& test_case = name_cases[i];
+    const char* url = test_case.input;
+
+    url_parse::Parsed parsed;
+    url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
+
+    // The file name is looked up inside the already-parsed path component.
+    url_parse::Component file_name;
+    url_parse::ExtractFileName(url, parsed.path, &file_name);
+    EXPECT_TRUE(ComponentMatches(url, test_case.expected, file_name));
+  }
+}
+
+// Returns true if the parameter with index |parameter| (1-based) in the given
+// URL's query string has exactly the given key and value. A NULL
+// |expected_key| means no parameter with that index should exist.
+static bool NthParameterIs(const char* url,
+                           int parameter,
+                           const char* expected_key,
+                           const char* expected_value) {
+  url_parse::Parsed parsed;
+  url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed);
+
+  // |query| is handed over by pointer so each extraction can move past a pair.
+  url_parse::Component query = parsed.query;
+  for (int i = 1; i <= parameter; i++) {
+    url_parse::Component key, value;
+    if (!url_parse::ExtractQueryKeyValue(url, &query, &key, &value))
+      return expected_key == NULL;  // Ran out of parameters before |parameter|.
+    if (i < parameter)
+      continue;  // Not the requested parameter yet.
+    if (!expected_key || !expected_value)
+      return false;  // Found a parameter where none was expected.
+
+    // The lengths must agree as well: strncmp() bounded by the parsed length
+    // alone accepts any expectation that merely starts with the parsed text
+    // (an empty parsed key would "match" every expected key).
+    if (key.len < 0 || strlen(expected_key) != static_cast<size_t>(key.len) ||
+        value.len < 0 ||
+        strlen(expected_value) != static_cast<size_t>(value.len))
+      return false;
+    return strncmp(&url[key.begin], expected_key, key.len) == 0 &&
+           strncmp(&url[value.begin], expected_value, value.len) == 0;
+  }
+  return expected_key == NULL;  // |parameter| < 1: nothing was examined.
+}
+
+TEST(URLParser, ExtractQueryKeyValue) {
+  EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, NULL, NULL));
+
+  // Basic case.
+  const char basic[] = "http://www.google.com?arg1=1&arg2=2&bar";
+  EXPECT_TRUE(NthParameterIs(basic, 1, "arg1", "1"));
+  EXPECT_TRUE(NthParameterIs(basic, 2, "arg2", "2"));
+  EXPECT_TRUE(NthParameterIs(basic, 3, "bar", ""));
+  EXPECT_TRUE(NthParameterIs(basic, 4, NULL, NULL));
+
+  // Empty param at the end.
+  const char trailing_empty[] = "http://www.google.com?foo=bar&";
+  EXPECT_TRUE(NthParameterIs(trailing_empty, 1, "foo", "bar"));
+  EXPECT_TRUE(NthParameterIs(trailing_empty, 2, NULL, NULL));
+
+  // Empty param at the beginning.
+  const char leading_empty[] = "http://www.google.com?&foo=bar";
+  EXPECT_TRUE(NthParameterIs(leading_empty, 1, "", ""));
+  EXPECT_TRUE(NthParameterIs(leading_empty, 2, "foo", "bar"));
+  EXPECT_TRUE(NthParameterIs(leading_empty, 3, NULL, NULL));
+
+  // Empty key with value.
+  const char empty_key[] = "http://www.google.com?=foo";
+  EXPECT_TRUE(NthParameterIs(empty_key, 1, "", "foo"));
+  EXPECT_TRUE(NthParameterIs(empty_key, 2, NULL, NULL));
+
+  // Empty value with key.
+  const char empty_value[] = "http://www.google.com?foo=";
+  EXPECT_TRUE(NthParameterIs(empty_value, 1, "foo", ""));
+  EXPECT_TRUE(NthParameterIs(empty_value, 2, NULL, NULL));
+
+  // Empty key and values.
+  const char all_empty[] = "http://www.google.com?&&==&=";
+  EXPECT_TRUE(NthParameterIs(all_empty, 1, "", ""));
+  EXPECT_TRUE(NthParameterIs(all_empty, 2, "", ""));
+  EXPECT_TRUE(NthParameterIs(all_empty, 3, "", "="));
+  EXPECT_TRUE(NthParameterIs(all_empty, 4, "", ""));
+  EXPECT_TRUE(NthParameterIs(all_empty, 5, NULL, NULL));
+}
+
+// MailtoURL --------------------------------------------------------------------
+
+static MailtoURLParseCase mailto_cases[] = {
+//|input |scheme |path |query   (NULL = component absent)
+{"mailto:foo@gmail.com", "mailto", "foo@gmail.com", NULL},
+{" mailto: to \t", "mailto", " to", NULL},
+{"mailto:addr1%2C%20addr2 ", "mailto", "addr1%2C%20addr2", NULL},
+{"Mailto:addr1, addr2 ", "Mailto", "addr1, addr2", NULL},
+{"mailto:addr1:addr2 ", "mailto", "addr1:addr2", NULL},
+{"mailto:?to=addr1,addr2", "mailto", NULL, "to=addr1,addr2"},
+{"mailto:?to=addr1%2C%20addr2", "mailto", NULL, "to=addr1%2C%20addr2"},
+{"mailto:addr1?to=addr2", "mailto", "addr1", "to=addr2"},
+{"mailto:?body=#foobar#", "mailto", NULL, "body=#foobar#",},  // '#' stays in the query.
+{"mailto:#?body=#foobar#", "mailto", "#", "body=#foobar#"},  // '#' stays in the path.
+};
+
+TEST(URLParser, MailtoUrl) {
+  // One Parsed is reused for every case on purpose: state that a parse fails
+  // to reset (and that a fresh constructor would mask) then becomes visible.
+  url_parse::Parsed parsed;
+  for (size_t i = 0; i < arraysize(mailto_cases); ++i) {
+    const MailtoURLParseCase& expected = mailto_cases[i];
+    const char* url = expected.input;
+    url_parse::ParseMailtoURL(url, static_cast<int>(strlen(url)), &parsed);
+    const int port = url_parse::ParsePort(url, parsed.port);
+    EXPECT_TRUE(ComponentMatches(url, expected.scheme, parsed.scheme));
+    EXPECT_TRUE(ComponentMatches(url, expected.path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, expected.query, parsed.query));
+    EXPECT_EQ(url_parse::PORT_UNSPECIFIED, port);
+
+    // The remaining components are never used for mailto urls.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.port);
+    ExpectInvalidComponent(parsed.ref);
+  }
+}
+
+// Various incarnations of filesystem URLs (inner-URL columns first, then outer).
+static FileSystemURLParseCase filesystem_cases[] = {
+ // Regular URL with all the parts
+{"filesystem:http://user:pass@foo:21/temporary/bar;par?b#c", "http", "user", "pass", "foo", 21, "/temporary", "/bar;par", "b", "c"},
+{"filesystem:https://foo/persistent/bar;par/", "https", NULL, NULL, "foo", -1, "/persistent", "/bar;par/", NULL, NULL},
+{"filesystem:file:///persistent/bar;par/", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", NULL, NULL},
+{"filesystem:file:///persistent/bar;par/?query#ref", "file", NULL, NULL, NULL, -1, "/persistent", "/bar;par/", "query", "ref"},
+{"filesystem:file:///persistent", "file", NULL, NULL, NULL, -1, "/persistent", "", NULL, NULL},
+};
+
+TEST(URLParser, FileSystemURL) {
+  // One Parsed is reused for every case on purpose: state that a parse fails
+  // to reset (and that a fresh constructor would mask) then becomes visible.
+  url_parse::Parsed parsed;
+  for (size_t i = 0; i < arraysize(filesystem_cases); i++) {
+    const FileSystemURLParseCase& expected = filesystem_cases[i];
+    const char* url = expected.input;
+    url_parse::ParseFileSystemURL(url, static_cast<int>(strlen(url)), &parsed);
+
+    EXPECT_TRUE(ComponentMatches(url, "filesystem", parsed.scheme));
+    // An inner URL must be present exactly when the case names an inner scheme.
+    EXPECT_EQ(expected.inner_scheme == NULL, parsed.inner_parsed() == NULL);
+    if (parsed.inner_parsed() != NULL) {
+      EXPECT_TRUE(ComponentMatches(url, expected.inner_scheme,
+                                   parsed.inner_parsed()->scheme));
+      EXPECT_TRUE(ComponentMatches(url, expected.inner_username,
+                                   parsed.inner_parsed()->username));
+      EXPECT_TRUE(ComponentMatches(url, expected.inner_password,
+                                   parsed.inner_parsed()->password));
+      EXPECT_TRUE(ComponentMatches(url, expected.inner_host,
+                                   parsed.inner_parsed()->host));
+      EXPECT_EQ(expected.inner_port,
+                url_parse::ParsePort(url, parsed.inner_parsed()->port));
+
+      // The remaining components are never used for filesystem urls.
+      ExpectInvalidComponent(parsed.inner_parsed()->query);
+      ExpectInvalidComponent(parsed.inner_parsed()->ref);
+    }
+
+    EXPECT_TRUE(ComponentMatches(url, expected.path, parsed.path));
+    EXPECT_TRUE(ComponentMatches(url, expected.query, parsed.query));
+    EXPECT_TRUE(ComponentMatches(url, expected.ref, parsed.ref));
+
+    // The remaining components are never used for filesystem urls.
+    ExpectInvalidComponent(parsed.username);
+    ExpectInvalidComponent(parsed.password);
+    ExpectInvalidComponent(parsed.host);
+    ExpectInvalidComponent(parsed.port);
+  }
+}
+
diff --git a/url/url_test_utils.h b/url/url_test_utils.h
new file mode 100644
index 0000000..6278e3f
--- /dev/null
+++ b/url/url_test_utils.h
@@ -0,0 +1,78 @@
+// Copyright 2007 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Convenience functions for string conversions.
+// These are mostly intended for use in unit tests.
+
+#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__
+#define GOOGLEURL_SRC_URL_TEST_UTILS_H__
+
+#include <string>
+
+#include "base/string16.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace url_test_utils {
+
+// Converts a UTF-16 string from native wchar_t format to char16 by casting
+// each element to char16, which drops whatever a wider wchar_t holds beyond
+// that. This is not meant to handle true UTF-32 encoded strings.
+inline string16 WStringToUTF16(const wchar_t* src) {
+  string16 str;
+  // Walk to the terminating NUL directly: no <wchar.h> routine is needed and
+  // the length is never squeezed through an int.
+  for (const wchar_t* cur = src; *cur != L'\0'; ++cur)
+    str.push_back(static_cast<char16>(*cur));
+  return str;
+}
+
+// UTF-8 to UTF-16; EXPECTs flag inputs of 1024+ bytes or a failed conversion.
+inline string16 ConvertUTF8ToUTF16(const std::string& src) {
+  const int src_len = static_cast<int>(src.length());
+  EXPECT_LT(src_len, 1024);
+  url_canon::RawCanonOutputW<1024> utf16;
+  EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), src_len, &utf16));
+  return string16(utf16.data(), utf16.length());
+}
+
+// Converts a string from UTF-16 to UTF-8, recording a test failure when the
+// conversion routine returns false.
+inline std::string ConvertUTF16ToUTF8(const string16& src) {
+  std::string utf8;
+  url_canon::StdStringCanonOutput sink(&utf8);
+  const int src_len = static_cast<int>(src.length());
+  EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(), src_len, &sink));
+  sink.Complete();
+  return utf8;
+}
+
+} // namespace url_test_utils
+
+#endif // GOOGLEURL_SRC_URL_TEST_UTILS_H__
diff --git a/url/url_util.cc b/url/url_util.cc
new file mode 100644
index 0000000..9d621bc2
--- /dev/null
+++ b/url/url_util.cc
@@ -0,0 +1,618 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <string.h>
+#include <vector>
+
+#include "googleurl/src/url_util.h"
+
+#include "base/logging.h"
+#include "googleurl/src/url_canon_internal.h"
+#include "googleurl/src/url_file.h"
+#include "googleurl/src/url_util_internal.h"
+
+namespace url_util {
+
+const char kFileScheme[] = "file";  // Compared against to detect file URLs.
+const char kFileSystemScheme[] = "filesystem";  // Compared against to detect filesystem URLs.
+const char kMailtoScheme[] = "mailto";  // Compared against to detect mailto URLs.
+
+namespace {
+
+// Lower-cases |c| when it is an ASCII upper-case letter ('A'..'Z') and returns
+// every other value unchanged. Written by hand because the standard library's
+// tolower consults the current locale, which is not wanted here.
+template <class Char> inline Char ToLowerASCII(Char c) {
+  const bool is_ascii_upper = c >= 'A' && c <= 'Z';
+  if (!is_ascii_upper)
+    return c;
+  return c + ('a' - 'A');
+}
+
+// Backend for LowerCaseEqualsASCII. Returns true when the range
+// [a_begin, a_end), lower-cased with ToLowerASCII, is exactly equal to the
+// NUL-terminated string |b|.
+template<typename Iter>
+inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
+  Iter cursor = a_begin;
+  while (cursor != a_end) {
+    // Fail on the first mismatch, or if |b| ends before the range does.
+    if (*b == 0 || ToLowerASCII(*cursor) != *b)
+      return false;
+    ++cursor;
+    ++b;
+  }
+  // The range is used up; it matches only if |b| is used up as well.
+  return !*b;
+}
+
+const int kNumStandardURLSchemes = 8;  // Number of entries in kStandardURLSchemes.
+const char* kStandardURLSchemes[kNumStandardURLSchemes] = {  // Built-in schemes copied into the standard-scheme list by InitStandardSchemes.
+ "http",
+ "https",
+ kFileScheme, // Yes, file urls can have a hostname!
+ "ftp",
+ "gopher",
+ "ws", // WebSocket.
+ "wss", // WebSocket secure.
+ kFileSystemScheme,
+};
+
+// List of the currently installed standard schemes. This list is lazily
+// initialized by InitStandardSchemes. It is held through a raw pointer so that
+// no destructor runs at process exit; only an explicit Shutdown() deletes it.
+std::vector<const char*>* standard_schemes = NULL;
+
+// Set by LockStandardSchemes (see its declaration in the header for why).
+bool standard_schemes_locked = false;  // Checked by the DCHECK in AddStandardScheme.
+
+// Creates the standard_schemes list from the built-in kStandardURLSchemes
+// table the first time it is called; returns immediately if the list already
+// exists.
+void InitStandardSchemes() {
+  if (standard_schemes != NULL)
+    return;
+
+  // Seed the list with every built-in scheme in one step.
+  standard_schemes = new std::vector<const char*>(
+      kStandardURLSchemes, kStandardURLSchemes + kNumStandardURLSchemes);
+}
+
+// Compares the range |component| of |spec| against the lower-case,
+// NUL-terminated |compare_to| buffer, lower-casing the characters of |spec|
+// as it goes. A component that is not nonempty matches only an empty
+// |compare_to|.
+template<typename CHAR>
+inline bool DoCompareSchemeComponent(const CHAR* spec,
+                                     const url_parse::Component& component,
+                                     const char* compare_to) {
+  if (component.is_nonempty()) {
+    const CHAR* const range_begin = spec + component.begin;
+    const CHAR* const range_end = spec + component.end();
+    return LowerCaseEqualsASCII(range_begin, range_end, compare_to);
+  }
+
+  // Nothing to compare in |spec|: match only the empty scheme.
+  return compare_to[0] == 0;
+}
+
+// Returns true if the scheme occupying the range |scheme| of |spec| matches,
+// ignoring ASCII case in |spec|, one of the currently registered "standard"
+// schemes. The scheme list is created on first use.
+template<typename CHAR>
+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {
+  // Empty or invalid schemes are non-standard.
+  if (!scheme.is_nonempty())
+    return false;
+
+  InitStandardSchemes();
+
+  const CHAR* const scheme_begin = spec + scheme.begin;
+  const CHAR* const scheme_end = spec + scheme.end();
+  typedef std::vector<const char*>::const_iterator SchemeIter;
+  for (SchemeIter it = standard_schemes->begin();
+       it != standard_schemes->end(); ++it) {
+    if (LowerCaseEqualsASCII(scheme_begin, scheme_end, *it))
+      return true;
+  }
+  return false;
+}
+
+// Locates the scheme of |str| and compares it with |compare|. When
+// |found_scheme| is non-NULL it receives the scheme's range, or a
+// default-constructed Component if no scheme could be extracted. Returns true
+// only if a scheme was found and it matches |compare|.
+template<typename CHAR>
+bool DoFindAndCompareScheme(const CHAR* str,
+                            int str_len,
+                            const char* compare,
+                            url_parse::Component* found_scheme) {
+  // Strip whitespace before extracting the scheme so that the result agrees
+  // with the canonicalization done in DoCanonicalize.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(str, str_len,
+                                         &whitespace_buffer, &spec_len);
+
+  url_parse::Component scheme;
+  const bool has_scheme = url_parse::ExtractScheme(spec, spec_len, &scheme);
+
+  // Report what was found; "no scheme" is reported as an empty Component.
+  if (found_scheme)
+    *found_scheme = has_scheme ? scheme : url_parse::Component();
+
+  return has_scheme && DoCompareSchemeComponent(spec, scheme, compare);
+}
+
+// Canonicalizes the URL in |in_spec| into |output| and describes the result in
+// |output_parsed|. The scheme selects the parser/canonicalizer pair, tried in
+// this order: file, filesystem, any registered standard scheme, mailto, and
+// finally the generic "path URL" handling. Returns false if no scheme can be
+// extracted; otherwise returns whatever the selected canonicalizer returns.
+template<typename CHAR>
+bool DoCanonicalize(const CHAR* in_spec, int in_spec_len,
+                    url_canon::CharsetConverter* charset_converter,
+                    url_canon::CanonOutput* output,
+                    url_parse::Parsed* output_parsed) {
+  // Remove any whitespace from the middle of the URL, possibly copying it to
+  // the new buffer.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int spec_len;
+  const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len,
+                                         &whitespace_buffer, &spec_len);
+
+  // Parsed form of the input. It is filled in by whichever parser applies and
+  // is then canonicalized into |output| / |output_parsed|.
+  url_parse::Parsed parsed_input;
+#ifdef WIN32
+  // For Windows, we allow things that look like absolute Windows paths to be
+  // fixed up magically to file URLs. This is done for IE compatibility. For
+  // example, this will change "c:/foo" into a file URL rather than treating
+  // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
+  // There is similar logic in url_canon_relative.cc.
+  //
+  // For Mac & Unix, we don't do this (the equivalent would be "/foo/bar", which
+  // has no meaning as an absolute path name). This is because browsers on Mac
+  // & Unix don't generally do this, so there is no compatibility reason for
+  // doing so.
+  if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||
+      url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
+    url_parse::ParseFileURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
+                                          charset_converter,
+                                          output, output_parsed);
+  }
+#endif
+
+  url_parse::Component scheme;
+  if (!url_parse::ExtractScheme(spec, spec_len, &scheme))
+    return false;
+
+  // File URLs are special.
+  if (DoCompareSchemeComponent(spec, scheme, kFileScheme)) {
+    url_parse::ParseFileURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,
+                                          charset_converter, output,
+                                          output_parsed);
+  }
+
+  // Filesystem URLs are special.
+  if (DoCompareSchemeComponent(spec, scheme, kFileSystemScheme)) {
+    url_parse::ParseFileSystemURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeFileSystemURL(spec, spec_len, parsed_input,
+                                                charset_converter,
+                                                output, output_parsed);
+  }
+
+  // All "normal" URLs.
+  if (DoIsStandard(spec, scheme)) {
+    url_parse::ParseStandardURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,
+                                              charset_converter,
+                                              output, output_parsed);
+  }
+
+  // Mailto URLs are treated like a standard URL with only a scheme, path and
+  // query.
+  if (DoCompareSchemeComponent(spec, scheme, kMailtoScheme)) {
+    url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);
+    return url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,
+                                            output, output_parsed);
+  }
+
+  // "Weird" URLs like data: and javascript:
+  url_parse::ParsePathURL(spec, spec_len, &parsed_input);
+  return url_canon::CanonicalizePathURL(spec, spec_len, parsed_input,
+                                        output, output_parsed);
+}
+
+// Resolves |in_relative| against the base URL |base_spec| / |base_parsed| and
+// writes the canonical result to |output| / |output_parsed|. Input that turns
+// out not to be relative is canonicalized on its own. Returns false if the
+// relative-URL check fails; otherwise returns the result of whichever
+// resolve/canonicalize step handled the input.
+template<typename CHAR>
+bool DoResolveRelative(const char* base_spec,
+                       int base_spec_len,
+                       const url_parse::Parsed& base_parsed,
+                       const CHAR* in_relative,
+                       int in_relative_length,
+                       url_canon::CharsetConverter* charset_converter,
+                       url_canon::CanonOutput* output,
+                       url_parse::Parsed* output_parsed) {
+  // Remove any whitespace from the middle of the relative URL, possibly
+  // copying to the new buffer.
+  url_canon::RawCanonOutputT<CHAR> whitespace_buffer;
+  int relative_length;
+  const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
+                                             &whitespace_buffer,
+                                             &relative_length);
+
+  const bool base_has_scheme = base_parsed.scheme.is_nonempty();
+
+  // Classify the base by the number of slashes that follow "scheme:". One or
+  // more makes it hierarchical, two or more makes it authority-based.
+  int num_slashes = 0;
+  if (base_spec && base_has_scheme) {
+    const int after_scheme = base_parsed.scheme.end() + 1;  // Skip the colon.
+    num_slashes = url_parse::CountConsecutiveSlashes(
+        base_spec, after_scheme, base_spec_len);
+  }
+  const bool base_is_authority_based = num_slashes > 1;
+  const bool base_is_hierarchical = num_slashes > 0;
+
+  const bool standard_base_scheme =
+      base_has_scheme && DoIsStandard(base_spec, base_parsed.scheme);
+
+  bool is_relative;
+  url_parse::Component relative_component;
+  if (!url_canon::IsRelativeURL(base_spec, base_parsed,
+                                relative, relative_length,
+                                (base_is_hierarchical || standard_base_scheme),
+                                &is_relative,
+                                &relative_component)) {
+    // Error resolving.
+    return false;
+  }
+
+  if (is_relative) {
+    if (standard_base_scheme || !base_is_authority_based) {
+      // Relative, resolve and canonicalize.
+      const bool file_base_scheme = base_has_scheme &&
+          DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
+      return url_canon::ResolveRelativeURL(base_spec, base_parsed,
+                                           file_base_scheme, relative,
+                                           relative_component,
+                                           charset_converter,
+                                           output, output_parsed);
+    }
+
+    // The base has an authority but a non-standard scheme. Pretend for a
+    // moment that |base_spec| is a standard URL. Normally non-standard URLs
+    // are treated as PathURLs, but if the base has an authority we would like
+    // to preserve it.
+    url_parse::Parsed base_parsed_authority;
+    ParseStandardURL(base_spec, base_spec_len, &base_parsed_authority);
+    if (base_parsed_authority.host.is_nonempty()) {
+      const bool did_resolve_succeed =
+          url_canon::ResolveRelativeURL(base_spec, base_parsed_authority,
+                                        false, relative,
+                                        relative_component, charset_converter,
+                                        output, output_parsed);
+      // The output_parsed is incorrect at this point (because it was built
+      // based on base_parsed_authority instead of base_parsed) and needs to be
+      // re-created.
+      ParsePathURL(output->data(), output->length(), output_parsed);
+      return did_resolve_succeed;
+    }
+    // No host was found in the base: fall through and canonicalize the input
+    // by itself, exactly as for non-relative input.
+  }
+
+  // Not relative, canonicalize the input.
+  return DoCanonicalize(relative, relative_length, charset_converter,
+                        output, output_parsed);
+}
+
+// Applies |replacements| to the canonical URL |spec| (described by |parsed|)
+// and writes the new canonical URL to |output| / |out_parsed|. Returns the
+// result of the scheme-specific replacer that ends up handling the URL.
+template<typename CHAR>
+bool DoReplaceComponents(const char* spec,
+                         int spec_len,
+                         const url_parse::Parsed& parsed,
+                         const url_canon::Replacements<CHAR>& replacements,
+                         url_canon::CharsetConverter* charset_converter,
+                         url_canon::CanonOutput* output,
+                         url_parse::Parsed* out_parsed) {
+  // If the scheme is overridden, just do a simple string substitution and
+  // reparse the whole thing. There are lots of edge cases that we really don't
+  // want to deal with. Like what happens if I replace "http://e:8080/foo"
+  // with a file. Does it become "file:///E:/8080/foo" where the port number
+  // becomes part of the path? Parsing that string as a file URL says "yes"
+  // but almost no sane rule for dealing with the components individually would
+  // come up with that.
+  //
+  // Why allow these crazy cases at all? Programmatically, there is almost no
+  // case for replacing the scheme. The most common case for hitting this is
+  // in JS when building up a URL using the location object. In this case, the
+  // JS code expects the string substitution behavior:
+  //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
+  if (replacements.IsSchemeOverridden()) {
+    // Canonicalize the new scheme so it is 8-bit and can be concatenated with
+    // the existing spec.
+    url_canon::RawCanonOutput<128> scheme_replaced;
+    url_parse::Component scheme_replaced_parsed;
+    url_canon::CanonicalizeScheme(replacements.sources().scheme,
+                                  replacements.components().scheme,
+                                  &scheme_replaced, &scheme_replaced_parsed);
+
+    // We can assume that the input is canonicalized, which means it always has
+    // a colon after the scheme (or where the scheme would be).
+    int spec_after_colon = 1;
+    if (parsed.scheme.is_valid())
+      spec_after_colon = parsed.scheme.end() + 1;
+
+    // Append everything that followed the old scheme's colon, if anything.
+    const int tail_len = spec_len - spec_after_colon;
+    if (tail_len > 0)
+      scheme_replaced.Append(&spec[spec_after_colon], tail_len);
+
+    // We now need to completely re-parse the resulting string since its
+    // meaning may have changed with the different scheme.
+    url_canon::RawCanonOutput<128> recanonicalized;
+    url_parse::Parsed recanonicalized_parsed;
+    DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),
+                   charset_converter,
+                   &recanonicalized, &recanonicalized_parsed);
+
+    // Recurse using the version with the scheme already replaced. This will
+    // now use the replacement rules for the new scheme.
+    //
+    // Warning: this code assumes that ReplaceComponents will re-check all
+    // components for validity. This is because we can't fail if DoCanonicalize
+    // failed above since theoretically the thing making it fail could be
+    // getting replaced here. If ReplaceComponents didn't re-check everything,
+    // we wouldn't know if something *not* getting replaced is a problem.
+    // If the scheme-specific replacers are made more intelligent so they don't
+    // re-check everything, we should instead recanonicalize the whole thing
+    // after this call to check validity (this assumes replacing the scheme is
+    // much much less common than other types of replacements, like clearing
+    // the ref).
+    url_canon::Replacements<CHAR> replacements_no_scheme = replacements;
+    replacements_no_scheme.SetScheme(NULL, url_parse::Component());
+    return DoReplaceComponents(recanonicalized.data(),
+                               recanonicalized.length(),
+                               recanonicalized_parsed, replacements_no_scheme,
+                               charset_converter, output, out_parsed);
+  }
+
+  // If we get here, then we know the scheme doesn't need to be replaced, so
+  // we can just key off the scheme in the spec to know how to do the
+  // replacements.
+  const url_parse::Component& current_scheme = parsed.scheme;
+  if (DoCompareSchemeComponent(spec, current_scheme, kFileScheme)) {
+    return url_canon::ReplaceFileURL(spec, parsed, replacements,
+                                     charset_converter, output, out_parsed);
+  }
+  if (DoCompareSchemeComponent(spec, current_scheme, kFileSystemScheme)) {
+    return url_canon::ReplaceFileSystemURL(spec, parsed, replacements,
+                                           charset_converter, output,
+                                           out_parsed);
+  }
+  if (DoIsStandard(spec, current_scheme)) {
+    return url_canon::ReplaceStandardURL(spec, parsed, replacements,
+                                         charset_converter, output,
+                                         out_parsed);
+  }
+  if (DoCompareSchemeComponent(spec, current_scheme, kMailtoScheme)) {
+    return url_canon::ReplaceMailtoURL(spec, parsed, replacements,
+                                       output, out_parsed);
+  }
+
+  // Default is a path URL.
+  return url_canon::ReplacePathURL(spec, parsed, replacements,
+                                   output, out_parsed);
+}
+
+} // namespace
+
+void Initialize() {  // Builds the standard-scheme list now; does nothing if it already exists.
+ InitStandardSchemes();
+}
+
+// Releases the standard-scheme list together with the scheme strings that
+// AddStandardScheme() duplicated into it. Safe to call when nothing has been
+// initialized. A later call into the library re-creates the built-in list on
+// demand through InitStandardSchemes().
+void Shutdown() {
+  if (!standard_schemes)
+    return;
+
+  // InitStandardSchemes() always seeds the list with the
+  // kNumStandardURLSchemes built-in entries, which are not heap allocated.
+  // Every entry after those was allocated with new[] by AddStandardScheme()
+  // and is referenced only from this list, so it has to be freed here.
+  for (size_t i = kNumStandardURLSchemes; i < standard_schemes->size(); ++i)
+    delete[] (*standard_schemes)[i];
+
+  delete standard_schemes;
+  standard_schemes = NULL;
+}
+
+// Appends a private copy of |new_scheme| to the list of standard schemes,
+// creating the list first if necessary. An empty |new_scheme| is ignored.
+void AddStandardScheme(const char* new_scheme) {
+  // If this assert triggers, it means you've called AddStandardScheme after
+  // LockStandardSchemes has been called (see the header file for
+  // LockStandardSchemes for more).
+  //
+  // This normally means you're trying to set up a new standard scheme too late
+  // in your application's init process. Locate where your app does this
+  // initialization and calls LockStandardSchemes, and add your new standard
+  // scheme there.
+  DCHECK(!standard_schemes_locked) <<
+      "Trying to add a standard scheme after the list has been locked.";
+
+  // Size of the scheme including its terminating NUL; a size of one means the
+  // scheme is empty and there is nothing to register.
+  const size_t bytes_with_nul = strlen(new_scheme) + 1;
+  if (bytes_with_nul == 1)
+    return;
+
+  // Duplicate the scheme into a new buffer so the list does not depend on the
+  // caller's storage. This function never frees the copy; the list keeps the
+  // only pointer to it.
+  char* const scheme_copy = new char[bytes_with_nul];
+  memcpy(scheme_copy, new_scheme, bytes_with_nul);
+
+  InitStandardSchemes();
+  standard_schemes->push_back(scheme_copy);
+}
+
+void LockStandardSchemes() {  // From now on the DCHECK in AddStandardScheme fires.
+ standard_schemes_locked = true;
+}
+
+bool IsStandard(const char* spec, const url_parse::Component& scheme) {  // 8-bit entry point.
+ return DoIsStandard(spec, scheme);  // All work is done by the shared template.
+}
+
+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {  // 16-bit entry point.
+ return DoIsStandard(spec, scheme);  // All work is done by the shared template.
+}
+
+bool FindAndCompareScheme(const char* str,  // 8-bit entry point.
+ int str_len,
+ const char* compare,
+ url_parse::Component* found_scheme) {
+ return DoFindAndCompareScheme(str, str_len, compare, found_scheme);  // Forwards to the shared template.
+}
+
+bool FindAndCompareScheme(const char16* str,  // 16-bit entry point.
+ int str_len,
+ const char* compare,
+ url_parse::Component* found_scheme) {
+ return DoFindAndCompareScheme(str, str_len, compare, found_scheme);  // Forwards to the shared template.
+}
+
+bool Canonicalize(const char* spec,  // 8-bit entry point.
+ int spec_len,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed) {
+ return DoCanonicalize(spec, spec_len, charset_converter,  // Forwards to the shared template.
+ output, output_parsed);
+}
+
+bool Canonicalize(const char16* spec,  // 16-bit entry point.
+ int spec_len,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed) {
+ return DoCanonicalize(spec, spec_len, charset_converter,  // Forwards to the shared template.
+ output, output_parsed);
+}
+
+bool ResolveRelative(const char* base_spec,  // Overload taking an 8-bit |relative|.
+ int base_spec_len,
+ const url_parse::Parsed& base_parsed,
+ const char* relative,
+ int relative_length,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed) {
+ return DoResolveRelative(base_spec, base_spec_len, base_parsed,  // Forwards to the shared template.
+ relative, relative_length,
+ charset_converter, output, output_parsed);
+}
+
+bool ResolveRelative(const char* base_spec,  // Overload taking a 16-bit |relative|; the base is 8-bit here too.
+ int base_spec_len,
+ const url_parse::Parsed& base_parsed,
+ const char16* relative,
+ int relative_length,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed) {
+ return DoResolveRelative(base_spec, base_spec_len, base_parsed,  // Forwards to the shared template.
+ relative, relative_length,
+ charset_converter, output, output_parsed);
+}
+
+bool ReplaceComponents(const char* spec,  // Overload taking 8-bit replacements.
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ const url_canon::Replacements<char>& replacements,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* out_parsed) {
+ return DoReplaceComponents(spec, spec_len, parsed, replacements,  // Forwards to the shared template.
+ charset_converter, output, out_parsed);
+}
+
+bool ReplaceComponents(const char* spec,  // Overload taking 16-bit replacements; |spec| is 8-bit here too.
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ const url_canon::Replacements<char16>& replacements,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* out_parsed) {
+ return DoReplaceComponents(spec, spec_len, parsed, replacements,  // Forwards to the shared template.
+ charset_converter, output, out_parsed);
+}
+
+// Front-ends for LowerCaseEqualsASCII. This one takes 8-bit input and a NUL-terminated |b|.
+bool LowerCaseEqualsASCII(const char* a_begin,
+ const char* a_end,
+ const char* b) {
+ return DoLowerCaseEqualsASCII(a_begin, a_end, b);  // Forwards to the shared template.
+}
+
+// Returns true when [a_begin, a_end), lower-cased with ToLowerASCII, equals
+// [b_begin, b_end) exactly, including having the same length.
+bool LowerCaseEqualsASCII(const char* a_begin,
+                          const char* a_end,
+                          const char* b_begin,
+                          const char* b_end) {
+  for (; a_begin != a_end; ++a_begin, ++b_begin) {
+    // |b| ran out first, or the characters differ.
+    if (b_begin == b_end || ToLowerASCII(*a_begin) != *b_begin)
+      return false;
+  }
+  // |a| is used up; the strings are equal only if |b| is used up too.
+  return b_begin == b_end;
+}
+
+bool LowerCaseEqualsASCII(const char16* a_begin,  // 16-bit input, NUL-terminated 8-bit |b|.
+ const char16* a_end,
+ const char* b) {
+ return DoLowerCaseEqualsASCII(a_begin, a_end, b);  // Forwards to the shared template.
+}
+
+void DecodeURLEscapeSequences(const char* input, int length,
+ url_canon::CanonOutputW* output) {
+ url_canon::RawCanonOutputT<char> unescaped_chars;  // First pass result: |input| with every valid escape replaced by its byte.
+ for (int i = 0; i < length; i++) {
+ if (input[i] == '%') {
+ unsigned char ch;
+ if (url_canon::DecodeEscaped(input, &i, length, &ch)) {
+ unescaped_chars.push_back(ch);
+ } else {
+ // Invalid escape sequence, copy the percent literal.
+ unescaped_chars.push_back('%');
+ }
+ } else {
+ // Regular non-escaped 8-bit character.
+ unescaped_chars.push_back(input[i]);
+ }
+ }
+
+ // Second pass: convert the unescaped 8-bit data to UTF-16. It's not clear IE
+ // does this at all to JavaScript URLs, but Firefox and Safari do.
+ for (int i = 0; i < unescaped_chars.length(); i++) {
+ unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
+ if (uch < 0x80) {
+ // ASCII (below 0x80): not part of a multi-byte sequence, append directly.
+ output->push_back(uch);
+ } else {
+ // next_character will point to the last byte of the decoded
+ // character.
+ int next_character = i;
+ unsigned code_point;
+ if (url_canon::ReadUTFChar(unescaped_chars.data(), &next_character,
+ unescaped_chars.length(), &code_point)) {
+ // Valid UTF-8 character, convert to UTF-16.
+ url_canon::AppendUTF16Value(code_point, output);
+ i = next_character;
+ } else {
+ // If there are any sequences that are not valid UTF-8, we keep
+ // invalid code points and promote to UTF-16. We copy all characters
+ // from the current position to the end of the identified sequence.
+ while (i < next_character) {
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));
+ i++;
+ }
+ output->push_back(static_cast<unsigned char>(unescaped_chars.at(i)));  // The byte at next_character itself.
+ }
+ }
+ }
+}
+
+// Appends |input| (|length| bytes) to |output|, copying every byte accepted by
+// url_canon::IsComponentChar as-is and writing every other byte in escaped
+// form.
+void EncodeURIComponent(const char* input, int length,
+                        url_canon::CanonOutput* output) {
+  for (int i = 0; i < length; ++i) {
+    const unsigned char c = static_cast<unsigned char>(input[i]);
+    if (!url_canon::IsComponentChar(c)) {
+      AppendEscapedChar(c, output);
+      continue;
+    }
+    output->push_back(c);
+  }
+}
+
+bool CompareSchemeComponent(const char* spec,  // 8-bit entry point.
+ const url_parse::Component& component,
+ const char* compare_to) {
+ return DoCompareSchemeComponent(spec, component, compare_to);  // Forwards to the shared template.
+}
+
+bool CompareSchemeComponent(const char16* spec,  // 16-bit entry point.
+ const url_parse::Component& component,
+ const char* compare_to) {
+ return DoCompareSchemeComponent(spec, component, compare_to);  // Forwards to the shared template.
+}
+
+} // namespace url_util
diff --git a/url/url_util.h b/url/url_util.h
new file mode 100644
index 0000000..9e53d2d
--- /dev/null
+++ b/url/url_util.h
@@ -0,0 +1,228 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_UTIL_H__
+#define GOOGLEURL_SRC_URL_UTIL_H__
+
+#include <string>
+
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
+#include "googleurl/src/url_parse.h"
+#include "googleurl/src/url_canon.h"
+
+namespace url_util {
+
+// Init ------------------------------------------------------------------------
+
+// Initialization is NOT required, it will be implicitly initialized when first
+// used. However, this implicit initialization is NOT threadsafe. If you are
+// using this library in a threaded environment and don't have a consistent
+// "first call" (an example might be calling "AddStandardScheme" with your
+// special application-specific schemes) then you will want to call initialize
+// before spawning any threads.
+//
+// It is OK to call this function more than once, subsequent calls will simply
+// "noop", unless Shutdown() was called in the mean time. This will also be a
+// "noop" if other calls to the library have forced an initialization
+// beforehand.
+GURL_API void Initialize();
+
+// Cleanup is not required, except some strings may leak. For most user
+// applications, this is fine. If you're using it in a library that may get
+// loaded and unloaded, you'll want to call this before unloading to properly
+// clean up your library.
+GURL_API void Shutdown();
+
+// Schemes --------------------------------------------------------------------
+
+// Adds an application-defined scheme to the internal list of "standard" URL
+// schemes. This function is not threadsafe and can not be called concurrently
+// with any other url_util function. It will assert if the list of standard
+// schemes has been locked (see LockStandardSchemes).
+GURL_API void AddStandardScheme(const char* new_scheme);
+
+// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
+//
+// This is designed to help prevent errors for multithreaded applications.
+// Normal usage would be to call AddStandardScheme for your custom schemes at
+// the beginning of program initialization, and then LockStandardSchemes. This
+// prevents future callers from mistakenly calling AddStandardScheme when the
+// program is running with multiple threads, where such usage would be
+// dangerous.
+//
+// We could have had AddStandardScheme use a lock instead, but that would add
+// some platform-specific dependencies we don't otherwise have now, and is
+// overkill considering the normal usage is so simple.
+GURL_API void LockStandardSchemes();
+
+// Locates the scheme in the given string and places it into |found_scheme|,
+// which may be NULL to indicate the caller does not care about the range.
+//
+// Returns whether the given |compare| scheme matches the scheme found in the
+// input (if any). The |compare| scheme must be a valid canonical scheme or
+// the result of the comparison is undefined.
+GURL_API bool FindAndCompareScheme(const char* str,
+ int str_len,
+ const char* compare,
+ url_parse::Component* found_scheme);
+GURL_API bool FindAndCompareScheme(const char16* str,
+ int str_len,
+ const char* compare,
+ url_parse::Component* found_scheme);
+// Convenience overload for std::string input: forwards the string's data and
+// size to the pointer/length version of FindAndCompareScheme.
+inline bool FindAndCompareScheme(const std::string& str,
+                                 const char* compare,
+                                 url_parse::Component* found_scheme) {
+  const int str_len = static_cast<int>(str.size());
+  return FindAndCompareScheme(str.data(), str_len, compare, found_scheme);
+}
+// Convenience overload for string16 input: forwards the string's data and
+// size to the pointer/length version of FindAndCompareScheme.
+inline bool FindAndCompareScheme(const string16& str,
+                                 const char* compare,
+                                 url_parse::Component* found_scheme) {
+  const int str_len = static_cast<int>(str.size());
+  return FindAndCompareScheme(str.data(), str_len, compare, found_scheme);
+}
+
+// Returns true if the given string represents a standard URL. This means that
+// the scheme is in the list of known standard schemes.
+GURL_API bool IsStandard(const char* spec,
+ const url_parse::Component& scheme);
+GURL_API bool IsStandard(const char16* spec,
+ const url_parse::Component& scheme);
+
+// TODO(brettw) remove this. This is a temporary compatibility hack to avoid
+// breaking the WebKit build when this version is synced via Chrome.
+inline bool IsStandard(const char* spec, int spec_len,
+ const url_parse::Component& scheme) {
+ return IsStandard(spec, scheme);  // |spec_len| is not used; forwards to the two-argument overload.
+}
+
+// URL library wrappers -------------------------------------------------------
+
+// Parses the given spec according to the extracted scheme type. Normal users
+// should use the URL object, although this may be useful if performance is
+// critical and you don't want to do the heap allocation for the std::string.
+//
+// As with the url_canon::Canonicalize* functions, the charset converter can
+// be NULL to use UTF-8 (it will be faster in this case).
+//
+// Returns true if a valid URL was produced, false if not. On failure, the
+// output and parsed structures will still be filled and will be consistent,
+// but they will not represent a loadable URL.
+GURL_API bool Canonicalize(const char* spec,
+ int spec_len,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
+GURL_API bool Canonicalize(const char16* spec,
+ int spec_len,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
+
+// Resolves a potentially relative URL relative to the given parsed base URL.
+// The base MUST be valid. The resulting canonical URL and parsed information
+// will be placed in to the given out variables.
+//
+// The relative need not be relative. If we discover that it's absolute, this
+// will produce a canonical version of that URL. See Canonicalize() for more
+// about the charset_converter.
+//
+// Returns true if the output is valid, false if the input could not produce
+// a valid URL.
+GURL_API bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const url_parse::Parsed& base_parsed,
+ const char* relative,
+ int relative_length,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
+GURL_API bool ResolveRelative(const char* base_spec,
+ int base_spec_len,
+ const url_parse::Parsed& base_parsed,
+ const char16* relative,
+ int relative_length,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* output_parsed);
+
+// Replaces components in the given VALID input url. The new canonical URL info
+// is written to output and out_parsed.
+//
+// Returns true if the resulting URL is valid.
+GURL_API bool ReplaceComponents(
+ const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ const url_canon::Replacements<char>& replacements,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* out_parsed);
+GURL_API bool ReplaceComponents(
+ const char* spec,
+ int spec_len,
+ const url_parse::Parsed& parsed,
+ const url_canon::Replacements<char16>& replacements,
+ url_canon::CharsetConverter* charset_converter,
+ url_canon::CanonOutput* output,
+ url_parse::Parsed* out_parsed);
+
+// String helper functions ----------------------------------------------------
+
+// Compare the lower-case form of the given string against the given ASCII
+// string. This is useful for checking whether an input string matches some
+// token, and it is optimized to avoid intermediate string copies.
+//
+// The versions of this function that don't take a b_end assume that the b
+// string is NULL terminated.
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+ const char* a_end,
+ const char* b);
+GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
+ const char* a_end,
+ const char* b_begin,
+ const char* b_end);
+GURL_API bool LowerCaseEqualsASCII(const char16* a_begin,
+ const char16* a_end,
+ const char* b);
+
+// Unescapes the given string using URL escaping rules.
+GURL_API void DecodeURLEscapeSequences(const char* input, int length,
+ url_canon::CanonOutputW* output);
+
+// Escapes the given string as defined by the JS method encodeURIComponent. See
+// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent
+GURL_API void EncodeURIComponent(const char* input, int length,
+ url_canon::CanonOutput* output);
+
+
+} // namespace url_util
+
+#endif // GOOGLEURL_SRC_URL_UTIL_H__
diff --git a/url/url_util_internal.h b/url/url_util_internal.h
new file mode 100644
index 0000000..1fbb46a
--- /dev/null
+++ b/url/url_util_internal.h
@@ -0,0 +1,56 @@
+// Copyright 2011, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__
+#define GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__
+
+#include <string>
+
+#include "base/string16.h"
+#include "googleurl/src/url_common.h"
+#include "googleurl/src/url_parse.h"
+
+namespace url_util {
+
+extern const char kFileScheme[];  // Scheme names; defined outside this header.
+extern const char kFileSystemScheme[];  // NOTE(review): values are not visible
+extern const char kMailtoScheme[];  // here - confirm them at the definitions.
+
+// Compares the |component| range of |spec| to the given lower-case
+// |compare_to| buffer. Presumably returns true on a match - confirm in the .cc.
+bool CompareSchemeComponent(const char* spec,
+ const url_parse::Component& component,
+ const char* compare_to);
+bool CompareSchemeComponent(const char16* spec,  // char16 flavor of the above.
+ const url_parse::Component& component,
+ const char* compare_to);
+
+} // namespace url_util
+
+#endif // GOOGLEURL_SRC_URL_UTIL_INTERNAL_H__
diff --git a/url/url_util_unittest.cc b/url/url_util_unittest.cc
new file mode 100644
index 0000000..c7b39fe
--- /dev/null
+++ b/url/url_util_unittest.cc
@@ -0,0 +1,310 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_canon_stdstring.h"
+#include "googleurl/src/url_parse.h"
+#include "googleurl/src/url_test_utils.h"
+#include "googleurl/src/url_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+TEST(URLUtilTest, FindAndCompareScheme) {
+  url_parse::Component scheme;
+
+  // The scheme exists and equals the requested one; the out-param may be NULL.
+  const char kHttpUrl[] = "http://www.com/";
+  const int http_url_len = static_cast<int>(strlen(kHttpUrl));
+  EXPECT_TRUE(
+      url_util::FindAndCompareScheme(kHttpUrl, http_url_len, "http", NULL));
+  EXPECT_TRUE(
+      url_util::FindAndCompareScheme(kHttpUrl, http_url_len, "http", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(0, 4));
+
+  // The scheme exists but differs; its range is still reported.
+  EXPECT_FALSE(
+      url_util::FindAndCompareScheme(kHttpUrl, http_url_len, "https", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(0, 4));
+
+  // No scheme is present at all.
+  const char kSchemeless[] = "httpfoobar";
+  EXPECT_FALSE(url_util::FindAndCompareScheme(
+      kSchemeless, static_cast<int>(strlen(kSchemeless)), "http", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component());
+
+  // An empty scheme (leading colon) matches the empty string.
+  const char kEmptyScheme[] = ":foo.com/";
+  EXPECT_TRUE(url_util::FindAndCompareScheme(
+      kEmptyScheme, static_cast<int>(strlen(kEmptyScheme)), "", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(0, 0));
+
+  // Empty input has no scheme, so even the empty string must not match.
+  EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component());
+
+  // A scheme containing whitespace is canonicalized before the comparison.
+  const char kWhitespaceUrl[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
+  const int whitespace_url_len = static_cast<int>(strlen(kWhitespaceUrl));
+  EXPECT_TRUE(url_util::FindAndCompareScheme(
+      kWhitespaceUrl, whitespace_url_len, "javascript", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(1, 10));
+
+  // Control characters are stripped at the ends but kept in the middle, so
+  // the scheme found here is not equal to "javascript".
+  const char kControlUrl[] = "\02jav\02scr\03ipt:alert(1)";
+  const int control_url_len = static_cast<int>(strlen(kControlUrl));
+  EXPECT_FALSE(url_util::FindAndCompareScheme(
+      kControlUrl, control_url_len, "javascript", &scheme));
+  EXPECT_TRUE(scheme == url_parse::Component(1, 11));
+}
+
+TEST(URLUtilTest, ReplaceComponents) {
+  url_parse::Parsed parsed;
+  url_canon::RawCanonOutputT<char> output;
+  url_parse::Parsed new_parsed;
+  url_canon::Replacements<char> replacements;
+
+  // Nothing is asserted here: each call is issued with a NULL spec and with an
+  // empty spec, and merely has to return without crashing.
+  const char* const kSpecs[] = {NULL, ""};
+
+  replacements.SetRef("test", url_parse::Component(0, 4));  // Ref only.
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kSpecs); i++) {
+    url_util::ReplaceComponents(kSpecs[i], 0, parsed, replacements, NULL,
+                                &output, &new_parsed);
+  }
+
+  replacements.ClearRef();
+  replacements.SetHost("test", url_parse::Component(0, 4));  // Host only.
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kSpecs); i++) {
+    url_util::ReplaceComponents(kSpecs[i], 0, parsed, replacements, NULL,
+                                &output, &new_parsed);
+  }
+
+  replacements.ClearHost();  // Nothing replaced; both specs are tried twice.
+  for (size_t i = 0; i < 2 * ARRAYSIZE_UNSAFE(kSpecs); i++) {
+    url_util::ReplaceComponents(kSpecs[i % ARRAYSIZE_UNSAFE(kSpecs)], 0, parsed,
+                                replacements, NULL, &output, &new_parsed);
+  }
+}
+
+// Returns |base_url|, canonicalized, with its scheme replaced by |scheme|.
+static std::string CheckReplaceScheme(const char* base_url,
+                                      const char* scheme) {
+  // Canonicalize the input first. Lengths are passed as int, hence the casts.
+  url_canon::RawCanonOutput<32> original;
+  url_parse::Parsed original_parsed;
+  url_util::Canonicalize(base_url, static_cast<int>(strlen(base_url)), NULL,
+                         &original, &original_parsed);
+
+  url_canon::Replacements<char> replacements;
+  replacements.SetScheme(
+      scheme, url_parse::Component(0, static_cast<int>(strlen(scheme))));
+  std::string output_string;
+  url_canon::StdStringCanonOutput output(&output_string);
+  url_parse::Parsed output_parsed;
+  url_util::ReplaceComponents(original.data(), original.length(),
+                              original_parsed, replacements, NULL,
+                              &output, &output_parsed);
+  output.Complete();
+  return output_string;
+}
+
+TEST(URLUtilTest, ReplaceScheme) {
+  struct ReplaceSchemeCase {  // Base URL, scheme to set, expected result.
+    const char* base_url;
+    const char* scheme;
+    const char* expected;
+  } cases[] = {
+    {"http://google.com/", "https", "https://google.com/"},
+    {"http://google.com/", "file", "file://google.com/"},
+    {"file:///Home/Build", "http", "http://home/Build"},
+    {"about:foo", "javascript", "javascript:foo"},
+    {"http://google.com/", "", "://google.com/"},
+    {"about:google.com", "http", "http://google.com/"},
+    {"", "http", "http:"},
+#ifdef WIN32
+    // Magic Windows drive letter behavior when converting to a file URL.
+    {"http://localhost/e:foo/", "file", "file:///E:/foo/"},
+#endif
+    // NOTE(review): the earlier note said this would probably change to the
+    // value below once http://crbug.com/160 is fixed; it is already asserted.
+    {"http://google.com/", "about", "about://google.com/"},
+  };
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); i++) {
+    EXPECT_EQ(cases[i].expected,
+              CheckReplaceScheme(cases[i].base_url, cases[i].scheme)) << i;
+  }
+}
+
+TEST(URLUtilTest, DecodeURLEscapeSequences) {
+  // Each case is unescaped and the char16 result, converted back to UTF-8, is
+  // compared with |output|.
+  struct DecodeCase {
+    const char* input;
+    const char* output;
+  } decode_cases[] = {
+    {"hello, world", "hello, world"},
+    {"%01%02%03%04%05%06%07%08%09%0a%0B%0C%0D%0e%0f/",
+     "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0B\x0C\x0D\x0e\x0f/"},
+    {"%10%11%12%13%14%15%16%17%18%19%1a%1B%1C%1D%1e%1f/",
+     "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1B\x1C\x1D\x1e\x1f/"},
+    {"%20%21%22%23%24%25%26%27%28%29%2a%2B%2C%2D%2e%2f/",
+     " !\"#$%&'()*+,-.//"},
+    {"%30%31%32%33%34%35%36%37%38%39%3a%3B%3C%3D%3e%3f/", "0123456789:;<=>?/"},
+    {"%40%41%42%43%44%45%46%47%48%49%4a%4B%4C%4D%4e%4f/", "@ABCDEFGHIJKLMNO/"},
+    {"%50%51%52%53%54%55%56%57%58%59%5a%5B%5C%5D%5e%5f/",
+     "PQRSTUVWXYZ[\\]^_/"},
+    {"%60%61%62%63%64%65%66%67%68%69%6a%6B%6C%6D%6e%6f/",
+     "`abcdefghijklmno/"},
+    {"%70%71%72%73%74%75%76%77%78%79%7a%7B%7C%7D%7e%7f/",
+     "pqrstuvwxyz{|}~\x7f/"},
+    // Escaped UTF-8 sequences decode to the characters they encode.
+    {"%e4%bd%a0%e5%a5%bd", "\xe4\xbd\xa0\xe5\xa5\xbd"},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(decode_cases); i++) {
+    const char* input = decode_cases[i].input;
+    url_canon::RawCanonOutputT<char16> output;
+    url_util::DecodeURLEscapeSequences(input, static_cast<int>(strlen(input)),
+                                       &output);
+    EXPECT_EQ(decode_cases[i].output, url_test_utils::ConvertUTF16ToUTF8(
+        string16(output.data(), output.length())));
+  }
+
+  // %00 must be decoded as well, i.e. it must not survive as literal text.
+  const char zero_input[] = "%00";
+  url_canon::RawCanonOutputT<char16> zero_output;
+  url_util::DecodeURLEscapeSequences(
+      zero_input, static_cast<int>(strlen(zero_input)), &zero_output);
+  EXPECT_NE("%00",
+            url_test_utils::ConvertUTF16ToUTF8(
+                string16(zero_output.data(), zero_output.length())));
+
+  // Invalid UTF-8: stray bytes come out as-is, valid sequences still decode.
+  const char invalid_input[] = "%e4%a0%e5%a5%bd";
+  const char16 invalid_expected[4] = {0x00e4, 0x00a0, 0x597d, 0};
+  url_canon::RawCanonOutputT<char16> invalid_output;
+  url_util::DecodeURLEscapeSequences(
+      invalid_input, static_cast<int>(strlen(invalid_input)), &invalid_output);
+  EXPECT_EQ(string16(invalid_expected),
+            string16(invalid_output.data(), invalid_output.length()));
+}
+
+TEST(URLUtilTest, TestEncodeURIComponent) {
+  // Each |input| is escaped with url_util::EncodeURIComponent() and the text
+  // written to the output buffer is compared with |output|.
+  struct EncodeCase {
+    const char* input;
+    const char* output;
+  } encode_cases[] = {
+    {"hello, world", "hello%2C%20world"},
+    // Control characters are all expected as %XX escapes.
+    {"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
+     "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"},
+    {"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
+     "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F"},
+    // Of the rest, only letters, digits and !'()*-._~ are expected unescaped.
+    {" !\"#$%&'()*+,-./", "%20!%22%23%24%25%26'()*%2B%2C-.%2F"},
+    {"0123456789:;<=>?", "0123456789%3A%3B%3C%3D%3E%3F"},
+    {"@ABCDEFGHIJKLMNO", "%40ABCDEFGHIJKLMNO"},
+    {"PQRSTUVWXYZ[\\]^_", "PQRSTUVWXYZ%5B%5C%5D%5E_"},
+    {"`abcdefghijklmno", "%60abcdefghijklmno"},
+    {"pqrstuvwxyz{|}~\x7f", "pqrstuvwxyz%7B%7C%7D~%7F"},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(encode_cases); i++) {
+    const char* input = encode_cases[i].input;
+    // EncodeURIComponent() takes an int length; narrow strlen()'s size_t.
+    const int input_len = static_cast<int>(strlen(input));
+    url_canon::RawCanonOutputT<char> buffer;
+    url_util::EncodeURIComponent(input, input_len, &buffer);
+    std::string output(buffer.data(), buffer.length());
+    EXPECT_EQ(encode_cases[i].output, output);
+  }
+}
+
+TEST(URLUtilTest, TestResolveRelativeWithNonStandardBase) {
+  // This tests non-standard (in the sense that GURL::IsStandard() == false)
+  // hierarchical schemes.
+  struct ResolveRelativeCase {
+    const char* base;
+    const char* rel;
+    bool is_valid;
+    const char* out;
+  } resolve_non_standard_cases[] = {
+    // Resolving a relative path against a non-hierarchical URL should fail.
+    {"scheme:opaque_data", "/path", false, ""},
+    // Resolving a relative path against a non-standard authority-based base
+    // URL doesn't alter the authority section.
+    {"scheme://Authority/", "../path", true, "scheme://Authority/path"},
+    // A non-standard hierarchical base is resolved with path URL
+    // canonicalization rules.
+    {"data:/Blah:Blah/", "file.html", true, "data:/Blah:Blah/file.html"},
+    {"data:/Path/../part/part2", "file.html", true, "data:/Path/../part/file.html"},
+    // Path URL canonicalization rules also apply to non-standard authority-
+    // based URLs.
+    {"custom://Authority/", "file.html", true, "custom://Authority/file.html"},
+    {"custom://Authority/", "other://Auth/", true, "other://Auth/"},
+    {"custom://Authority/", "../../file.html", true, "custom://Authority/file.html"},
+    {"custom://Authority/path/", "file.html", true, "custom://Authority/path/file.html"},
+    {"custom://Authority:NoCanon/path/", "file.html", true, "custom://Authority:NoCanon/path/file.html"},
+    // It's still possible to get an invalid path URL.
+    {"custom://Invalid:!#Auth/", "file.html", false, ""},
+    // A path with an authority section gets canonicalized under standard URL
+    // rules, even though the base was non-standard.
+    {"content://content.Provider/", "//other.Provider", true, "content://other.provider/"},
+    // Resolving an absolute URL doesn't cause canonicalization of the
+    // result.
+    {"about:blank", "custom://Authority", true, "custom://Authority"},
+    // Resolving should fail if the base URL is authority-based but is
+    // missing a path component (the '/' at the end).
+    {"scheme://Authority", "path", false, ""},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(resolve_non_standard_cases); i++) {
+    const ResolveRelativeCase& test_data = resolve_non_standard_cases[i];
+    // Lengths are passed as int, so narrow strlen()'s size_t explicitly.
+    const int base_len = static_cast<int>(strlen(test_data.base));
+    const int rel_len = static_cast<int>(strlen(test_data.rel));
+    url_parse::Parsed base_parsed;
+    url_parse::ParsePathURL(test_data.base, base_len, &base_parsed);
+
+    std::string resolved;
+    url_canon::StdStringCanonOutput output(&resolved);
+    url_parse::Parsed resolved_parsed;
+    bool valid = url_util::ResolveRelative(
+        test_data.base, base_len, base_parsed, test_data.rel, rel_len, NULL,
+        &output, &resolved_parsed);
+    output.Complete();
+    // The resolved text is only compared when success was expected and seen.
+    EXPECT_EQ(test_data.is_valid, valid) << i;
+    if (test_data.is_valid && valid)
+      EXPECT_EQ(test_data.out, resolved) << i;
+  }
+}