From f968211f7391a19f3383ab184d1461a2e3b367fb Mon Sep 17 00:00:00 2001 From: "tfarina@chromium.org" Date: Mon, 12 Aug 2013 05:01:49 +0000 Subject: Stop pulling googleurl through DEPS. This is the final patch in this series of merging the external googleurl repo into Chromium source code base. BUG=229660 R=brettw@chromium.org,joth@chromium.org,blundell@chromium.org,thakis@chromium.org,thestig@chromium.org TBR=brettw Review URL: https://chromiumcodereview.appspot.com/20349002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@216922 0039d316-1c4b-4281-b951-d872f2087c98 --- DEPS | 6 - android_webview/buildbot/deps_whitelist.py | 1 - ios/public/DEPS | 1 - tools/checklicenses/checklicenses.py | 4 - tools/clang/plugins/ChromeClassTester.cpp | 1 - tools/licenses.py | 8 +- url/third_party/mozilla/LICENSE.txt | 65 ++ url/third_party/mozilla/README.chromium | 8 + url/third_party/mozilla/url_parse.cc | 923 +++++++++++++++++++++++++++++ url/third_party/mozilla/url_parse.h | 361 +++++++++++ url/url.gyp | 4 +- url/url_parse.cc | 923 ----------------------------- url/url_parse.h | 354 +---------- 13 files changed, 1362 insertions(+), 1297 deletions(-) create mode 100644 url/third_party/mozilla/LICENSE.txt create mode 100644 url/third_party/mozilla/README.chromium create mode 100644 url/third_party/mozilla/url_parse.cc create mode 100644 url/third_party/mozilla/url_parse.h delete mode 100644 url/url_parse.cc diff --git a/DEPS b/DEPS index 9c5d932..bc6feb5 100644 --- a/DEPS +++ b/DEPS @@ -55,9 +55,6 @@ deps = { "src/breakpad/src": (Var("googlecode_url") % "google-breakpad") + "/trunk/src@1199", - "src/googleurl": - (Var("googlecode_url") % "google-url") + "/trunk@185", - "src/sdch/open-vcdiff": (Var("googlecode_url") % "open-vcdiff") + "/trunk@42", @@ -513,8 +510,6 @@ include_rules = [ "+third_party/icu/source/common/unicode", "+third_party/icu/source/i18n/unicode", "+url", - # TODO(tfarina): Temporary, until we finish the migration to url. Remove this! 
- "!googleurl", ] @@ -523,7 +518,6 @@ skip_child_includes = [ "breakpad", "chrome_frame", "delegate_execute", - "googleurl", "metro_driver", "native_client_sdk", "o3d", diff --git a/android_webview/buildbot/deps_whitelist.py b/android_webview/buildbot/deps_whitelist.py index 69c87be..105646f 100755 --- a/android_webview/buildbot/deps_whitelist.py +++ b/android_webview/buildbot/deps_whitelist.py @@ -32,7 +32,6 @@ class DepsWhitelist(object): # Dependencies that need to be merged into the Android tree. self._snapshot_into_android_dependencies = [ - 'googleurl', 'sdch/open-vcdiff', 'testing/gtest', 'third_party/WebKit', diff --git a/ios/public/DEPS b/ios/public/DEPS index 3f0cd59..c04c1d5 100644 --- a/ios/public/DEPS +++ b/ios/public/DEPS @@ -4,7 +4,6 @@ include_rules = [ # be kept in sync with src/DEPS. "-base", "-build", - "-googleurl", "-library_loaders", "-testing", "-third_party/icu/source/common/unicode", diff --git a/tools/checklicenses/checklicenses.py b/tools/checklicenses/checklicenses.py index 9c27256..25dedc2 100755 --- a/tools/checklicenses/checklicenses.py +++ b/tools/checklicenses/checklicenses.py @@ -128,10 +128,6 @@ PATH_SPECIFIC_WHITELISTED_LICENSES = { 'data/tab_switching': [ 'UNKNOWN', ], - 'googleurl': [ # http://code.google.com/p/google-url/issues/detail?id=15 - 'UNKNOWN', - ], - 'native_client': [ # http://crbug.com/98099 'UNKNOWN', ], diff --git a/tools/clang/plugins/ChromeClassTester.cpp b/tools/clang/plugins/ChromeClassTester.cpp index 5784334..ee8452d 100644 --- a/tools/clang/plugins/ChromeClassTester.cpp +++ b/tools/clang/plugins/ChromeClassTester.cpp @@ -151,7 +151,6 @@ void ChromeClassTester::BuildBannedLists() { banned_directories_.push_back("ppapi/"); banned_directories_.push_back("usr/"); banned_directories_.push_back("testing/"); - banned_directories_.push_back("googleurl/"); banned_directories_.push_back("v8/"); banned_directories_.push_back("dart/"); banned_directories_.push_back("sdch/"); diff --git a/tools/licenses.py 
b/tools/licenses.py index 4d1cb01..d2c535b 100755 --- a/tools/licenses.py +++ b/tools/licenses.py @@ -82,7 +82,6 @@ ADDITIONAL_PATHS = ( os.path.join('chrome', 'common', 'extensions', 'docs', 'examples'), os.path.join('chrome', 'test', 'chromeos', 'autotest'), os.path.join('chrome', 'test', 'data'), - os.path.join('googleurl'), os.path.join('native_client'), os.path.join('native_client_sdk'), os.path.join('net', 'tools', 'spdyshark'), @@ -97,6 +96,7 @@ ADDITIONAL_PATHS = ( os.path.join('tools', 'grit'), os.path.join('tools', 'gyp'), os.path.join('tools', 'page_cycler', 'acid3'), + os.path.join('url', 'third_party', 'mozilla'), os.path.join('v8'), # Fake directory so we can include the strongtalk license. os.path.join('v8', 'strongtalk'), @@ -107,12 +107,6 @@ ADDITIONAL_PATHS = ( # can't provide a README.chromium. Please prefer a README.chromium # wherever possible. SPECIAL_CASES = { - os.path.join('googleurl'): { - "Name": "google-url", - "URL": "http://code.google.com/p/google-url/", - "License": "BSD and MPL 1.1/GPL 2.0/LGPL 2.1", - "License File": "LICENSE.txt", - }, os.path.join('native_client'): { "Name": "native client", "URL": "http://code.google.com/p/nativeclient", diff --git a/url/third_party/mozilla/LICENSE.txt b/url/third_party/mozilla/LICENSE.txt new file mode 100644 index 0000000..ac40837 --- /dev/null +++ b/url/third_party/mozilla/LICENSE.txt @@ -0,0 +1,65 @@ +Copyright 2007, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. 
nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. This file is +licensed separately as follows: + +The contents of this file are subject to the Mozilla Public License Version +1.1 (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at +http://www.mozilla.org/MPL/ + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +for the specific language governing rights and limitations under the +License. + +The Original Code is mozilla.org code. + +The Initial Developer of the Original Code is +Netscape Communications Corporation. +Portions created by the Initial Developer are Copyright (C) 1998 +the Initial Developer. All Rights Reserved. 
+ +Contributor(s): + Darin Fisher (original author) + +Alternatively, the contents of this file may be used under the terms of +either the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +in which case the provisions of the GPL or the LGPL are applicable instead +of those above. If you wish to allow use of your version of this file only +under the terms of either the GPL or the LGPL, and not to allow others to +use your version of this file under the terms of the MPL, indicate your +decision by deleting the provisions above and replace them with the notice +and other provisions required by the GPL or the LGPL. If you do not delete +the provisions above, a recipient may use your version of this file under +the terms of any one of the MPL, the GPL or the LGPL. diff --git a/url/third_party/mozilla/README.chromium b/url/third_party/mozilla/README.chromium new file mode 100644 index 0000000..ef396d3 --- /dev/null +++ b/url/third_party/mozilla/README.chromium @@ -0,0 +1,8 @@ +Name: url_parse +URL: http://mxr.mozilla.org/comm-central/source/mozilla/netwerk/base/src/nsURLParsers.cpp +License: BSD and MPL 1.1/GPL 2.0/LGPL 2.1 +License File: LICENSE.txt + +Description: + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. diff --git a/url/third_party/mozilla/url_parse.cc b/url/third_party/mozilla/url_parse.cc new file mode 100644 index 0000000..52c6196 --- /dev/null +++ b/url/third_party/mozilla/url_parse.cc @@ -0,0 +1,923 @@ +/* Based on nsURLParsers.cc from Mozilla + * ------------------------------------- + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Darin Fisher (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "url/third_party/mozilla/url_parse.h" + +#include + +#include "base/logging.h" +#include "url/url_parse_internal.h" +#include "url/url_util.h" +#include "url/url_util_internal.h" + +namespace url_parse { + +namespace { + +// Returns true if the given character is a valid digit to use in a port. 
+inline bool IsPortDigit(base::char16 ch) { + return ch >= '0' && ch <= '9'; +} + +// Returns the offset of the next authority terminator in the input starting +// from start_offset. If no terminator is found, the return value will be equal +// to spec_len. +template +int FindNextAuthorityTerminator(const CHAR* spec, + int start_offset, + int spec_len) { + for (int i = start_offset; i < spec_len; i++) { + if (IsAuthorityTerminator(spec[i])) + return i; + } + return spec_len; // Not found. +} + +template +void ParseUserInfo(const CHAR* spec, + const Component& user, + Component* username, + Component* password) { + // Find the first colon in the user section, which separates the username and + // password. + int colon_offset = 0; + while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') + colon_offset++; + + if (colon_offset < user.len) { + // Found separator: : + *username = Component(user.begin, colon_offset); + *password = MakeRange(user.begin + colon_offset + 1, + user.begin + user.len); + } else { + // No separator, treat everything as the username + *username = user; + *password = Component(); + } +} + +template +void ParseServerInfo(const CHAR* spec, + const Component& serverinfo, + Component* hostname, + Component* port_num) { + if (serverinfo.len == 0) { + // No server info, host name is empty. + hostname->reset(); + port_num->reset(); + return; + } + + // If the host starts with a left-bracket, assume the entire host is an + // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. + // This assumption will be overridden if we find a right-bracket. + // + // Our IPv6 address canonicalization code requires both brackets to exist, + // but the ability to locate an incomplete address can still be useful. + int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; + int colon = -1; + + // Find the last right-bracket, and the last colon. 
+ for (int i = serverinfo.begin; i < serverinfo.end(); i++) { + switch (spec[i]) { + case ']': + ipv6_terminator = i; + break; + case ':': + colon = i; + break; + } + } + + if (colon > ipv6_terminator) { + // Found a port number: : + *hostname = MakeRange(serverinfo.begin, colon); + if (hostname->len == 0) + hostname->reset(); + *port_num = MakeRange(colon + 1, serverinfo.end()); + } else { + // No port: + *hostname = serverinfo; + port_num->reset(); + } +} + +// Given an already-identified auth section, breaks it into its consituent +// parts. The port number will be parsed and the resulting integer will be +// filled into the given *port variable, or -1 if there is no port number or it +// is invalid. +template +void DoParseAuthority(const CHAR* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DCHECK(auth.is_valid()) << "We should always get an authority"; + if (auth.len == 0) { + username->reset(); + password->reset(); + hostname->reset(); + port_num->reset(); + return; + } + + // Search backwards for @, which is the separator between the user info and + // the server info. + int i = auth.begin + auth.len - 1; + while (i > auth.begin && spec[i] != '@') + i--; + + if (spec[i] == '@') { + // Found user info: @ + ParseUserInfo(spec, Component(auth.begin, i - auth.begin), + username, password); + ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), + hostname, port_num); + } else { + // No user info, everything is server info. + username->reset(); + password->reset(); + ParseServerInfo(spec, auth, hostname, port_num); + } +} + +template +void ParsePath(const CHAR* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + // path = [/]//<...>/;?# + + // Special case when there is no path. 
+ if (path.len == -1) { + filepath->reset(); + query->reset(); + ref->reset(); + return; + } + DCHECK(path.len > 0) << "We should never have 0 length paths"; + + // Search for first occurrence of either ? or #. + int path_end = path.begin + path.len; + + int query_separator = -1; // Index of the '?' + int ref_separator = -1; // Index of the '#' + for (int i = path.begin; i < path_end; i++) { + switch (spec[i]) { + case '?': + // Only match the query string if it precedes the reference fragment + // and when we haven't found one already. + if (ref_separator < 0 && query_separator < 0) + query_separator = i; + break; + case '#': + // Record the first # sign only. + if (ref_separator < 0) + ref_separator = i; + break; + } + } + + // Markers pointing to the character after each of these corresponding + // components. The code below words from the end back to the beginning, + // and will update these indices as it finds components that exist. + int file_end, query_end; + + // Ref fragment: from the # to the end of the path. + if (ref_separator >= 0) { + file_end = query_end = ref_separator; + *ref = MakeRange(ref_separator + 1, path_end); + } else { + file_end = query_end = path_end; + ref->reset(); + } + + // Query fragment: everything from the ? to the next boundary (either the end + // of the path or the ref fragment). + if (query_separator >= 0) { + file_end = query_separator; + *query = MakeRange(query_separator + 1, query_end); + } else { + query->reset(); + } + + // File path: treat an empty file path as no file path. + if (file_end != path.begin) + *filepath = MakeRange(path.begin, file_end); + else + filepath->reset(); +} + +template +bool DoExtractScheme(const CHAR* url, + int url_len, + Component* scheme) { + // Skip leading whitespace and control characters. + int begin = 0; + while (begin < url_len && ShouldTrimFromURL(url[begin])) + begin++; + if (begin == url_len) + return false; // Input is empty or all whitespace. + + // Find the first colon character. 
+ for (int i = begin; i < url_len; i++) { + if (url[i] == ':') { + *scheme = MakeRange(begin, i); + return true; + } + } + return false; // No colon found: no scheme +} + +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatability data points. I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. +template +void DoParseAfterScheme(const CHAR* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//", looks like an authority section. Treat everything + // from there to the next slash (or end of spec) to be the authority. Note + // that we ignore the number of slashes and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. 
+ full_path = Component(); + else // Everything starting from the slash to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. + DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + +// The main parsing function for standard URLs. Standard URLs have a scheme, +// host, path, etc. +template +void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + int after_scheme; + if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { + after_scheme = parsed->scheme.end() + 1; // Skip past the colon. + } else { + // Say there's no scheme when there is no colon. We could also say that + // everything is the scheme. Both would produce an invalid URL, but this way + // seems less wrong in more cases. + parsed->scheme.reset(); + after_scheme = begin; + } + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +template +void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the unused parts of the URL out of the way. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->path.reset(); // May use this; reset for convenience. + parsed->ref.reset(); // May use this; reset for convenience. + parsed->query.reset(); // May use this; reset for convenience. + parsed->clear_inner_parsed(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. 
+ if (begin == spec_len) { + parsed->scheme.reset(); + return; + } + + int inner_start = -1; + + // Extract the scheme. We also handle the case where there is no scheme. + if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() == spec_len - 1) + return; + + inner_start = parsed->scheme.end() + 1; + } else { + // No scheme found; that's not valid for filesystem URLs. + parsed->scheme.reset(); + return; + } + + url_parse::Component inner_scheme; + const CHAR* inner_spec = &spec[inner_start]; + int inner_spec_len = spec_len - inner_start; + + if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { + // Offset the results since we gave ExtractScheme a substring. + inner_scheme.begin += inner_start; + + if (inner_scheme.end() == spec_len - 1) + return; + } else { + // No scheme found; that's not valid for filesystem URLs. + // The best we can do is return "filesystem://". + return; + } + + Parsed inner_parsed; + + if (url_util::CompareSchemeComponent( + spec, inner_scheme, url_util::kFileScheme)) { + // File URLs are special. + ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); + } else if (url_util::CompareSchemeComponent(spec, inner_scheme, + url_util::kFileSystemScheme)) { + // Filesystem URLs don't nest. + return; + } else if (url_util::IsStandard(spec, inner_scheme)) { + // All "normal" URLs. + DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); + } else { + return; + } + + // All members of inner_parsed need to be offset by inner_start. + // If we had any scheme that supported nesting more than one level deep, + // we'd have to recurse into the inner_parsed's inner_parsed when + // adjusting by inner_start. 
+ inner_parsed.scheme.begin += inner_start; + inner_parsed.username.begin += inner_start; + inner_parsed.password.begin += inner_start; + inner_parsed.host.begin += inner_start; + inner_parsed.port.begin += inner_start; + inner_parsed.query.begin += inner_start; + inner_parsed.ref.begin += inner_start; + inner_parsed.path.begin += inner_start; + + // Query and ref move from inner_parsed to parsed. + parsed->query = inner_parsed.query; + inner_parsed.query.reset(); + parsed->ref = inner_parsed.ref; + inner_parsed.ref.reset(); + + parsed->set_inner_parsed(inner_parsed); + if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || + inner_parsed.inner_parsed()) { + return; + } + + // The path in inner_parsed should start with a slash, then have a filesystem + // type followed by a slash. From the first slash up to but excluding the + // second should be what it keeps; the rest goes to parsed. If the path ends + // before the second slash, it's still pretty clear what the user meant, so + // we'll let that through. + if (!IsURLSlash(spec[inner_parsed.path.begin])) { + return; + } + int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash + while (inner_path_end < spec_len && + !IsURLSlash(spec[inner_path_end])) + ++inner_path_end; + parsed->path.begin = inner_path_end; + int new_inner_path_length = inner_path_end - inner_parsed.path.begin; + parsed->path.len = inner_parsed.path.len - new_inner_path_length; + parsed->inner_parsed()->path.len = new_inner_path_length; +} + +// Initializes a path URL which is merely a scheme followed by a path. Examples +// include "about:foo" and "javascript:alert('bar');" +template +void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) { + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. 
+ parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + // For compatability with the standard URL parser, we treat no path as + // -1, rather than having a length of 0 (we normally wouldn't care so + // much for these non-standard URLs). + if (parsed->scheme.end() == spec_len - 1) + parsed->path.reset(); + else + parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len); + } else { + // No scheme found, just path. + parsed->scheme.reset(); + parsed->path = MakeRange(begin, spec_len); + } +} + +template +void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->ref.reset(); + parsed->query.reset(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin = -1; + int path_end = -1; + + // Extract the scheme, with the path being everything following. 
We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() != spec_len - 1) { + path_begin = parsed->scheme.end() + 1; + path_end = spec_len; + } + } else { + // No scheme found, just path. + parsed->scheme.reset(); + path_begin = begin; + path_end = spec_len; + } + + // Split [path_begin, path_end) into a path + query. + for (int i = path_begin; i < path_end; ++i) { + if (spec[i] == '?') { + parsed->query = MakeRange(i + 1, path_end); + path_end = i; + break; + } + } + + // For compatability with the standard URL parser, treat no path as + // -1, rather than having a length of 0 + if (path_begin == path_end) { + parsed->path.reset(); + } else { + parsed->path = MakeRange(path_begin, path_end); + } +} + +// Converts a port number in a string to an integer. We'd like to just call +// sscanf but our input is not NULL-terminated, which sscanf requires. Instead, +// we copy the digits to a small stack buffer (since we know the maximum number +// of digits in a valid port number) that we can NULL terminate. +template +int DoParsePort(const CHAR* spec, const Component& component) { + // Easy success case when there is no port. + const int kMaxDigits = 5; + if (!component.is_nonempty()) + return PORT_UNSPECIFIED; + + // Skip over any leading 0s. + Component digits_comp(component.end(), 0); + for (int i = 0; i < component.len; i++) { + if (spec[component.begin + i] != '0') { + digits_comp = MakeRange(component.begin + i, component.end()); + break; + } + } + if (digits_comp.len == 0) + return 0; // All digits were 0. + + // Verify we don't have too many digits (we'll be copying to our buffer so + // we need to double-check). + if (digits_comp.len > kMaxDigits) + return PORT_INVALID; + + // Copy valid digits to the buffer. 
+ char digits[kMaxDigits + 1]; // +1 for null terminator + for (int i = 0; i < digits_comp.len; i++) { + CHAR ch = spec[digits_comp.begin + i]; + if (!IsPortDigit(ch)) { + // Invalid port digit, fail. + return PORT_INVALID; + } + digits[i] = static_cast(ch); + } + + // Null-terminate the string and convert to integer. Since we guarantee + // only digits, atoi's lack of error handling is OK. + digits[digits_comp.len] = 0; + int port = atoi(digits); + if (port > 65535) + return PORT_INVALID; // Out of range. + return port; +} + +template +void DoExtractFileName(const CHAR* spec, + const Component& path, + Component* file_name) { + // Handle empty paths: they have no file names. + if (!path.is_nonempty()) { + file_name->reset(); + return; + } + + // Search backwards for a parameter, which is a normally unused field in a + // URL delimited by a semicolon. We parse the parameter as part of the + // path, but here, we don't want to count it. The last semicolon is the + // parameter. The path should start with a slash, so we don't need to check + // the first one. + int file_end = path.end(); + for (int i = path.end() - 1; i > path.begin; i--) { + if (spec[i] == ';') { + file_end = i; + break; + } + } + + // Now search backwards from the filename end to the previous slash + // to find the beginning of the filename. + for (int i = file_end - 1; i >= path.begin; i--) { + if (IsURLSlash(spec[i])) { + // File name is everything following this character to the end + *file_name = MakeRange(i + 1, file_end); + return; + } + } + + // No slash found, this means the input was degenerate (generally paths + // will start with a slash). Let's call everything the file name. 
+ *file_name = MakeRange(path.begin, file_end); + return; +} + +template +bool DoExtractQueryKeyValue(const CHAR* spec, + Component* query, + Component* key, + Component* value) { + if (!query->is_nonempty()) + return false; + + int start = query->begin; + int cur = start; + int end = query->end(); + + // We assume the beginning of the input is the beginning of the "key" and we + // skip to the end of it. + key->begin = cur; + while (cur < end && spec[cur] != '&' && spec[cur] != '=') + cur++; + key->len = cur - key->begin; + + // Skip the separator after the key (if any). + if (cur < end && spec[cur] == '=') + cur++; + + // Find the value part. + value->begin = cur; + while (cur < end && spec[cur] != '&') + cur++; + value->len = cur - value->begin; + + // Finally skip the next separator if any + if (cur < end && spec[cur] == '&') + cur++; + + // Save the new query + *query = url_parse::MakeRange(cur, end); + return true; +} + +} // namespace + +Parsed::Parsed() : inner_parsed_(NULL) { +} + +Parsed::Parsed(const Parsed& other) : + scheme(other.scheme), + username(other.username), + password(other.password), + host(other.host), + port(other.port), + path(other.path), + query(other.query), + ref(other.ref), + inner_parsed_(NULL) { + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); +} + +Parsed& Parsed::operator=(const Parsed& other) { + if (this != &other) { + scheme = other.scheme; + username = other.username; + password = other.password; + host = other.host; + port = other.port; + path = other.path; + query = other.query; + ref = other.ref; + if (other.inner_parsed_) + set_inner_parsed(*other.inner_parsed_); + else + clear_inner_parsed(); + } + return *this; +} + +Parsed::~Parsed() { + delete inner_parsed_; +} + +int Parsed::Length() const { + if (ref.is_valid()) + return ref.end(); + return CountCharactersBefore(REF, false); +} + +int Parsed::CountCharactersBefore(ComponentType type, + bool include_delimiter) const { + if (type == SCHEME) + return
scheme.begin; + + // There will be some characters after the scheme like "://" and we don't + // know how many. Search forwards for the next thing until we find one. + int cur = 0; + if (scheme.is_valid()) + cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. + + if (username.is_valid()) { + if (type <= USERNAME) + return username.begin; + cur = username.end() + 1; // Advance over the '@' or ':' at the end. + } + + if (password.is_valid()) { + if (type <= PASSWORD) + return password.begin; + cur = password.end() + 1; // Advance over the '@' at the end. + } + + if (host.is_valid()) { + if (type <= HOST) + return host.begin; + cur = host.end(); + } + + if (port.is_valid()) { + if (type < PORT || (type == PORT && include_delimiter)) + return port.begin - 1; // Back over delimiter. + if (type == PORT) + return port.begin; // Don't want delimiter counted. + cur = port.end(); + } + + if (path.is_valid()) { + if (type <= PATH) + return path.begin; + cur = path.end(); + } + + if (query.is_valid()) { + if (type < QUERY || (type == QUERY && include_delimiter)) + return query.begin - 1; // Back over delimiter. + if (type == QUERY) + return query.begin; // Don't want delimiter counted. + cur = query.end(); + } + + if (ref.is_valid()) { + if (type == REF && !include_delimiter) + return ref.begin; // Don't want delimiter counted. + + // When there is a ref and we get here, the component we wanted was before + // this and not found, so we always know the beginning of the ref is right. + return ref.begin - 1; // Back over delimiter. + } + + return cur; +} + +bool ExtractScheme(const char* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +// This handles everything that may be an authority terminator, including +// backslash. For special backslash handling see DoParseAfterScheme.
+bool IsAuthorityTerminator(base::char16 ch) { + return IsURLSlash(ch) || ch == '?' || ch == '#'; +} + +void ExtractFileName(const char* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +void ExtractFileName(const base::char16* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +bool ExtractQueryKeyValue(const base::char16* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +void ParseAuthority(const base::char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +int ParsePort(const char* url, const Component& port) { + return DoParsePort(url, port); +} + +int ParsePort(const base::char16* url, const Component& port) { + return DoParsePort(url, port); +} + +void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParsePathURL(const char* url, int url_len, Parsed* parsed) { + DoParsePathURL(url, url_len, parsed); +} + +void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) { + DoParsePathURL(url, url_len, parsed); +} + +void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, 
parsed); +} + +void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) { + DoParseFileSystemURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParsePathInternal(const base::char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +void ParseAfterScheme(const base::char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +} // namespace url_parse diff --git a/url/third_party/mozilla/url_parse.h b/url/third_party/mozilla/url_parse.h new file mode 100644 index 0000000..fd974f8 --- /dev/null +++ b/url/third_party/mozilla/url_parse.h @@ -0,0 +1,361 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ +#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ + +#include + +#include "base/basictypes.h" +#include "base/strings/string16.h" +#include "url/url_export.h" + +namespace url_parse { + +// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and +// KURLGoogle.cpp still rely on this type. +typedef base::char16 UTF16Char; + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. 
+struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { + return (len != -1); + } + + // Returns true if the given component is nonempty; if false, the component + // is either empty or invalid. + bool is_nonempty() const { + return (len > 0); + } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points. The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately. +// +// Typical usage would be: +// +// url_parse::Parsed parsed; +// url_parse::Component scheme; +// if (!url_parse::ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// url_parse::ParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// url_parse::ParseFileURL(url, url_len, &parsed); +// else +// url_parse::ParsePathURL(url, url_len, &parsed); +// +struct URL_EXPORT Parsed { + // Identifies different components.
+ enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components, but inner_parsed_ + // requires special handling. + Parsed(); + Parsed(const Parsed&); + Parsed& operator=(const Parsed&); + ~Parsed(); + + // Returns the length of the URL (the end of the last component). + // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flags for two of these + // delimited components that are present (the port and query) and one that + // isn't (the reference). The components that this flag affects are marked + // with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 7 7 + // PASSWORD: 7 7 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + + // Scheme without the colon: "http://foo/" would have a scheme of "http".
+ // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated from the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name. Length will be -1 if + // unspecified. This includes the preceding slash, so the path on + // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to + // have a 0 length path, it will be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). If there are multiple hash signs, we'll use the first one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; + + // This is used for nested URL types, currently only filesystem. If you + // parse a filesystem URL, the resulting Parsed will have a nested + // inner_parsed_ to hold the parsed inner URL's component information. + // For all other url types [including the inner URL], it will be NULL.
+ Parsed* inner_parsed() const { + return inner_parsed_; + } + + void set_inner_parsed(const Parsed& inner_parsed) { + if (!inner_parsed_) + inner_parsed_ = new Parsed(inner_parsed); + else + *inner_parsed_ = inner_parsed; + } + + void clear_inner_parsed() { + if (inner_parsed_) { + delete inner_parsed_; + inner_parsed_ = NULL; + } + } + + private: + Parsed* inner_parsed_; // This object is owned and managed by this struct. +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. +// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct. +// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". This function will not handle weird ones +// like "about:" and "javascript:", or do the right thing for "file:" URLs. +URL_EXPORT void ParseStandardURL(const char* url, + int url_len, + Parsed* parsed); +URL_EXPORT void ParseStandardURL(const base::char16* url, + int url_len, + Parsed* parsed); + +// PathURL is for when the scheme is known not to have an authority (host) +// section but the URL isn't a file URL either. The scheme is parsed, and +// everything after the scheme is considered as the path. This is used for +// things like "about:" and "javascript:" +URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed); +URL_EXPORT void ParsePathURL(const base::char16* url, + int url_len, + Parsed* parsed); + +// FileURL is for file URLs. There are some special rules for interpreting +// these.
+URL_EXPORT void ParseFileURL(const char* url, int url_len, Parsed* parsed); +URL_EXPORT void ParseFileURL(const base::char16* url, + int url_len, + Parsed* parsed); + +// Filesystem URLs are structured differently than other URLs. +URL_EXPORT void ParseFileSystemURL(const char* url, + int url_len, + Parsed* parsed); +URL_EXPORT void ParseFileSystemURL(const base::char16* url, + int url_len, + Parsed* parsed); + +// MailtoURL is for mailto: urls. They are made up of a scheme, path, and query. +URL_EXPORT void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); +URL_EXPORT void ParseMailtoURL(const base::char16* url, + int url_len, + Parsed* parsed); + +// Helper functions ----------------------------------------------------------- + +// Locates the scheme according to the URL parser's rules. This function is +// designed so the caller can find the scheme and call the correct Parse*URL +// function according to their known scheme types. +// +// It also does not perform any validation on the scheme. +// +// This function will return true if the scheme is found and will put the +// scheme's range into *scheme. False means no scheme could be found. Note +// that a URL beginning with a colon has a scheme, but it is empty, so this +// function will return true but *scheme will = (0,0). +// +// The scheme is found by skipping spaces and control characters at the +// beginning, and taking everything from there to the first colon to be the +// scheme. The character at scheme.end() will be the colon (we may enhance +// this to handle full width colons or something, so don't count on the +// actual character value). The character at scheme.end()+1 will be the +// beginning of the rest of the URL, be it the authority or the path (or the +// end of the string). +// +// The 8-bit version requires UTF-8 encoding.
+URL_EXPORT bool ExtractScheme(const char* url, + int url_len, + Component* scheme); +URL_EXPORT bool ExtractScheme(const base::char16* url, + int url_len, + Component* scheme); + +// Returns true if ch is a character that terminates the authority segment +// of a URL. +URL_EXPORT bool IsAuthorityTerminator(base::char16 ch); + +// Does a best effort parse of input |spec|, in range |auth|. If a particular +// component is not found, it will be set to invalid. +URL_EXPORT void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); +URL_EXPORT void ParseAuthority(const base::char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num); + +// Computes the integer port value from the given port component. The port +// component should have been identified by one of the init functions on +// |Parsed| for the given input url. +// +// The return value will be a non-negative integer between 0 and 64K, or one of +// the two special values below. +enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; +URL_EXPORT int ParsePort(const char* url, const Component& port); +URL_EXPORT int ParsePort(const base::char16* url, const Component& port); + +// Extracts the range of the file name in the given url. The path must +// already have been computed by the parse function, and the matching URL +// and extracted path are provided to this function. The filename is +// defined as being everything from the last slash/backslash of the path +// to the end of the path. +// +// The file name will be empty if the path is empty or there is nothing +// following the last slash. +// +// The 8-bit version requires UTF-8 encoding.
+URL_EXPORT void ExtractFileName(const char* url, + const Component& path, + Component* file_name); +URL_EXPORT void ExtractFileName(const base::char16* url, + const Component& path, + Component* file_name); + +// Extract the first key/value from the range defined by |*query|. Updates +// |*query| to start at the end of the extracted key/value pair. This is +// designed for use in a loop: you can keep calling it with the same query +// object and it will iterate over all items in the query. +// +// Some key/value pairs may have the key, the value, or both be empty (for +// example, the query string "?&"). These will be returned. Note that an empty +// last parameter "foo.com?" or "foo.com?a&" will not be returned, this case +// is the same as "done." +// +// The initial query component should not include the '?' (this is the default +// for parsed URLs). +// +// If no key/value are found |*key| and |*value| will be unchanged and it will +// return false. +URL_EXPORT bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value); +URL_EXPORT bool ExtractQueryKeyValue(const base::char16* url, + Component* query, + Component* key, + Component* value); + +} // namespace url_parse + +#endif // URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ diff --git a/url/url.gyp b/url/url.gyp index a9a1a19d..fe4d5fd 100644 --- a/url/url.gyp +++ b/url/url.gyp @@ -22,6 +22,8 @@ 'sources': [ 'gurl.cc', 'gurl.h', + 'third_party/mozilla/url_parse.cc', + 'third_party/mozilla/url_parse.h', 'url_canon.h', 'url_canon_etc.cc', 'url_canon_filesystemurl.cc', @@ -42,8 +44,6 @@ 'url_canon_stdstring.h', 'url_canon_stdurl.cc', 'url_file.h', - 'url_parse.cc', - 'url_parse.h', 'url_parse_file.cc', 'url_parse_internal.h', 'url_util.cc', diff --git a/url/url_parse.cc b/url/url_parse.cc deleted file mode 100644 index 0d9c6dd1c..0000000 --- a/url/url_parse.cc +++ /dev/null @@ -1,923 +0,0 @@ -/* Based on nsURLParsers.cc from Mozilla - * ------------------------------------- - *
The contents of this file are subject to the Mozilla Public License Version - * 1.1 (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" basis, - * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License - * for the specific language governing rights and limitations under the - * License. - * - * The Original Code is mozilla.org code. - * - * The Initial Developer of the Original Code is - * Netscape Communications Corporation. - * Portions created by the Initial Developer are Copyright (C) 1998 - * the Initial Developer. All Rights Reserved. - * - * Contributor(s): - * Darin Fisher (original author) - * - * Alternatively, the contents of this file may be used under the terms of - * either the GNU General Public License Version 2 or later (the "GPL"), or - * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), - * in which case the provisions of the GPL or the LGPL are applicable instead - * of those above. If you wish to allow use of your version of this file only - * under the terms of either the GPL or the LGPL, and not to allow others to - * use your version of this file under the terms of the MPL, indicate your - * decision by deleting the provisions above and replace them with the notice - * and other provisions required by the GPL or the LGPL. If you do not delete - * the provisions above, a recipient may use your version of this file under - * the terms of any one of the MPL, the GPL or the LGPL. - * - * ***** END LICENSE BLOCK ***** */ - -#include "url/url_parse.h" - -#include - -#include "base/logging.h" -#include "url/url_parse_internal.h" -#include "url/url_util.h" -#include "url/url_util_internal.h" - -namespace url_parse { - -namespace { - -// Returns true if the given character is a valid digit to use in a port. 
-inline bool IsPortDigit(base::char16 ch) { - return ch >= '0' && ch <= '9'; -} - -// Returns the offset of the next authority terminator in the input starting -// from start_offset. If no terminator is found, the return value will be equal -// to spec_len. -template -int FindNextAuthorityTerminator(const CHAR* spec, - int start_offset, - int spec_len) { - for (int i = start_offset; i < spec_len; i++) { - if (IsAuthorityTerminator(spec[i])) - return i; - } - return spec_len; // Not found. -} - -template -void ParseUserInfo(const CHAR* spec, - const Component& user, - Component* username, - Component* password) { - // Find the first colon in the user section, which separates the username and - // password. - int colon_offset = 0; - while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') - colon_offset++; - - if (colon_offset < user.len) { - // Found separator: : - *username = Component(user.begin, colon_offset); - *password = MakeRange(user.begin + colon_offset + 1, - user.begin + user.len); - } else { - // No separator, treat everything as the username - *username = user; - *password = Component(); - } -} - -template -void ParseServerInfo(const CHAR* spec, - const Component& serverinfo, - Component* hostname, - Component* port_num) { - if (serverinfo.len == 0) { - // No server info, host name is empty. - hostname->reset(); - port_num->reset(); - return; - } - - // If the host starts with a left-bracket, assume the entire host is an - // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. - // This assumption will be overridden if we find a right-bracket. - // - // Our IPv6 address canonicalization code requires both brackets to exist, - // but the ability to locate an incomplete address can still be useful. - int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; - int colon = -1; - - // Find the last right-bracket, and the last colon. 
- for (int i = serverinfo.begin; i < serverinfo.end(); i++) { - switch (spec[i]) { - case ']': - ipv6_terminator = i; - break; - case ':': - colon = i; - break; - } - } - - if (colon > ipv6_terminator) { - // Found a port number: : - *hostname = MakeRange(serverinfo.begin, colon); - if (hostname->len == 0) - hostname->reset(); - *port_num = MakeRange(colon + 1, serverinfo.end()); - } else { - // No port: - *hostname = serverinfo; - port_num->reset(); - } -} - -// Given an already-identified auth section, breaks it into its consituent -// parts. The port number will be parsed and the resulting integer will be -// filled into the given *port variable, or -1 if there is no port number or it -// is invalid. -template -void DoParseAuthority(const CHAR* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num) { - DCHECK(auth.is_valid()) << "We should always get an authority"; - if (auth.len == 0) { - username->reset(); - password->reset(); - hostname->reset(); - port_num->reset(); - return; - } - - // Search backwards for @, which is the separator between the user info and - // the server info. - int i = auth.begin + auth.len - 1; - while (i > auth.begin && spec[i] != '@') - i--; - - if (spec[i] == '@') { - // Found user info: @ - ParseUserInfo(spec, Component(auth.begin, i - auth.begin), - username, password); - ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), - hostname, port_num); - } else { - // No user info, everything is server info. - username->reset(); - password->reset(); - ParseServerInfo(spec, auth, hostname, port_num); - } -} - -template -void ParsePath(const CHAR* spec, - const Component& path, - Component* filepath, - Component* query, - Component* ref) { - // path = [/]//<...>/;?# - - // Special case when there is no path. 
- if (path.len == -1) { - filepath->reset(); - query->reset(); - ref->reset(); - return; - } - DCHECK(path.len > 0) << "We should never have 0 length paths"; - - // Search for first occurrence of either ? or #. - int path_end = path.begin + path.len; - - int query_separator = -1; // Index of the '?' - int ref_separator = -1; // Index of the '#' - for (int i = path.begin; i < path_end; i++) { - switch (spec[i]) { - case '?': - // Only match the query string if it precedes the reference fragment - // and when we haven't found one already. - if (ref_separator < 0 && query_separator < 0) - query_separator = i; - break; - case '#': - // Record the first # sign only. - if (ref_separator < 0) - ref_separator = i; - break; - } - } - - // Markers pointing to the character after each of these corresponding - // components. The code below words from the end back to the beginning, - // and will update these indices as it finds components that exist. - int file_end, query_end; - - // Ref fragment: from the # to the end of the path. - if (ref_separator >= 0) { - file_end = query_end = ref_separator; - *ref = MakeRange(ref_separator + 1, path_end); - } else { - file_end = query_end = path_end; - ref->reset(); - } - - // Query fragment: everything from the ? to the next boundary (either the end - // of the path or the ref fragment). - if (query_separator >= 0) { - file_end = query_separator; - *query = MakeRange(query_separator + 1, query_end); - } else { - query->reset(); - } - - // File path: treat an empty file path as no file path. - if (file_end != path.begin) - *filepath = MakeRange(path.begin, file_end); - else - filepath->reset(); -} - -template -bool DoExtractScheme(const CHAR* url, - int url_len, - Component* scheme) { - // Skip leading whitespace and control characters. - int begin = 0; - while (begin < url_len && ShouldTrimFromURL(url[begin])) - begin++; - if (begin == url_len) - return false; // Input is empty or all whitespace. - - // Find the first colon character. 
- for (int i = begin; i < url_len; i++) { - if (url[i] == ':') { - *scheme = MakeRange(begin, i); - return true; - } - } - return false; // No colon found: no scheme -} - -// Fills in all members of the Parsed structure except for the scheme. -// -// |spec| is the full spec being parsed, of length |spec_len|. -// |after_scheme| is the character immediately following the scheme (after the -// colon) where we'll begin parsing. -// -// Compatability data points. I list "host", "path" extracted: -// Input IE6 Firefox Us -// ----- -------------- -------------- -------------- -// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" -// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" -// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" -// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" -// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" -// -// (*) Interestingly, although IE fails to load these URLs, its history -// canonicalizer handles them, meaning if you've been to the corresponding -// "http://foo.com/" link, it will be colored. -template -void DoParseAfterScheme(const CHAR* spec, - int spec_len, - int after_scheme, - Parsed* parsed) { - int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); - int after_slashes = after_scheme + num_slashes; - - // First split into two main parts, the authority (username, password, host, - // and port) and the full path (path, query, and reference). - Component authority; - Component full_path; - - // Found "//", looks like an authority section. Treat everything - // from there to the next slash (or end of spec) to be the authority. Note - // that we ignore the number of slashes and treat it as the authority. - int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); - authority = Component(after_slashes, end_auth - after_slashes); - - if (end_auth == spec_len) // No beginning of path found. 
- full_path = Component(); - else // Everything starting from the slash to the end is the path. - full_path = Component(end_auth, spec_len - end_auth); - - // Now parse those two sub-parts. - DoParseAuthority(spec, authority, &parsed->username, &parsed->password, - &parsed->host, &parsed->port); - ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); -} - -// The main parsing function for standard URLs. Standard URLs have a scheme, -// host, path, etc. -template -void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { - DCHECK(spec_len >= 0); - - // Strip leading & trailing spaces and control characters. - int begin = 0; - TrimURL(spec, &begin, &spec_len); - - int after_scheme; - if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { - after_scheme = parsed->scheme.end() + 1; // Skip past the colon. - } else { - // Say there's no scheme when there is no colon. We could also say that - // everything is the scheme. Both would produce an invalid URL, but this way - // seems less wrong in more cases. - parsed->scheme.reset(); - after_scheme = begin; - } - DoParseAfterScheme(spec, spec_len, after_scheme, parsed); -} - -template -void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { - DCHECK(spec_len >= 0); - - // Get the unused parts of the URL out of the way. - parsed->username.reset(); - parsed->password.reset(); - parsed->host.reset(); - parsed->port.reset(); - parsed->path.reset(); // May use this; reset for convenience. - parsed->ref.reset(); // May use this; reset for convenience. - parsed->query.reset(); // May use this; reset for convenience. - parsed->clear_inner_parsed(); // May use this; reset for convenience. - - // Strip leading & trailing spaces and control characters. - int begin = 0; - TrimURL(spec, &begin, &spec_len); - - // Handle empty specs or ones that contain only whitespace or control chars. 
- if (begin == spec_len) { - parsed->scheme.reset(); - return; - } - - int inner_start = -1; - - // Extract the scheme. We also handle the case where there is no scheme. - if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { - // Offset the results since we gave ExtractScheme a substring. - parsed->scheme.begin += begin; - - if (parsed->scheme.end() == spec_len - 1) - return; - - inner_start = parsed->scheme.end() + 1; - } else { - // No scheme found; that's not valid for filesystem URLs. - parsed->scheme.reset(); - return; - } - - url_parse::Component inner_scheme; - const CHAR* inner_spec = &spec[inner_start]; - int inner_spec_len = spec_len - inner_start; - - if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { - // Offset the results since we gave ExtractScheme a substring. - inner_scheme.begin += inner_start; - - if (inner_scheme.end() == spec_len - 1) - return; - } else { - // No scheme found; that's not valid for filesystem URLs. - // The best we can do is return "filesystem://". - return; - } - - Parsed inner_parsed; - - if (url_util::CompareSchemeComponent( - spec, inner_scheme, url_util::kFileScheme)) { - // File URLs are special. - ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); - } else if (url_util::CompareSchemeComponent(spec, inner_scheme, - url_util::kFileSystemScheme)) { - // Filesystem URLs don't nest. - return; - } else if (url_util::IsStandard(spec, inner_scheme)) { - // All "normal" URLs. - DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); - } else { - return; - } - - // All members of inner_parsed need to be offset by inner_start. - // If we had any scheme that supported nesting more than one level deep, - // we'd have to recurse into the inner_parsed's inner_parsed when - // adjusting by inner_start. 
- inner_parsed.scheme.begin += inner_start; - inner_parsed.username.begin += inner_start; - inner_parsed.password.begin += inner_start; - inner_parsed.host.begin += inner_start; - inner_parsed.port.begin += inner_start; - inner_parsed.query.begin += inner_start; - inner_parsed.ref.begin += inner_start; - inner_parsed.path.begin += inner_start; - - // Query and ref move from inner_parsed to parsed. - parsed->query = inner_parsed.query; - inner_parsed.query.reset(); - parsed->ref = inner_parsed.ref; - inner_parsed.ref.reset(); - - parsed->set_inner_parsed(inner_parsed); - if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || - inner_parsed.inner_parsed()) { - return; - } - - // The path in inner_parsed should start with a slash, then have a filesystem - // type followed by a slash. From the first slash up to but excluding the - // second should be what it keeps; the rest goes to parsed. If the path ends - // before the second slash, it's still pretty clear what the user meant, so - // we'll let that through. - if (!IsURLSlash(spec[inner_parsed.path.begin])) { - return; - } - int inner_path_end = inner_parsed.path.begin + 1; // skip the leading slash - while (inner_path_end < spec_len && - !IsURLSlash(spec[inner_path_end])) - ++inner_path_end; - parsed->path.begin = inner_path_end; - int new_inner_path_length = inner_path_end - inner_parsed.path.begin; - parsed->path.len = inner_parsed.path.len - new_inner_path_length; - parsed->inner_parsed()->path.len = new_inner_path_length; -} - -// Initializes a path URL which is merely a scheme followed by a path. Examples -// include "about:foo" and "javascript:alert('bar');" -template -void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) { - // Get the non-path and non-scheme parts of the URL out of the way, we never - // use them. 
- parsed->username.reset(); - parsed->password.reset(); - parsed->host.reset(); - parsed->port.reset(); - parsed->query.reset(); - parsed->ref.reset(); - - // Strip leading & trailing spaces and control characters. - int begin = 0; - TrimURL(spec, &begin, &spec_len); - - // Handle empty specs or ones that contain only whitespace or control chars. - if (begin == spec_len) { - parsed->scheme.reset(); - parsed->path.reset(); - return; - } - - // Extract the scheme, with the path being everything following. We also - // handle the case where there is no scheme. - if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { - // Offset the results since we gave ExtractScheme a substring. - parsed->scheme.begin += begin; - - // For compatability with the standard URL parser, we treat no path as - // -1, rather than having a length of 0 (we normally wouldn't care so - // much for these non-standard URLs). - if (parsed->scheme.end() == spec_len - 1) - parsed->path.reset(); - else - parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len); - } else { - // No scheme found, just path. - parsed->scheme.reset(); - parsed->path = MakeRange(begin, spec_len); - } -} - -template -void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { - DCHECK(spec_len >= 0); - - // Get the non-path and non-scheme parts of the URL out of the way, we never - // use them. - parsed->username.reset(); - parsed->password.reset(); - parsed->host.reset(); - parsed->port.reset(); - parsed->ref.reset(); - parsed->query.reset(); // May use this; reset for convenience. - - // Strip leading & trailing spaces and control characters. - int begin = 0; - TrimURL(spec, &begin, &spec_len); - - // Handle empty specs or ones that contain only whitespace or control chars. - if (begin == spec_len) { - parsed->scheme.reset(); - parsed->path.reset(); - return; - } - - int path_begin = -1; - int path_end = -1; - - // Extract the scheme, with the path being everything following. 
We also - // handle the case where there is no scheme. - if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { - // Offset the results since we gave ExtractScheme a substring. - parsed->scheme.begin += begin; - - if (parsed->scheme.end() != spec_len - 1) { - path_begin = parsed->scheme.end() + 1; - path_end = spec_len; - } - } else { - // No scheme found, just path. - parsed->scheme.reset(); - path_begin = begin; - path_end = spec_len; - } - - // Split [path_begin, path_end) into a path + query. - for (int i = path_begin; i < path_end; ++i) { - if (spec[i] == '?') { - parsed->query = MakeRange(i + 1, path_end); - path_end = i; - break; - } - } - - // For compatability with the standard URL parser, treat no path as - // -1, rather than having a length of 0 - if (path_begin == path_end) { - parsed->path.reset(); - } else { - parsed->path = MakeRange(path_begin, path_end); - } -} - -// Converts a port number in a string to an integer. We'd like to just call -// sscanf but our input is not NULL-terminated, which sscanf requires. Instead, -// we copy the digits to a small stack buffer (since we know the maximum number -// of digits in a valid port number) that we can NULL terminate. -template -int DoParsePort(const CHAR* spec, const Component& component) { - // Easy success case when there is no port. - const int kMaxDigits = 5; - if (!component.is_nonempty()) - return PORT_UNSPECIFIED; - - // Skip over any leading 0s. - Component digits_comp(component.end(), 0); - for (int i = 0; i < component.len; i++) { - if (spec[component.begin + i] != '0') { - digits_comp = MakeRange(component.begin + i, component.end()); - break; - } - } - if (digits_comp.len == 0) - return 0; // All digits were 0. - - // Verify we don't have too many digits (we'll be copying to our buffer so - // we need to double-check). - if (digits_comp.len > kMaxDigits) - return PORT_INVALID; - - // Copy valid digits to the buffer. 
- char digits[kMaxDigits + 1]; // +1 for null terminator - for (int i = 0; i < digits_comp.len; i++) { - CHAR ch = spec[digits_comp.begin + i]; - if (!IsPortDigit(ch)) { - // Invalid port digit, fail. - return PORT_INVALID; - } - digits[i] = static_cast(ch); - } - - // Null-terminate the string and convert to integer. Since we guarantee - // only digits, atoi's lack of error handling is OK. - digits[digits_comp.len] = 0; - int port = atoi(digits); - if (port > 65535) - return PORT_INVALID; // Out of range. - return port; -} - -template -void DoExtractFileName(const CHAR* spec, - const Component& path, - Component* file_name) { - // Handle empty paths: they have no file names. - if (!path.is_nonempty()) { - file_name->reset(); - return; - } - - // Search backwards for a parameter, which is a normally unused field in a - // URL delimited by a semicolon. We parse the parameter as part of the - // path, but here, we don't want to count it. The last semicolon is the - // parameter. The path should start with a slash, so we don't need to check - // the first one. - int file_end = path.end(); - for (int i = path.end() - 1; i > path.begin; i--) { - if (spec[i] == ';') { - file_end = i; - break; - } - } - - // Now search backwards from the filename end to the previous slash - // to find the beginning of the filename. - for (int i = file_end - 1; i >= path.begin; i--) { - if (IsURLSlash(spec[i])) { - // File name is everything following this character to the end - *file_name = MakeRange(i + 1, file_end); - return; - } - } - - // No slash found, this means the input was degenerate (generally paths - // will start with a slash). Let's call everything the file name. 
- *file_name = MakeRange(path.begin, file_end); - return; -} - -template -bool DoExtractQueryKeyValue(const CHAR* spec, - Component* query, - Component* key, - Component* value) { - if (!query->is_nonempty()) - return false; - - int start = query->begin; - int cur = start; - int end = query->end(); - - // We assume the beginning of the input is the beginning of the "key" and we - // skip to the end of it. - key->begin = cur; - while (cur < end && spec[cur] != '&' && spec[cur] != '=') - cur++; - key->len = cur - key->begin; - - // Skip the separator after the key (if any). - if (cur < end && spec[cur] == '=') - cur++; - - // Find the value part. - value->begin = cur; - while (cur < end && spec[cur] != '&') - cur++; - value->len = cur - value->begin; - - // Finally skip the next separator if any - if (cur < end && spec[cur] == '&') - cur++; - - // Save the new query - *query = url_parse::MakeRange(cur, end); - return true; -} - -} // namespace - -Parsed::Parsed() : inner_parsed_(NULL) { -} - -Parsed::Parsed(const Parsed& other) : - scheme(other.scheme), - username(other.username), - password(other.password), - host(other.host), - port(other.port), - path(other.path), - query(other.query), - ref(other.ref), - inner_parsed_(NULL) { - if (other.inner_parsed_) - set_inner_parsed(*other.inner_parsed_); -} - -Parsed& Parsed::operator=(const Parsed& other) { - if (this != &other) { - scheme = other.scheme; - username = other.username; - password = other.password; - host = other.host; - port = other.port; - path = other.path; - query = other.query; - ref = other.ref; - if (other.inner_parsed_) - set_inner_parsed(*other.inner_parsed_); - else - clear_inner_parsed(); - } - return *this; -} - -Parsed::~Parsed() { - delete inner_parsed_; -} - -int Parsed::Length() const { - if (ref.is_valid()) - return ref.end(); - return CountCharactersBefore(REF, false); -} - -int Parsed::CountCharactersBefore(ComponentType type, - bool include_delimiter) const { - if (type == SCHEME) - return 
scheme.begin; - - // There will be some characters after the scheme like "://" and we don't - // know how many. Search forwards for the next thing until we find one. - int cur = 0; - if (scheme.is_valid()) - cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. - - if (username.is_valid()) { - if (type <= USERNAME) - return username.begin; - cur = username.end() + 1; // Advance over the '@' or ':' at the end. - } - - if (password.is_valid()) { - if (type <= PASSWORD) - return password.begin; - cur = password.end() + 1; // Advance over the '@' at the end. - } - - if (host.is_valid()) { - if (type <= HOST) - return host.begin; - cur = host.end(); - } - - if (port.is_valid()) { - if (type < PORT || (type == PORT && include_delimiter)) - return port.begin - 1; // Back over delimiter. - if (type == PORT) - return port.begin; // Don't want delimiter counted. - cur = port.end(); - } - - if (path.is_valid()) { - if (type <= PATH) - return path.begin; - cur = path.end(); - } - - if (query.is_valid()) { - if (type < QUERY || (type == QUERY && include_delimiter)) - return query.begin - 1; // Back over delimiter. - if (type == QUERY) - return query.begin; // Don't want delimiter counted. - cur = query.end(); - } - - if (ref.is_valid()) { - if (type == REF && !include_delimiter) - return ref.begin; // Back over delimiter. - - // When there is a ref and we get here, the component we wanted was before - // this and not found, so we always know the beginning of the ref is right. - return ref.begin - 1; // Don't want delimiter counted. - } - - return cur; -} - -bool ExtractScheme(const char* url, int url_len, Component* scheme) { - return DoExtractScheme(url, url_len, scheme); -} - -bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) { - return DoExtractScheme(url, url_len, scheme); -} - -// This handles everything that may be an authority terminator, including -// backslash. For special backslash handling see DoParseAfterScheme. 
-bool IsAuthorityTerminator(base::char16 ch) { - return IsURLSlash(ch) || ch == '?' || ch == '#'; -} - -void ExtractFileName(const char* url, - const Component& path, - Component* file_name) { - DoExtractFileName(url, path, file_name); -} - -void ExtractFileName(const base::char16* url, - const Component& path, - Component* file_name) { - DoExtractFileName(url, path, file_name); -} - -bool ExtractQueryKeyValue(const char* url, - Component* query, - Component* key, - Component* value) { - return DoExtractQueryKeyValue(url, query, key, value); -} - -bool ExtractQueryKeyValue(const base::char16* url, - Component* query, - Component* key, - Component* value) { - return DoExtractQueryKeyValue(url, query, key, value); -} - -void ParseAuthority(const char* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num) { - DoParseAuthority(spec, auth, username, password, hostname, port_num); -} - -void ParseAuthority(const base::char16* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num) { - DoParseAuthority(spec, auth, username, password, hostname, port_num); -} - -int ParsePort(const char* url, const Component& port) { - return DoParsePort(url, port); -} - -int ParsePort(const base::char16* url, const Component& port) { - return DoParsePort(url, port); -} - -void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { - DoParseStandardURL(url, url_len, parsed); -} - -void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { - DoParseStandardURL(url, url_len, parsed); -} - -void ParsePathURL(const char* url, int url_len, Parsed* parsed) { - DoParsePathURL(url, url_len, parsed); -} - -void ParsePathURL(const base::char16* url, int url_len, Parsed* parsed) { - DoParsePathURL(url, url_len, parsed); -} - -void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { - DoParseFileSystemURL(url, url_len, 
parsed); -} - -void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) { - DoParseFileSystemURL(url, url_len, parsed); -} - -void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { - DoParseMailtoURL(url, url_len, parsed); -} - -void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) { - DoParseMailtoURL(url, url_len, parsed); -} - -void ParsePathInternal(const char* spec, - const Component& path, - Component* filepath, - Component* query, - Component* ref) { - ParsePath(spec, path, filepath, query, ref); -} - -void ParsePathInternal(const base::char16* spec, - const Component& path, - Component* filepath, - Component* query, - Component* ref) { - ParsePath(spec, path, filepath, query, ref); -} - -void ParseAfterScheme(const char* spec, - int spec_len, - int after_scheme, - Parsed* parsed) { - DoParseAfterScheme(spec, spec_len, after_scheme, parsed); -} - -void ParseAfterScheme(const base::char16* spec, - int spec_len, - int after_scheme, - Parsed* parsed) { - DoParseAfterScheme(spec, spec_len, after_scheme, parsed); -} - -} // namespace url_parse diff --git a/url/url_parse.h b/url/url_parse.h index 21033dd..3b9c546 100644 --- a/url/url_parse.h +++ b/url/url_parse.h @@ -5,357 +5,7 @@ #ifndef URL_URL_PARSE_H_ #define URL_URL_PARSE_H_ -#include - -#include "base/basictypes.h" -#include "base/strings/string16.h" -#include "url/url_export.h" - -namespace url_parse { - -// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and -// KURLGoogle.cpp still rely on this type. -typedef base::char16 UTF16Char; - -// Component ------------------------------------------------------------------ - -// Represents a substring for URL parsing. -struct Component { - Component() : begin(0), len(-1) {} - - // Normal constructor: takes an offset and a length. - Component(int b, int l) : begin(b), len(l) {} - - int end() const { - return begin + len; - } - - // Returns true if this component is valid, meaning the length is given. 
Even - // valid components may be empty to record the fact that they exist. - bool is_valid() const { - return (len != -1); - } - - // Returns true if the given component is specified on false, the component - // is either empty or invalid. - bool is_nonempty() const { - return (len > 0); - } - - void reset() { - begin = 0; - len = -1; - } - - bool operator==(const Component& other) const { - return begin == other.begin && len == other.len; - } - - int begin; // Byte offset in the string of this component. - int len; // Will be -1 if the component is unspecified. -}; - -// Helper that returns a component created with the given begin and ending -// points. The ending point is non-inclusive. -inline Component MakeRange(int begin, int end) { - return Component(begin, end - begin); -} - -// Parsed --------------------------------------------------------------------- - -// A structure that holds the identified parts of an input URL. This structure -// does NOT store the URL itself. The caller will have to store the URL text -// and its corresponding Parsed structure separately. -// -// Typical usage would be: -// -// url_parse::Parsed parsed; -// url_parse::Component scheme; -// if (!url_parse::ExtractScheme(url, url_len, &scheme)) -// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; -// -// if (IsStandardScheme(url, scheme)) // Not provided by this component -// url_parseParseStandardURL(url, url_len, &parsed); -// else if (IsFileURL(url, scheme)) // Not provided by this component -// url_parse::ParseFileURL(url, url_len, &parsed); -// else -// url_parse::ParsePathURL(url, url_len, &parsed); -// -struct URL_EXPORT Parsed { - // Identifies different components. - enum ComponentType { - SCHEME, - USERNAME, - PASSWORD, - HOST, - PORT, - PATH, - QUERY, - REF, - }; - - // The default constructor is sufficient for the components, but inner_parsed_ - // requires special handling. 
- Parsed(); - Parsed(const Parsed&); - Parsed& operator=(const Parsed&); - ~Parsed(); - - // Returns the length of the URL (the end of the last component). - // - // Note that for some invalid, non-canonical URLs, this may not be the length - // of the string. For example "http://": the parsed structure will only - // contain an entry for the four-character scheme, and it doesn't know about - // the "://". For all other last-components, it will return the real length. - int Length() const; - - // Returns the number of characters before the given component if it exists, - // or where the component would be if it did exist. This will return the - // string length if the component would be appended to the end. - // - // Note that this can get a little funny for the port, query, and ref - // components which have a delimiter that is not counted as part of the - // component. The |include_delimiter| flag controls if you want this counted - // as part of the component or not when the component exists. - // - // This example shows the difference between the two flags for two of these - // delimited components that is present (the port and query) and one that - // isn't (the reference). The components that this flag affects are marked - // with a *. - // 0 1 2 - // 012345678901234567890 - // Example input: http://foo:80/?query - // include_delim=true, ...=false ("<-" indicates different) - // SCHEME: 0 0 - // USERNAME: 5 5 - // PASSWORD: 5 5 - // HOST: 7 7 - // *PORT: 10 11 <- - // PATH: 13 13 - // *QUERY: 14 15 <- - // *REF: 20 20 - // - int CountCharactersBefore(ComponentType type, bool include_delimiter) const; - - // Scheme without the colon: "http://foo"/ would have a scheme of "http". - // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there - // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed - // to start at the beginning of the string if there are preceeding whitespace - // or control characters. 
- Component scheme; - - // Username. Specified in URLs with an @ sign before the host. See |password| - Component username; - - // Password. The length will be -1 if unspecified, 0 if specified but empty. - // Not all URLs with a username have a password, as in "http://me@host/". - // The password is separated form the username with a colon, as in - // "http://me:secret@host/" - Component password; - - // Host name. - Component host; - - // Port number. - Component port; - - // Path, this is everything following the host name. Length will be -1 if - // unspecified. This includes the preceeding slash, so the path on - // http://www.google.com/asdf" is "/asdf". As a result, it is impossible to - // have a 0 length path, it will be -1 in cases like "http://host?foo". - // Note that we treat backslashes the same as slashes. - Component path; - - // Stuff between the ? and the # after the path. This does not include the - // preceeding ? character. Length will be -1 if unspecified, 0 if there is - // a question mark but no query string. - Component query; - - // Indicated by a #, this is everything following the hash sign (not - // including it). If there are multiple hash signs, we'll use the last one. - // Length will be -1 if there is no hash sign, or 0 if there is one but - // nothing follows it. - Component ref; - - // This is used for nested URL types, currently only filesystem. If you - // parse a filesystem URL, the resulting Parsed will have a nested - // inner_parsed_ to hold the parsed inner URL's component information. - // For all other url types [including the inner URL], it will be NULL. 
- Parsed* inner_parsed() const { - return inner_parsed_; - } - - void set_inner_parsed(const Parsed& inner_parsed) { - if (!inner_parsed_) - inner_parsed_ = new Parsed(inner_parsed); - else - *inner_parsed_ = inner_parsed; - } - - void clear_inner_parsed() { - if (inner_parsed_) { - delete inner_parsed_; - inner_parsed_ = NULL; - } - } - - private: - Parsed* inner_parsed_; // This object is owned and managed by this struct. -}; - -// Initialization functions --------------------------------------------------- -// -// These functions parse the given URL, filling in all of the structure's -// components. These functions can not fail, they will always do their best -// at interpreting the input given. -// -// The string length of the URL MUST be specified, we do not check for NULLs -// at any point in the process, and will actually handle embedded NULLs. -// -// IMPORTANT: These functions do NOT hang on to the given pointer or copy it -// in any way. See the comment above the struct. -// -// The 8-bit versions require UTF-8 encoding. - -// StandardURL is for when the scheme is known to be one that has an -// authority (host) like "http". This function will not handle weird ones -// like "about:" and "javascript:", or do the right thing for "file:" URLs. -URL_EXPORT void ParseStandardURL(const char* url, - int url_len, - Parsed* parsed); -URL_EXPORT void ParseStandardURL(const base::char16* url, - int url_len, - Parsed* parsed); - -// PathURL is for when the scheme is known not to have an authority (host) -// section but that aren't file URLs either. The scheme is parsed, and -// everything after the scheme is considered as the path. This is used for -// things like "about:" and "javascript:" -URL_EXPORT void ParsePathURL(const char* url, int url_len, Parsed* parsed); -URL_EXPORT void ParsePathURL(const base::char16* url, - int url_len, - Parsed* parsed); - -// FileURL is for file URLs. There are some special rules for interpreting -// these. 
-URL_EXPORT void ParseFileURL(const char* url, int url_len, Parsed* parsed); -URL_EXPORT void ParseFileURL(const base::char16* url, - int url_len, - Parsed* parsed); - -// Filesystem URLs are structured differently than other URLs. -URL_EXPORT void ParseFileSystemURL(const char* url, - int url_len, - Parsed* parsed); -URL_EXPORT void ParseFileSystemURL(const base::char16* url, - int url_len, - Parsed* parsed); - -// MailtoURL is for mailto: urls. They are made up scheme,path,query -URL_EXPORT void ParseMailtoURL(const char* url, int url_len, Parsed* parsed); -URL_EXPORT void ParseMailtoURL(const base::char16* url, - int url_len, - Parsed* parsed); - -// Helper functions ----------------------------------------------------------- - -// Locates the scheme according to the URL parser's rules. This function is -// designed so the caller can find the scheme and call the correct Init* -// function according to their known scheme types. -// -// It also does not perform any validation on the scheme. -// -// This function will return true if the scheme is found and will put the -// scheme's range into *scheme. False means no scheme could be found. Note -// that a URL beginning with a colon has a scheme, but it is empty, so this -// function will return true but *scheme will = (0,0). -// -// The scheme is found by skipping spaces and control characters at the -// beginning, and taking everything from there to the first colon to be the -// scheme. The character at scheme.end() will be the colon (we may enhance -// this to handle full width colons or something, so don't count on the -// actual character value). The character at scheme.end()+1 will be the -// beginning of the rest of the URL, be it the authority or the path (or the -// end of the string). -// -// The 8-bit version requires UTF-8 encoding. 
-URL_EXPORT bool ExtractScheme(const char* url, - int url_len, - Component* scheme); -URL_EXPORT bool ExtractScheme(const base::char16* url, - int url_len, - Component* scheme); - -// Returns true if ch is a character that terminates the authority segment -// of a URL. -URL_EXPORT bool IsAuthorityTerminator(base::char16 ch); - -// Does a best effort parse of input |spec|, in range |auth|. If a particular -// component is not found, it will be set to invalid. -URL_EXPORT void ParseAuthority(const char* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num); -URL_EXPORT void ParseAuthority(const base::char16* spec, - const Component& auth, - Component* username, - Component* password, - Component* hostname, - Component* port_num); - -// Computes the integer port value from the given port component. The port -// component should have been identified by one of the init functions on -// |Parsed| for the given input url. -// -// The return value will be a positive integer between 0 and 64K, or one of -// the two special values below. -enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 }; -URL_EXPORT int ParsePort(const char* url, const Component& port); -URL_EXPORT int ParsePort(const base::char16* url, const Component& port); - -// Extracts the range of the file name in the given url. The path must -// already have been computed by the parse function, and the matching URL -// and extracted path are provided to this function. The filename is -// defined as being everything from the last slash/backslash of the path -// to the end of the path. -// -// The file name will be empty if the path is empty or there is nothing -// following the last slash. -// -// The 8-bit version requires UTF-8 encoding. 
-URL_EXPORT void ExtractFileName(const char* url, - const Component& path, - Component* file_name); -URL_EXPORT void ExtractFileName(const base::char16* url, - const Component& path, - Component* file_name); - -// Extract the first key/value from the range defined by |*query|. Updates -// |*query| to start at the end of the extracted key/value pair. This is -// designed for use in a loop: you can keep calling it with the same query -// object and it will iterate over all items in the query. -// -// Some key/value pairs may have the key, the value, or both be empty (for -// example, the query string "?&"). These will be returned. Note that an empty -// last parameter "foo.com?" or foo.com?a&" will not be returned, this case -// is the same as "done." -// -// The initial query component should not include the '?' (this is the default -// for parsed URLs). -// -// If no key/value are found |*key| and |*value| will be unchanged and it will -// return false. -URL_EXPORT bool ExtractQueryKeyValue(const char* url, - Component* query, - Component* key, - Component* value); -URL_EXPORT bool ExtractQueryKeyValue(const base::char16* url, - Component* query, - Component* key, - Component* value); - -} // namespace url_parse +// TODO(tfarina): Remove this file when the callers are updated. +#include "url/third_party/mozilla/url_parse.h" #endif // URL_URL_PARSE_H_ -- cgit v1.1