diff options
Diffstat (limited to 'googleurl')
52 files changed, 14496 insertions, 0 deletions
diff --git a/googleurl/LICENSE.txt b/googleurl/LICENSE.txt new file mode 100644 index 0000000..ac40837 --- /dev/null +++ b/googleurl/LICENSE.txt @@ -0,0 +1,65 @@ +Copyright 2007, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + +The file url_parse.cc is based on nsURLParsers.cc from Mozilla. 
This file is +licensed separately as follows: + +The contents of this file are subject to the Mozilla Public License Version +1.1 (the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at +http://www.mozilla.org/MPL/ + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +for the specific language governing rights and limitations under the +License. + +The Original Code is mozilla.org code. + +The Initial Developer of the Original Code is +Netscape Communications Corporation. +Portions created by the Initial Developer are Copyright (C) 1998 +the Initial Developer. All Rights Reserved. + +Contributor(s): + Darin Fisher (original author) + +Alternatively, the contents of this file may be used under the terms of +either the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +in which case the provisions of the GPL or the LGPL are applicable instead +of those above. If you wish to allow use of your version of this file only +under the terms of either the GPL or the LGPL, and not to allow others to +use your version of this file under the terms of the MPL, indicate your +decision by deleting the provisions above and replace them with the notice +and other provisions required by the GPL or the LGPL. If you do not delete +the provisions above, a recipient may use your version of this file under +the terms of any one of the MPL, the GPL or the LGPL. diff --git a/googleurl/PRESUBMIT.py b/googleurl/PRESUBMIT.py new file mode 100644 index 0000000..6cfbe74 --- /dev/null +++ b/googleurl/PRESUBMIT.py @@ -0,0 +1,108 @@ +#!/usr/bin/python +# Copyright (c) 2009 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ +"""Top-level presubmit script for googleurl. + +See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts for +details on the presubmit API built into gcl. +""" + +# Files with these extensions will be considered source files +SOURCE_FILE_EXTENSIONS = [ + '.c', '.cc', '.cpp', '.h', '.m', '.mm', '.py', '.mk', '.am', '.json', +] +EXCLUDED_PATHS = [ + r".*third_party[\\\/].*", +] + +def ReadFile(path): + """Given a path, returns the full contents of the file. + + Reads files in binary format. + """ + fo = open(path, 'rb') + try: + contents = fo.read() + finally: + fo.close() + return contents + + +def CheckChangeOnUpload(input_api, output_api): + # TODO(brettw) Enforce 80 cols. + return LocalChecks(input_api, output_api, max_cols=0) + + +def CheckChangeOnCommit(input_api, output_api): + # TODO(brettw) Enforce 80 cols. + return (LocalChecks(input_api, output_api, max_cols=0) + + input_api.canned_checks.CheckDoNotSubmit(input_api, output_api)) + + +def LocalChecks(input_api, output_api, max_cols=80): + """Reports an error if for any source file in SOURCE_FILE_EXTENSIONS: + - uses CR (or CRLF) + - contains a TAB + - has a line that ends with whitespace + - contains a line >|max_cols| cols unless |max_cols| is 0. + + Note that the whole file is checked, not only the changes. + """ + cr_files = [] + results = [] + excluded_paths = [input_api.re.compile(x) for x in EXCLUDED_PATHS] + files = input_api.AffectedFiles() + for f in files: + path = f.LocalPath() + root, ext = input_api.os_path.splitext(path) + # Look for unsupported extensions. + if not ext in SOURCE_FILE_EXTENSIONS: + continue + # Look for excluded paths. + found = False + for item in excluded_paths: + if item.match(path): + found = True + break + if found: + continue + + # Need to read the file ourselves since AffectedFile.NewContents() + # will normalize line endings. + contents = ReadFile(path) + if '\r' in contents: + cr_files.append(path) + + local_errors = [] + # Remove EOL character. 
+ lines = contents.splitlines() + line_num = 1 + for line in lines: + if line.endswith(' '): + local_errors.append(output_api.PresubmitError( + '%s, line %s ends with whitespaces.' % + (path, line_num))) + # Accept lines with http:// to exceed the max_cols rule. + if max_cols and len(line) > max_cols and not 'http://' in line: + local_errors.append(output_api.PresubmitError( + '%s, line %s has %s chars, please reduce to %d chars.' % + (path, line_num, len(line), max_cols))) + if '\t' in line: + local_errors.append(output_api.PresubmitError( + "%s, line %s contains a tab character." % + (path, line_num))) + line_num += 1 + # Just show the first 5 errors. + if len(local_errors) == 6: + local_errors.pop() + local_errors.append(output_api.PresubmitError("... and more.")) + break + results.extend(local_errors) + + if cr_files: + results.append(output_api.PresubmitError( + 'Found CR (or CRLF) line ending in these files, please use only LF:', + items=cr_files)) + return results diff --git a/googleurl/PRESUBMIT_unittest.py b/googleurl/PRESUBMIT_unittest.py new file mode 100644 index 0000000..19231db --- /dev/null +++ b/googleurl/PRESUBMIT_unittest.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +# Copyright (c) 2009 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +"""Unit tests for top-level Chromium presubmit script. 
+""" + + +import os +import PRESUBMIT +import re +import unittest + + +class MockInputApi(object): + def __init__(self): + self.affected_files = [] + self.re = re + self.os_path = os.path + + def AffectedFiles(self): + return self.affected_files + + def AffectedTextFiles(self, include_deletes=True): + return self.affected_files + + +class MockAffectedFile(object): + def __init__(self, path): + self.path = path + + def LocalPath(self): + return self.path + + +class MockOutputApi(object): + class PresubmitError(object): + def __init__(self, msg, items=[], long_text=''): + self.msg = msg + self.items = items + + +class PresubmitUnittest(unittest.TestCase): + def setUp(self): + self.file_contents = '' + def MockReadFile(path): + self.failIf(path.endswith('notsource')) + return self.file_contents + self._ReadFile = PRESUBMIT.ReadFile + PRESUBMIT.ReadFile = MockReadFile + + def tearDown(self): + PRESUBMIT.ReadFile = self._ReadFile + + def testLocalChecks(self): + api = MockInputApi() + api.affected_files = [ + MockAffectedFile('foo/blat/yoo.notsource'), + MockAffectedFile('third_party/blat/source.cc'), + MockAffectedFile('foo/blat/source.h'), + MockAffectedFile('foo/blat/source.mm'), + MockAffectedFile('foo/blat/source.py'), + ] + self.file_contents = 'file with \n\terror\nhere\r\nyes there' + # 3 source files, 2 errors by file + 1 global CR error. + self.failUnless(len(PRESUBMIT.LocalChecks(api, MockOutputApi)) == 7) + + self.file_contents = 'file\twith\ttabs' + # 3 source files, 1 error by file. + self.failUnless(len(PRESUBMIT.LocalChecks(api, MockOutputApi)) == 3) + + self.file_contents = 'file\rusing\rCRs' + # One global CR error. + self.failUnless(len(PRESUBMIT.LocalChecks(api, MockOutputApi)) == 1) + self.failUnless( + len(PRESUBMIT.LocalChecks(api, MockOutputApi)[0].items) == 3) + + self.file_contents = 'both\ttabs and\r\nCRLF' + # 3 source files, 1 error by file + 1 global CR error. 
+ self.failUnless(len(PRESUBMIT.LocalChecks(api, MockOutputApi)) == 4) + + self.file_contents = 'file with\nzero \\t errors \\r\\n' + self.failIf(PRESUBMIT.LocalChecks(api, MockOutputApi)) + + +if __name__ == '__main__': + unittest.main() diff --git a/googleurl/README.txt b/googleurl/README.txt new file mode 100644 index 0000000..b28fd04 --- /dev/null +++ b/googleurl/README.txt @@ -0,0 +1,180 @@ + ============================== + The Google URL Parsing Library + ============================== + +This is the Google URL Parsing Library which parses and canonicalizes URLs. +Please see the LICENSE.txt file for licensing information. + +Features +======== + + * Easily embeddable: This library was written with a variety of client and + server programs in mind, so unlike most implementations of URL parsing + and canonicalization, it can be easily embedded. + + * Fast: hundreds of thousands of typical URLs can be parsed and + canonicalized per second on a modern CPU. It is much faster than, for + example, calling WinInet's corresponding functions. + + * Compatible: When possible, this library strives for IE7 compatibility, + both for general web compatibility and so that IE add-ons or other + applications that communicate with or embed IE will work properly. + + It supports Unix-style file URLs, as well as the more complex rules for + Windows file URLs. Note that total compatibility is not possible (for + example, IE6 and IE7 disagree about how to parse certain IP addresses), + and that this library is more strict about certain illegal, rarely used, and + potentially dangerous constructs such as escaped control characters in + host names that IE will allow. It is typically a little less strict than + Firefox. + + +Example +======= + +An example implementation of a URL object that uses this library is provided +in src/gurl.*. This implementation uses the "application integration" layer +discussed below to interface with the low-level parsing and canonicalization +functions. 
+ + +Building +======== + +The canonicalization files require ICU for some UTF-8 and UTF-16 conversion +macros. If your project does not use ICU, it should be straightforward to +factor out the macros and functions used from ICU; there are only a few well- +isolated things that are used. + +TODO(brettw) ADD INSTRUCTIONS FOR GETTING ICU HERE! + +logging.h and logging.cc are Windows-only because the corresponding Unix +logging system has many dependencies. This library uses few of the logging +macros, and a dummy header can easily be written that defines the +appropriate things for Unix. + + +Definitions +=========== + +"Standard URL": A URL with an "authority", which is a hostname and optionally + a port, username, and password. Most URLs, such as HTTP and FTP, are standard. + +"File URL": A URL that references a file on disk. There are special rules for + this type of URL. Note that it may have a hostname! "localhost" is allowed; + for example, "file://localhost/foo" is the same as "file:///foo". + +"Path URL": This is everything else. There is no standard on how to treat these + URLs, or even what they are called. This library decomposes them into a + scheme and a path. The path is everything following the scheme. This type of + URL includes "javascript", "data", and even "mailto" (although "mailto" + might look like a standard scheme in some respects, it is not). + + +Design +====== + +The library is divided into four layers. They are listed here from the lowest +to the highest; you can use any portion of the library as long as you embed the +layers below it. + +1. Parsing +---------- +At the lowest level is the parsing code. The files encompassing this are +url_parse.* and the main include file is src/url_parse.h. This code will, given +an input string, parse it into the most likely form of a URL. + +Parsing cannot fail and does no validation. The exception is the port number, +which it currently validates, but this is a bug. 
Given crazy input, the parser +will do its best to find the various URL components according to its rules (see +url_parse_unittest.cc for some examples). + +To use this, an application will typically use ExtractScheme to determine the +type of a given input URL, and then call one of the initialization functions: +"ParseStandardURL", "ParsePathURL", or "ParseFileURL". This will result in +a "Parsed" structure which identifies the substrings of each identified +component. + +2. Canonicalization +------------------- +At the next highest level is canonicalization. The files encompassing this are +url_canon.* and the main include file is src/url_canon.h. This code will +validate an already-parsed URL, and will convert it to a canonical form. For +example, this will convert host names to lowercase, convert IP addresses +into dotted-decimal notation, handle encoding issues, etc. + +This layer will always do its best to produce a reasonable output string, but +it may return that the string is invalid. For example, if there are invalid +characters in the host name, it will escape them or replace them with the +Unicode "invalid character" character, but the call will still report failure. +This way, the program can display error messages to the user with the output, +log it, etc. and the string will have some meaning. + +Canonicalized output is written to a CanonOutput object which is a simple +wrapper around an expanding buffer. An implementation called RawCanonOutput is +provided that writes to a raw buffer with a fixed amount statically allocated +(for performance). Applications using STL can use StdStringCanonOutput defined +in url_canon_stdstring.h which writes into a std::string. + +A normal application would call one of the three high-level functions +"CanonicalizeStandardURL", "CanonicalizeFileURL", and "CanonicalizePathURL" +depending on the type of URL in question. Lower-level functions are also +provided which will canonicalize individual parts of a URL (for example, +"CanonicalizeHost"). 
+ +Part of this layer is the integration with the host system for IDN and encoding +conversion. An implementation that provides integration with ICU +(http://www-306.ibm.com/software/globalization/icu/index.jsp) is provided in +src/url_canon_icu.cc. The embedder may wish to replace this file with +implementations of the functions for their own IDN library if they do not use +ICU. + +3. Application integration +-------------------------- +The canonicalization and parsing layers do not know anything about the URI +schemes supported by your application. The parsing and canonicalization +functions are very low-level, and you must call the correct function to do the +work (for example, "CanonicalizeFileURL"). + +The application integration in url_util.* provides wrappers around the +low-level parsing and canonicalization to call the correct versions for +different identified schemes. Embedders will want to modify this file if +necessary to suit the needs of their application. + +4. URL object +------------- +The highest level is the "URL" object that a C++ application would use +to encapsulate a URL. Embedders will typically want to provide their own URL +object that meets the requirements of their system. A reasonably complete +example implementation is provided in src/gurl.*. You may wish to use this +object, extend or modify it, or write your own. + +Whitespace +---------- +Sometimes, you may want to remove linefeeds and tabs from the content of a URL. +Some web pages, for example, expect that a URL spanning two lines should be +treated as one with the newline removed. Depending on the source of the URLs +you are canonicalizing, these newlines may or may not be trimmed off. + +If you want this behavior, call RemoveURLWhitespace before parsing. This will +remove CR, LF and TAB from the input. Note that it preserves spaces. On typical +URLs, this function produces a 10-15% speed reduction, so it is optional and +not done automatically. 
The example GURL object and the url_util wrapper does +this for you. + +Tests +===== + +There are a number of *_unittest.cc and *_perftest.cc files. These files are +not currently compilable as they rely on a not-included unit testing framework +Tests are declared like this: + TEST(TestCaseName, TestName) { + ASSERT_TRUE(a); + EXPECT_EQ(a, b); + } +If you would like to compile them, it should be straightforward to define +the TEST macro (which would declare a function by combining the two arguments) +and the other macros whose behavior should be self-explanatory (EXPECT is like +an ASSERT, but does not stop the test, if you are doing this, you probably +don't care about this difference). Then you would define a .cc file that +calls all of these functions. diff --git a/googleurl/base/README.txt b/googleurl/base/README.txt new file mode 100644 index 0000000..311faa0 --- /dev/null +++ b/googleurl/base/README.txt @@ -0,0 +1,2 @@ +These files contain some shared code. You can define your own assertion macros +to eliminate the dependency on logging.h. diff --git a/googleurl/base/basictypes.h b/googleurl/base/basictypes.h new file mode 100644 index 0000000..b0c404d --- /dev/null +++ b/googleurl/base/basictypes.h @@ -0,0 +1,88 @@ +// Copyright 2001 - 2003 Google Inc. All Rights Reserved + +#ifndef BASE_BASICTYPES_H__ +#define BASE_BASICTYPES_H__ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +const uint8 kuint8max = (( uint8) 0xFF); +const uint32 kuint32max = ((uint32) 0xFFFFFFFF); + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that arraysize() doesn't accept any array of an +// anonymous type or a type defined inside a function. 
In these rare +// cases, you have to use the ARRAYSIZE_UNSAFE() macro below. This is +// due to a limitation in C++'s template system. The limitation might +// eventually be removed, but it hasn't happened yet. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template <typename T, size_t N> +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef _MSC_VER +template <typename T, size_t N> +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// ARRAYSIZE_UNSAFE performs essentially the same calculation as arraysize, +// but can be used on anonymous types or types defined inside +// functions. It's less safe than arraysize as it accepts some +// (although not all) pointers. Therefore, you should use arraysize +// whenever possible. +// +// The expression ARRAYSIZE_UNSAFE(a) is a compile-time constant of type +// size_t. +// +// ARRAYSIZE_UNSAFE catches a few type errors. If you see a compiler error +// +// "warning: division by zero in ..." +// +// when using ARRAYSIZE_UNSAFE, you are (wrongfully) giving it a pointer. +// You should only use ARRAYSIZE_UNSAFE on statically allocated arrays. +// +// The following comments are on the implementation details, and can +// be ignored by the users. +// +// ARRAYSIZE_UNSAFE(arr) works by inspecting sizeof(arr) (the # of bytes in +// the array) and sizeof(*(arr)) (the # of bytes in one array +// element). If the former is divisible by the latter, perhaps arr is +// indeed an array, in which case the division result is the # of +// elements in the array. Otherwise, arr cannot possibly be an array, +// and we generate a compiler error to prevent the code from +// compiling. 
+// +// Since the size of bool is implementation-defined, we need to cast +// !(sizeof(a) % sizeof(*(a))) to size_t in order to ensure the final +// result has type size_t. +// +// This macro is not perfect as it wrongfully accepts certain +// pointers, namely where the pointer size is divisible by the pointee +// size. Since all our code has to go through a 32-bit compiler, +// where a pointer is 4 bytes, this means all pointers to a type whose +// size is 3 or greater than 4 will be (righteously) rejected. +// +// Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE. +#define ARRAYSIZE_UNSAFE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast<size_t>(!(sizeof(a) % sizeof(*(a))))) + +// A macro to disallow the evil copy constructor and operator= functions +// This should be used in the private: declarations for a class +#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) + +#endif // BASE_BASICTYPES_H__ diff --git a/googleurl/base/logging.cc b/googleurl/base/logging.cc new file mode 100644 index 0000000..ab03150 --- /dev/null +++ b/googleurl/base/logging.cc @@ -0,0 +1,380 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <ctime> +#include <iomanip> +#include <cstring> +#include <windows.h> +#include <tchar.h> +#include <algorithm> +#include "base/logging.h" + +namespace logging { + +const char* const log_severity_names[LOG_NUM_SEVERITIES] = { + "INFO", "WARNING", "ERROR", "FATAL" }; + +int min_log_level = 0; +LogLockingState lock_log_file = LOCK_LOG_FILE; +LoggingDestination logging_destination = LOG_ONLY_TO_FILE; + +const int kMaxFilteredLogLevel = LOG_WARNING; +char* log_filter_prefix = NULL; + +// which log file to use? This is initialized by InitLogging or +// will be lazily initialized to the default value when it is +// first needed. +TCHAR log_file_name[MAX_PATH] = { 0 }; + +// this file is lazily opened and the handle may be NULL +HANDLE log_file = NULL; + +// what should be prepended to each message? +bool log_process_id = false; +bool log_thread_id = false; +bool log_timestamp = true; +bool log_tickcount = false; + +// An assert handler override specified by the client to be called instead of +// the debug message dialog. +LogAssertHandlerFunction log_assert_handler = NULL; + +// The critical section is used if log file locking is false. 
It helps us +// avoid problems with multiple threads writing to the log file at the same +// time. +bool initialized_critical_section = false; +CRITICAL_SECTION log_critical_section; + +// When we don't use a critical section, we are using a global mutex. We +// need to do this because LockFileEx is not thread safe +HANDLE log_mutex = NULL; + +// Called by logging functions to ensure that log_file is open +// and can be used for writing. If no log file name has been set yet, the +// name defaults to "debug.log" next to the module path returned by +// GetModuleFileName; if the named file cannot be opened, ".\debug.log" is +// tried instead. The file pointer is moved to the end of the file after +// opening. Returns false if the file could not be +// opened. log_file will be NULL in this case. +bool InitializeLogFileHandle() { + if (log_file) + return true; + + if (!log_file_name[0]) { + // nobody has called InitLogging to specify a debug log file, so here we + // initialize the log file name to the default + GetModuleFileName(NULL, log_file_name, MAX_PATH); + TCHAR* last_backslash = _tcsrchr(log_file_name, '\\'); + if (last_backslash) + last_backslash[1] = 0; // name now ends with the backslash + _tcscat_s(log_file_name, _T("debug.log")); + } + + log_file = CreateFile(log_file_name, GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (log_file == INVALID_HANDLE_VALUE || log_file == NULL) { + // try the current directory + log_file = CreateFile(_T(".\\debug.log"), GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (log_file == INVALID_HANDLE_VALUE || log_file == NULL) { + log_file = NULL; + return false; + } + } + SetFilePointer(log_file, 0, 0, FILE_END); + return true; +} + +void InitLogMutex() { + if (!log_mutex) { + // \ is not a legal character in mutex names so we replace \ with / + std::wstring safe_name(log_file_name); + std::replace(safe_name.begin(), safe_name.end(), '\\', '/'); + std::wstring t(L"Global\\"); + t.append(safe_name); + log_mutex = ::CreateMutex(NULL, FALSE, t.c_str()); + } +} + +void InitLogging(const TCHAR* new_log_file, LoggingDestination logging_dest, + LogLockingState lock_log, 
OldFileDeletionState delete_old) { + if (log_file) { + // calling InitLogging twice or after some log call has already opened the + // default log file will re-initialize to the new options + CloseHandle(log_file); + log_file = NULL; + } + + lock_log_file = lock_log; + logging_destination = logging_dest; + + // ignore file options if logging is only to system + if (logging_destination == LOG_ONLY_TO_SYSTEM_DEBUG_LOG) + return; + + _tcscpy_s(log_file_name, MAX_PATH, new_log_file); + if (delete_old == DELETE_OLD_LOG_FILE) + DeleteFile(log_file_name); + + if (lock_log_file == LOCK_LOG_FILE) { + InitLogMutex(); + } else if (!initialized_critical_section) { + // initialize the critical section + InitializeCriticalSection(&log_critical_section); + initialized_critical_section = true; + } + + InitializeLogFileHandle(); +} + +void SetMinLogLevel(int level) { + min_log_level = level; +} + +void SetLogFilterPrefix(char* filter) { + if (log_filter_prefix) { + delete[] log_filter_prefix; + log_filter_prefix = NULL; + } + + if (filter) { + size_t size = strlen(filter)+1; + log_filter_prefix = new char[size]; + strcpy_s(log_filter_prefix, size, filter); + } +} + +void SetLogItems(bool enable_process_id, bool enable_thread_id, + bool enable_timestamp, bool enable_tickcount) { + log_process_id = enable_process_id; + log_thread_id = enable_thread_id; + log_timestamp = enable_timestamp; + log_tickcount = enable_tickcount; +} + +void SetLogAssertHandler(LogAssertHandlerFunction handler) { + log_assert_handler = handler; +} + +// Displays a message box to the user with the error message in it. For +// Windows programs, it's possible that the message loop is messed up on +// a fatal error, and creating a MessageBox will cause that message loop +// to be run. Instead, we try to spawn another process that displays its +// command line. We look for "debug_message.exe" in the same directory as +// the application. If it exists, we use it, otherwise, we use a regular +// message box. 
+void DisplayDebugMessage(const std::string& str) { + if (str.empty()) + return; + + // look for the debug dialog program next to our application + wchar_t prog_name[MAX_PATH]; + GetModuleFileNameW(NULL, prog_name, MAX_PATH); + wchar_t* backslash = wcsrchr(prog_name, '\\'); + if (backslash) + backslash[1] = 0; + wcscat_s(prog_name, MAX_PATH, L"debug_message.exe"); + + // stupid CreateProcess requires a non-const command line and may modify it. + // We also want to use the wide string + int charcount = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, NULL, 0); + if (!charcount) + return; + scoped_array<wchar_t> cmdline(new wchar_t[charcount]); + if (!MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, cmdline.get(), charcount)) + return; + + STARTUPINFO startup_info; + memset(&startup_info, 0, sizeof(startup_info)); + startup_info.cb = sizeof(startup_info); + + PROCESS_INFORMATION process_info; + if (CreateProcessW(prog_name, cmdline.get(), NULL, NULL, false, 0, NULL, + NULL, &startup_info, &process_info)) { + WaitForSingleObject(process_info.hProcess, INFINITE); + CloseHandle(process_info.hThread); + CloseHandle(process_info.hProcess); + } else { + // debug process broken, let's just do a message box + MessageBoxW(NULL, cmdline.get(), L"Fatal error", MB_OK | MB_ICONHAND); + } +} + +LogMessage::LogMessage(const char* file, int line, LogSeverity severity, + int ctr) + : severity_(severity) { + Init(file, line); +} + +LogMessage::LogMessage(const char* file, int line, const CheckOpString& result) + : severity_(LOG_FATAL) { + Init(file, line); + stream_ << "Check failed: " << (*result.str_); +} + +LogMessage::LogMessage(const char* file, int line) + : severity_(LOG_INFO) { + Init(file, line); +} + +LogMessage::LogMessage(const char* file, int line, LogSeverity severity) + : severity_(severity) { + Init(file, line); +} + +// writes the common header info to the stream +void LogMessage::Init(const char* file, int line) { + // log only the filename + const char* last_slash = 
strrchr(file, '\\'); + if (last_slash) + file = last_slash + 1; + + stream_ << '['; + if (log_process_id) + stream_ << GetCurrentProcessId() << ':'; + if (log_thread_id) + stream_ << GetCurrentThreadId() << ':'; + if (log_timestamp) { + time_t t = time(NULL); + struct tm tm_time; + localtime_s(&tm_time, &t); + stream_ << std::setfill('0') + << std::setw(2) << 1 + tm_time.tm_mon + << std::setw(2) << tm_time.tm_mday + << '/' + << std::setw(2) << tm_time.tm_hour + << std::setw(2) << tm_time.tm_min + << std::setw(2) << tm_time.tm_sec + << ':'; + } + if (log_tickcount) + stream_ << GetTickCount() << ':'; + stream_ << log_severity_names[severity_] << ":" << file << "(" << line << ")] "; + + message_start_ = stream_.pcount(); +} + +LogMessage::~LogMessage() { + if (severity_ < min_log_level) + return; + + std::string str_newline(stream_.str(), stream_.pcount()); + str_newline.append("\r\n"); + + if (log_filter_prefix && severity_ <= kMaxFilteredLogLevel && + str_newline.compare(message_start_, strlen(log_filter_prefix), + log_filter_prefix) != 0) { + goto cleanup; + } + + if (logging_destination != LOG_ONLY_TO_FILE) + OutputDebugStringA(str_newline.c_str()); + + // write to log file + if (logging_destination != LOG_ONLY_TO_SYSTEM_DEBUG_LOG && + InitializeLogFileHandle()) { + // we can have multiple threads and/or processes, so try to prevent them from + // clobbering each other's writes + if (lock_log_file == LOCK_LOG_FILE) { + // Ensure that the mutex is initialized in case the client app did not + // call InitLogging. This is not thread safe. See below + InitLogMutex(); + + DWORD r = ::WaitForSingleObject(log_mutex, INFINITE); + DCHECK(r != WAIT_ABANDONED); + } else { + // use the critical section + if (!initialized_critical_section) { + // The client app did not call InitLogging, and so the critical section + // has not been created. 
We do this on demand, but if two threads try to + // do this at the same time, there will be a race condition to create + // the critical section. This is why InitLogging should be called from + // the main thread at the beginning of execution. + InitializeCriticalSection(&log_critical_section); + initialized_critical_section = true; + } + EnterCriticalSection(&log_critical_section); + } + + SetFilePointer(log_file, 0, 0, SEEK_END); + DWORD num_written; + WriteFile(log_file, (void*)str_newline.c_str(), (DWORD)str_newline.length(), &num_written, NULL); + + if (lock_log_file == LOCK_LOG_FILE) { + ReleaseMutex(log_mutex); + } else { + LeaveCriticalSection(&log_critical_section); + } + } + + if (severity_ == LOG_FATAL) { + // display a message or break into the debugger on a fatal error + if (::IsDebuggerPresent()) { + DebugBreak(); + } else { + if (log_assert_handler) { + log_assert_handler(std::string(stream_.str(), stream_.pcount())); + } else { + // don't use the string with the newline, get a fresh version to send to + // the debug message process + DisplayDebugMessage(std::string(stream_.str(), stream_.pcount())); + TerminateProcess(GetCurrentProcess(), 1); + } + } + } + +cleanup: + // Calling stream_.str() freezes the stream buffer. A frozen buffer will + // not be freed during strstreambuf destruction. 
+ stream_.freeze(false); +} + +void CloseLogFile() { + if (!log_file) + return; + + CloseHandle(log_file); + log_file = NULL; +} + +} // namespace logging + +std::ostream& operator<<(std::ostream& out, const wchar_t* wstr) { + if (!wstr || !wstr[0]) + return out; + + // compute the length of the buffer we'll need + int charcount = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, + NULL, 0, NULL, NULL); + if (charcount == 0) + return out; + + // convert + scoped_array<char> buf(new char[charcount]); + WideCharToMultiByte(CP_UTF8, 0, wstr, -1, buf.get(), charcount, NULL, NULL); + return out << buf.get(); +} diff --git a/googleurl/base/logging.h b/googleurl/base/logging.h new file mode 100644 index 0000000..5353b59 --- /dev/null +++ b/googleurl/base/logging.h @@ -0,0 +1,482 @@ +// Copyright 2006 Google Inc. All Rights Reserved. +// Author: brettw (Brett Wilson) + +#ifndef BASE_LOGGING_H__ +#define BASE_LOGGING_H__ + +#include <string> +#include <cstring> +#include <strstream> +#include <tchar.h> + +#include "base/basictypes.h" +#include "base/scoped_ptr.h" + +// Optional message capabilities +// ----------------------------- +// Assertion failed messages and fatal errors are displayed in a dialog box +// before the application exits. However, running this UI creates a message +// loop, which causes application messages to be processed and potentially +// dispatched to existing application windows. Since the application is in a +// bad state when this assertion dialog is displayed, these messages may not +// get processed and hang the dialog, or the application might go crazy. +// +// Therefore, it can be beneficial to display the error dialog in a separate +// process from the main application. When the logging system needs to display +// a fatal error dialog box, it will look for a program called +// "DebugMessage.exe" in the same directory as the application executable. 
It +// will run this application with the message as the command line, and will +// not include the name of the application as is traditional for easier +// parsing. +// +// The code for DebugMessage.exe is only one line. In WinMain, do: +// MessageBox(NULL, GetCommandLineW(), L"Fatal Error", 0); +// +// If DebugMessage.exe is not found, the logging code will use a normal +// MessageBox, potentially causing the problems discussed above. + + +// Instructions +// ------------ +// +// Make a bunch of macros for logging. The way to log things is to stream +// things to LOG(<a particular severity level>). E.g., +// +// LOG(INFO) << "Found " << num_cookies << " cookies"; +// +// You can also do conditional logging: +// +// LOG_IF(INFO, num_cookies > 10) << "Got lots of cookies"; +// +// The above will cause the message to be logged only when the condition +// (num_cookies > 10) is true; when it is false, nothing is logged and the +// streamed arguments are not evaluated. +// +// There are also "debug mode" logging macros like the ones above: +// +// DLOG(INFO) << "Found cookies"; +// +// DLOG_IF(INFO, num_cookies > 10) << "Got lots of cookies"; +// +// All "debug mode" logging is compiled away to nothing for non-debug mode +// compiles. LOG_IF and development flags also work well together +// because the code can be compiled away sometimes. +// +// We also have +// +// LOG_ASSERT(assertion); +// DLOG_ASSERT(assertion); +// +// which is syntactic sugar for {,D}LOG_IF(FATAL, assert fails) << assertion; +// +// We also override the standard 'assert' to use 'DLOG_ASSERT'. +// +// The supported severity levels for macros that allow you to specify one +// are (in increasing order of severity) INFO, WARNING, ERROR, and FATAL. +// +// There is also the special severity of DFATAL, which logs FATAL in +// debug mode, ERROR in normal mode. +// +// Very important: logging a message at the FATAL severity level causes +// the program to terminate (after the message is logged).
+ +namespace logging { + +// Where to record logging output? A flat file and/or system debug log via +// OutputDebugString. Defaults to LOG_ONLY_TO_FILE. +enum LoggingDestination { LOG_ONLY_TO_FILE, + LOG_ONLY_TO_SYSTEM_DEBUG_LOG, + LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG }; + +// Indicates that the log file should be locked when being written to. +// Often, there is no locking, which is fine for a single threaded program. +// If logging is being done from multiple threads or there can be more than +// one process doing the logging, the file should be locked during writes to +// make each log output atomic. Other writers will block. +// +// All processes writing to the log file must have their locking set for it to +// work properly. Defaults to DONT_LOCK_LOG_FILE. +enum LogLockingState { LOCK_LOG_FILE, DONT_LOCK_LOG_FILE }; + +// On startup, should we delete or append to an existing log file (if any)? +// Defaults to APPEND_TO_OLD_LOG_FILE. +enum OldFileDeletionState { DELETE_OLD_LOG_FILE, APPEND_TO_OLD_LOG_FILE }; + +// Sets the log file name and other global logging state. Calling this function +// is recommended, and is normally done at the beginning of application init. +// If you don't call it, all the flags will be initialized to their default +// values, and there is a race condition that may leak a critical section +// object if two threads try to do the first log at the same time. +// See the definition of the enums above for descriptions and default values. +// +// The default log file is initialized to "debug.log" in the application +// directory. You probably don't want this, especially since the program +// directory may not be writable on an enduser's system. +void InitLogging(const TCHAR* log_file, LoggingDestination logging_dest, + LogLockingState lock_log, OldFileDeletionState delete_old); + +// Sets the log level. Anything at or above this level will be written to the +// log file/displayed to the user (if applicable).
Anything below this level +// will be silently ignored. The log level defaults to 0 (everything is logged) +// if this function is not called. +void SetMinLogLevel(int level); + +// Sets the log filter prefix. Any log message below LOG_ERROR severity that +// doesn't start with this prefix will be silently ignored. The filter defaults +// to NULL (everything is logged) if this function is not called. Messages +// with severity of LOG_ERROR or higher will not be filtered. +void SetLogFilterPrefix(char* filter); + +// Sets the common items you want to be prepended to each log message. +// process and thread IDs default to off, the timestamp defaults to on. +// If this function is not called, logging defaults to writing the timestamp +// only. +void SetLogItems(bool enable_process_id, bool enable_thread_id, + bool enable_timestamp, bool enable_tickcount); + +// Sets the Log Assert Handler that will be used to notify of check failures. +// The default handler shows a dialog box, however clients can use this +// function to override with their own handling (e.g. a silent one for Unit +// Tests) +typedef void (*LogAssertHandlerFunction)(const std::string& str); +void SetLogAssertHandler(LogAssertHandlerFunction handler); + +typedef int LogSeverity; +const LogSeverity LOG_INFO = 0; +const LogSeverity LOG_WARNING = 1; +const LogSeverity LOG_ERROR = 2; +const LogSeverity LOG_FATAL = 3; +const LogSeverity LOG_NUM_SEVERITIES = 4; + +// LOG_DFATAL_LEVEL is LOG_FATAL in debug mode, ERROR in normal mode +#ifdef NDEBUG +const LogSeverity LOG_DFATAL_LEVEL = LOG_ERROR; +#else +const LogSeverity LOG_DFATAL_LEVEL = LOG_FATAL; +#endif + +// A few definitions of macros that don't generate much code. These are used +// by LOG() and LOG_IF, etc. Since these are used all over our code, it's +// better to have compact code for these operations.
+#define COMPACT_GOOGLE_LOG_INFO \ + logging::LogMessage(__FILE__, __LINE__) +#define COMPACT_GOOGLE_LOG_WARNING \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_WARNING) +#define COMPACT_GOOGLE_LOG_ERROR \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR) +#define COMPACT_GOOGLE_LOG_FATAL \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_FATAL) +#define COMPACT_GOOGLE_LOG_DFATAL \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_DFATAL_LEVEL) + +// wingdi.h defines ERROR to be 0. When we call LOG(ERROR), it gets +// substituted with 0, and it expands to COMPACT_GOOGLE_LOG_0. To allow us +// to keep using this syntax, we define this macro to do the same thing +// as COMPACT_GOOGLE_LOG_ERROR, and also define ERROR the same way that +// the Windows SDK does for consistency. +#define ERROR 0 +#define COMPACT_GOOGLE_LOG_0 \ + logging::LogMessage(__FILE__, __LINE__, logging::LOG_ERROR) + +// We use the preprocessor's merging operator, "##", so that, e.g., +// LOG(INFO) becomes the token COMPACT_GOOGLE_LOG_INFO. There's some funny +// subtle difference between ostream member streaming functions (e.g., +// ostream::operator<<(int) and ostream non-member streaming functions +// (e.g., ::operator<<(ostream&, string&): it turns out that it's +// impossible to stream something like a string directly to an unnamed +// ostream. We employ a neat hack by calling the stream() member +// function of LogMessage which seems to avoid the problem. + +#define LOG(severity) COMPACT_GOOGLE_LOG_ ## severity.stream() +#define SYSLOG(severity) LOG(severity) + +#define LOG_IF(severity, condition) \ + !(condition) ? (void) 0 : logging::LogMessageVoidify() & LOG(severity) +#define SYSLOG_IF(severity, condition) LOG_IF(severity, condition) + +#define LOG_ASSERT(condition) \ + LOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". " +#define SYSLOG_ASSERT(condition) \ + SYSLOG_IF(FATAL, !(condition)) << "Assert failed: " #condition ". 
" + +// A container for a string pointer which can be evaluated to a bool - +// true iff the pointer is non-NULL. +struct CheckOpString { + CheckOpString(std::string* str) : str_(str) { } + // No destructor: if str_ is non-NULL, we're about to LOG(FATAL), + // so there's no point in cleaning up str_. + operator bool() const { return str_ != NULL; } + std::string* str_; +}; + +// Build the error message string. This is separate from the "Impl" +// function template because it is not performance critical and so can +// be out of line, while the "Impl" code should be inline. +template<class t1, class t2> +std::string* MakeCheckOpString(const t1& v1, const t2& v2, const char* names) { + std::ostrstream ss; + ss << names << " (" << v1 << " vs. " << v2 << ")"; + return new std::string(ss.str(), ss.pcount()); +} + +extern std::string* MakeCheckOpStringIntInt(int v1, int v2, const char* names); + +template<int, int> +std::string* MakeCheckOpString(const int& v1, const int& v2, const char* names) { + return MakeCheckOpStringIntInt(v1, v2, names); +} + +// Plus some debug-logging macros that get compiled to nothing for production +// +// DEBUG_MODE is for uses like +// if (DEBUG_MODE) foo.CheckThatFoo(); +// instead of +// #ifndef NDEBUG +// foo.CheckThatFoo(); +// #endif + +#ifndef NDEBUG + +#define DLOG(severity) LOG(severity) +#define DLOG_IF(severity, condition) LOG_IF(severity, condition) +#define DLOG_ASSERT(condition) LOG_ASSERT(condition) + +// debug-only checking. not executed in NDEBUG mode. +enum { DEBUG_MODE = 1 }; +#define DCHECK(condition) \ + LOG_IF(FATAL, !(condition)) << "Check failed: " #condition ". " + +// Helper functions for DCHECK_OP macro. +// The (int, int) specialization works around the issue that the compiler +// will not instantiate the template version of the function on values of +// unnamed enum type - see comment below.
+#define DEFINE_DCHECK_OP_IMPL(name, op) \ + template <class t1, class t2> \ + inline std::string* Check##name##Impl(const t1& v1, const t2& v2, \ + const char* names) { \ + if (v1 op v2) return NULL; \ + else return MakeCheckOpString(v1, v2, names); \ + } \ + inline std::string* Check##name##Impl(int v1, int v2, const char* names) { \ + if (v1 op v2) return NULL; \ + else return MakeCheckOpString(v1, v2, names); \ + } +DEFINE_DCHECK_OP_IMPL(EQ, ==) +DEFINE_DCHECK_OP_IMPL(NE, !=) +DEFINE_DCHECK_OP_IMPL(LE, <=) +DEFINE_DCHECK_OP_IMPL(LT, < ) +DEFINE_DCHECK_OP_IMPL(GE, >=) +DEFINE_DCHECK_OP_IMPL(GT, > ) +#undef DEFINE_DCHECK_OP_IMPL + +// Helper macro for binary operators. +// Don't use this macro directly in your code, use CHECK_EQ et al below. +#define DCHECK_OP(name, op, val1, val2) \ + while (logging::CheckOpString _result = \ + logging::Check##name##Impl((val1), (val2), #val1 " " #op " " #val2)) \ + logging::LogMessage(__FILE__, __LINE__, _result).stream() + +// Equality/Inequality checks - compare two values, and log a LOG_FATAL message +// including the two values when the result is not as expected. The values +// must have operator<<(ostream, ...) defined. +// +// You may append to the error message like so: +// CHECK_NE(1, 2) << ": The world must be ending!"; +// +// We are very careful to ensure that each argument is evaluated exactly +// once, and that anything which is legal to pass as a function argument is +// legal here. In particular, the arguments may be temporary expressions +// which will end up being destroyed at the end of the apparent statement, +// for example: +// CHECK_EQ(string("abc")[1], 'b'); +// +// WARNING: These don't compile correctly if one of the arguments is a pointer +// and the other is NULL. To work around this, simply static_cast NULL to the +// type of the desired pointer. 
+ +#define DCHECK_EQ(val1, val2) DCHECK_OP(EQ, ==, val1, val2) +#define DCHECK_NE(val1, val2) DCHECK_OP(NE, !=, val1, val2) +#define DCHECK_LE(val1, val2) DCHECK_OP(LE, <=, val1, val2) +#define DCHECK_LT(val1, val2) DCHECK_OP(LT, < , val1, val2) +#define DCHECK_GE(val1, val2) DCHECK_OP(GE, >=, val1, val2) +#define DCHECK_GT(val1, val2) DCHECK_OP(GT, > , val1, val2) + +// Helper functions for string comparisons. +// To avoid bloat, the definitions are in logging.cc. +#define DECLARE_DCHECK_STROP_IMPL(func, expected) \ + std::string* Check##func##expected##Impl(const char* s1, \ + const char* s2, \ + const char* names); +DECLARE_DCHECK_STROP_IMPL(strcmp, true) +DECLARE_DCHECK_STROP_IMPL(strcmp, false) +DECLARE_DCHECK_STROP_IMPL(_stricmp, true) +DECLARE_DCHECK_STROP_IMPL(_stricmp, false) +#undef DECLARE_DCHECK_STROP_IMPL + +// Helper macro for string comparisons. +// Don't use this macro directly in your code, use DCHECK_STREQ et al below. +#define DCHECK_STROP(func, op, expected, s1, s2) \ + while (logging::CheckOpString _result = \ + logging::Check##func##expected##Impl((s1), (s2), \ + #s1 " " #op " " #s2)) \ + LOG(FATAL) << *_result.str_ + +// String (char*) equality/inequality checks. +// CASE versions are case-insensitive. +// +// Note that "s1" and "s2" may be temporary strings which are destroyed +// by the compiler at the end of the current "full expression" +// (e.g. DCHECK_STREQ(Foo().c_str(), Bar().c_str())). + +#define DCHECK_STREQ(s1, s2) DCHECK_STROP(strcmp, ==, true, s1, s2) +#define DCHECK_STRNE(s1, s2) DCHECK_STROP(strcmp, !=, false, s1, s2) +#define DCHECK_STRCASEEQ(s1, s2) DCHECK_STROP(_stricmp, ==, true, s1, s2) +#define DCHECK_STRCASENE(s1, s2) DCHECK_STROP(_stricmp, !=, false, s1, s2) + +#define DCHECK_INDEX(I,A) DCHECK(I < (sizeof(A)/sizeof(A[0]))) +#define DCHECK_BOUND(B,A) DCHECK(B <= (sizeof(A)/sizeof(A[0]))) + +#else // NDEBUG + +#define DLOG(severity) \ + true ?
(void) 0 : logging::LogMessageVoidify() & LOG(severity) + +#define DLOG_IF(severity, condition) \ + true ? (void) 0 : logging::LogMessageVoidify() & LOG(severity) + +#define DLOG_ASSERT(condition) \ + true ? (void) 0 : LOG_ASSERT(condition) + +enum { DEBUG_MODE = 0 }; + +// This macro can be followed by a sequence of stream parameters in +// non-debug mode. The DCHECK and friends macros use this so that +// the expanded expression DCHECK(foo) << "asdf" is still syntactically +// valid, even though the expression will get optimized away. +#define NDEBUG_EAT_STREAM_PARAMETERS \ + logging::LogMessage(__FILE__, __LINE__).stream() + +#define DCHECK(condition) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_EQ(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_NE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_LE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_LT(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_GE(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_GT(val1, val2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STREQ(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRCASEEQ(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRNE(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#define DCHECK_STRCASENE(str1, str2) \ + while (false) NDEBUG_EAT_STREAM_PARAMETERS + +#endif // NDEBUG + +#define NOTREACHED() DCHECK(false) + +// Redefine the standard assert to use our nice log files +#undef assert +#define assert(x) DLOG_ASSERT(x) + +// This class more or less represents a particular log message. You +// create an instance of LogMessage and then stream stuff to it. +// When you finish streaming to it, ~LogMessage is called and the +// full message gets streamed to the appropriate destination. 
+// +// You shouldn't actually use LogMessage's constructor to log things, +// though. You should use the LOG() macro (and variants thereof) +// above. +class LogMessage { + public: + LogMessage(const char* file, int line, LogSeverity severity, int ctr); + + // Two special constructors that generate reduced amounts of code at + // LOG call sites for common cases. + // + // Used for LOG(INFO): Implied are: + // severity = LOG_INFO, ctr = 0 + // + // Using this constructor instead of the more complex constructor above + // saves a couple of bytes per call site. + LogMessage(const char* file, int line); + + // Used for LOG(severity) where severity != INFO. Implied + // are: ctr = 0 + // + // Using this constructor instead of the more complex constructor above + // saves a couple of bytes per call site. + LogMessage(const char* file, int line, LogSeverity severity); + + // A special constructor used for check failures. + // Implied severity = LOG_FATAL + LogMessage(const char* file, int line, const CheckOpString& result); + + ~LogMessage(); + + std::ostream& stream() { return stream_; } + + private: + void Init(const char* file, int line); + + LogSeverity severity_; + std::ostrstream stream_; + int message_start_; // offset of the start of the message (past prefix info). + + DISALLOW_EVIL_CONSTRUCTORS(LogMessage); +}; + +// A non-macro interface to the log facility; (useful +// when the logging level is not a compile-time constant). +inline void LogAtLevel(int const log_level, std::string const &msg) { + LogMessage(__FILE__, __LINE__, log_level).stream() << msg; +} + +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". 
+class LogMessageVoidify { + public: + LogMessageVoidify() { } + // This has to be an operator with a precedence lower than << but + // higher than ?: + void operator&(std::ostream&) { } +}; + +// Closes the log file explicitly if open. +// NOTE: Since the log file is opened as necessary by the action of logging +// statements, there's no guarantee that it will stay closed +// after this call. +void CloseLogFile(); + +} // namespace logging + +// These functions are provided as a convenience for logging, which is where we +// use streams (it is against Google style to use streams in other places). It +// is designed to allow you to emit non-ASCII Unicode strings to the log file, +// which is normally ASCII. It is relatively slow, so try not to use it for +// common cases. Non-ASCII characters will be converted to UTF-8 by these operators. +std::ostream& operator<<(std::ostream& out, const wchar_t* wstr); +inline std::ostream& operator<<(std::ostream& out, const std::wstring& wstr) { + return out << wstr.c_str(); +} + +#endif // BASE_LOGGING_H__ diff --git a/googleurl/base/scoped_ptr.h b/googleurl/base/scoped_ptr.h new file mode 100644 index 0000000..de0b388 --- /dev/null +++ b/googleurl/base/scoped_ptr.h @@ -0,0 +1,322 @@ +#ifndef BASE_SCOPED_PTR_H +#define BASE_SCOPED_PTR_H + +// (C) Copyright Greg Colvin and Beman Dawes 1998, 1999. +// Copyright (c) 2001, 2002 Peter Dimov +// +// Permission to copy, use, modify, sell and distribute this software +// is granted provided this copyright notice appears in all copies. +// This software is provided "as is" without express or implied +// warranty, and with no claim as to its suitability for any purpose. +// +// See http://www.boost.org/libs/smart_ptr/scoped_ptr.htm for documentation. +// + +// scoped_ptr mimics a built-in pointer except that it guarantees deletion +// of the object pointed to, either on destruction of the scoped_ptr or via +// an explicit reset().
scoped_ptr is a simple solution for simple needs; +// use shared_ptr or std::auto_ptr if your needs are more complex. + +// *** NOTE *** +// If your scoped_ptr is a class member of class FOO pointing to a +// forward declared type BAR (as shown below), then you MUST use a non-inlined +// version of the destructor. The destructor of a scoped_ptr (called from +// FOO's destructor) must have a complete definition of BAR in order to +// destroy it. Example: +// +// -- foo.h -- +// class BAR; +// +// class FOO { +// public: +// FOO(); +// ~FOO(); // Required for sources that instantiate class FOO to compile! +// +// private: +// scoped_ptr<BAR> bar_; +// }; +// +// -- foo.cc -- +// #include "foo.h" +// FOO::~FOO() {} // Empty, but must be non-inlined to FOO's class definition. + +#include <cstddef> // for std::ptrdiff_t +#include <assert.h> // for assert +#include <stdlib.h> // for free() decl + +template <typename T> +class scoped_ptr { + private: + + T* ptr; + + scoped_ptr(scoped_ptr const &); + scoped_ptr & operator=(scoped_ptr const &); + + public: + + typedef T element_type; + + explicit scoped_ptr(T* p = 0): ptr(p) {} + + ~scoped_ptr() { + typedef char type_must_be_complete[sizeof(T)]; + delete ptr; + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + delete ptr; + ptr = p; + } + } + + T& operator*() const { + assert(ptr != 0); + return *ptr; + } + + T* operator->() const { + assert(ptr != 0); + return ptr; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_ptr & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_ptr should have its own object + template <typename U> bool operator==(scoped_ptr<U> const& p) const; + template <typename U> bool operator!=(scoped_ptr<U> 
const& p) const; +}; + +template<typename T> inline +void swap(scoped_ptr<T>& a, scoped_ptr<T>& b) { + a.swap(b); +} + +template<typename T> inline +bool operator==(T* p, const scoped_ptr<T>& b) { + return p == b.get(); +} + +template<typename T> inline +bool operator!=(T* p, const scoped_ptr<T>& b) { + return p != b.get(); +} + +// scoped_array extends scoped_ptr to arrays. Deletion of the array pointed to +// is guaranteed, either on destruction of the scoped_array or via an explicit +// reset(). Use shared_array or std::vector if your needs are more complex. + +template<typename T> +class scoped_array { + private: + + T* ptr; + + scoped_array(scoped_array const &); + scoped_array & operator=(scoped_array const &); + + public: + + typedef T element_type; + + explicit scoped_array(T* p = 0) : ptr(p) {} + + ~scoped_array() { + typedef char type_must_be_complete[sizeof(T)]; + delete[] ptr; + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + delete [] ptr; + ptr = p; + } + } + + T& operator[](std::ptrdiff_t i) const { + assert(ptr != 0); + assert(i >= 0); + return ptr[i]; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_array & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_array should have its own object + template <typename U> bool operator==(scoped_array<U> const& p) const; + template <typename U> bool operator!=(scoped_array<U> const& p) const; +}; + +template<class T> inline +void swap(::scoped_array<T>& a, ::scoped_array<T>& b) { + a.swap(b); +} + +template<typename T> inline +bool operator==(T* p, const ::scoped_array<T>& b) { + return p == b.get(); +} + +template<typename T> inline +bool operator!=(T* p, const ::scoped_array<T>& b) { + return p != b.get(); 
+} + + +// This class wraps the c library function free() in a class that can be +// passed as a template argument to scoped_ptr_malloc below. +class ScopedPtrMallocFree { + public: + inline void operator()(void* x) const { + free(x); + } +}; + +// scoped_ptr_malloc<> is similar to scoped_ptr<>, but it accepts a +// second template argument, the functor used to free the object. + +template<typename T, typename FreeProc = ScopedPtrMallocFree> +class scoped_ptr_malloc { + private: + + T* ptr; + + scoped_ptr_malloc(scoped_ptr_malloc const &); + scoped_ptr_malloc & operator=(scoped_ptr_malloc const &); + + public: + + typedef T element_type; + + explicit scoped_ptr_malloc(T* p = 0): ptr(p) {} + + ~scoped_ptr_malloc() { + typedef char type_must_be_complete[sizeof(T)]; + free_((void*) ptr); + } + + void reset(T* p = 0) { + typedef char type_must_be_complete[sizeof(T)]; + + if (ptr != p) { + free_((void*) ptr); + ptr = p; + } + } + + T& operator*() const { + assert(ptr != 0); + return *ptr; + } + + T* operator->() const { + assert(ptr != 0); + return ptr; + } + + bool operator==(T* p) const { + return ptr == p; + } + + bool operator!=(T* p) const { + return ptr != p; + } + + T* get() const { + return ptr; + } + + void swap(scoped_ptr_malloc & b) { + T* tmp = b.ptr; + b.ptr = ptr; + ptr = tmp; + } + + T* release() { + T* tmp = ptr; + ptr = 0; + return tmp; + } + + private: + + // no reason to use these: each scoped_ptr_malloc should have its own object + template <typename U, typename GP> + bool operator==(scoped_ptr_malloc<U, GP> const& p) const; + template <typename U, typename GP> + bool operator!=(scoped_ptr_malloc<U, GP> const& p) const; + + static FreeProc const free_; +}; + +template<typename T, typename FP> +FP const scoped_ptr_malloc<T,FP>::free_ = FP(); + +template<typename T, typename FP> inline +void swap(scoped_ptr_malloc<T,FP>& a, scoped_ptr_malloc<T,FP>& b) { + a.swap(b); +} + +template<typename T, typename FP> inline +bool operator==(T* p, const 
scoped_ptr_malloc<T,FP>& b) { + return p == b.get(); +} + +template<typename T, typename FP> inline +bool operator!=(T* p, const scoped_ptr_malloc<T,FP>& b) { + return p != b.get(); +} + +#endif // #ifndef BASE_SCOPED_PTR_H diff --git a/googleurl/base/string16.cc b/googleurl/base/string16.cc new file mode 100644 index 0000000..fc25809 --- /dev/null +++ b/googleurl/base/string16.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "base/string16.h" + +#ifdef WIN32 + +#error This file should not be used on 2-byte wchar_t systems +// If this winds up being needed on 2-byte wchar_t systems, either the +// definitions below can be used, or the host system's wide character +// functions like wmemcmp can be wrapped. + +#else // !WIN32 + +namespace base { + +int c16memcmp(const char16* s1, const char16* s2, size_t n) { + // We cannot call memcmp because that changes the semantics. + while (n-- > 0) { + if (*s1 != *s2) { + // We cannot use (*s1 - *s2) because char16 is unsigned. + return ((*s1 < *s2) ? -1 : 1); + } + ++s1; + ++s2; + } + return 0; +} + +size_t c16len(const char16* s) { + const char16 *s_orig = s; + while (*s) { + ++s; + } + return s - s_orig; +} + +const char16* c16memchr(const char16* s, char16 c, size_t n) { + while (n-- > 0) { + if (*s == c) { + return s; + } + ++s; + } + return 0; +} + +char16* c16memmove(char16* s1, const char16* s2, size_t n) { + return reinterpret_cast<char16*>(memmove(s1, s2, n * sizeof(char16))); +} + +char16* c16memcpy(char16* s1, const char16* s2, size_t n) { + return reinterpret_cast<char16*>(memcpy(s1, s2, n * sizeof(char16))); +} + +char16* c16memset(char16* s, char16 c, size_t n) { + char16 *s_orig = s; + while (n-- > 0) { + *s = c; + ++s; + } + return s_orig; +} + +} // namespace base + +template class std::basic_string<char16, base::string16_char_traits>; + +#endif // WIN32 diff --git a/googleurl/base/string16.h b/googleurl/base/string16.h new file mode 100644 index 0000000..9e0fd1d --- /dev/null +++ b/googleurl/base/string16.h @@ -0,0 +1,192 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef BASE_STRING16_H_ +#define BASE_STRING16_H_ + +// WHAT: +// A version of std::basic_string that provides 2-byte characters even when +// wchar_t is not implemented as a 2-byte type. You can access this class as +// string16. We also define char16, which string16 is based upon. +// +// WHY: +// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2 +// data. Plenty of existing code operates on strings encoded as UTF-16. +// +// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make +// it 2 bytes by using the GCC flag -fshort-wchar. But then std::wstring fails +// at run time, because it calls some functions (like wcslen) that come from +// the system's native C library -- which was built with a 4-byte wchar_t! 
+// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's +// entirely improper on those systems where the encoding of wchar_t is defined +// as UTF-32. +// +// Here, we define string16, which is similar to std::wstring but replaces all +// libc functions with custom, 2-byte-char compatible routines. It is capable +// of carrying UTF-16-encoded data. + +#include <string> + +#include "base/basictypes.h" + +#ifdef WIN32 + +typedef wchar_t char16; +typedef std::wstring string16; + +#else // !WIN32 + +typedef uint16 char16; + +namespace base { + +// char16 versions of the functions required by string16_char_traits; these +// are based on the wide character functions of similar names ("w" or "wcs" +// instead of "c16"). +int c16memcmp(const char16* s1, const char16* s2, size_t n); +size_t c16len(const char16* s); +const char16* c16memchr(const char16* s, char16 c, size_t n); +char16* c16memmove(char16* s1, const char16* s2, size_t n); +char16* c16memcpy(char16* s1, const char16* s2, size_t n); +char16* c16memset(char16* s, char16 c, size_t n); + +struct string16_char_traits { + typedef char16 char_type; + typedef int int_type; + + typedef std::streamoff off_type; + typedef mbstate_t state_type; + typedef std::fpos<state_type> pos_type; + + static void assign(char_type& c1, const char_type& c2) { + c1 = c2; + } + + static bool eq(const char_type& c1, const char_type& c2) { + return c1 == c2; + } + static bool lt(const char_type& c1, const char_type& c2) { + return c1 < c2; + } + + static int compare(const char_type* s1, const char_type* s2, size_t n) { + return c16memcmp(s1, s2, n); + } + + static size_t length(const char_type* s) { + return c16len(s); + } + + static const char_type* find(const char_type* s, size_t n, + const char_type& a) { + return c16memchr(s, a, n); + } + + static char_type* move(char_type* s1, const char_type* s2, int_type n) { + return c16memmove(s1, s2, n); + } + + static char_type* copy(char_type* s1, const char_type* s2, 
size_t n) { + return c16memcpy(s1, s2, n); + } + + static char_type* assign(char_type* s, size_t n, char_type a) { + return c16memset(s, a, n); + } + + static int_type not_eof(const int_type& c) { + return eq_int_type(c, eof()) ? 0 : c; + } + + static char_type to_char_type(const int_type& c) { + return char_type(c); + } + + static int_type to_int_type(const char_type& c) { + return int_type(c); + } + + static bool eq_int_type(const int_type& c1, const int_type& c2) { + return c1 == c2; + } + + static int_type eof() { + return static_cast<int_type>(EOF); + } +}; + +} // namespace base + +// The string class will be explicitly instantiated only once, in string16.cc. +// +// std::basic_string<> in GNU libstdc++ contains a static data member, +// _S_empty_rep_storage, to represent empty strings. When an operation such +// as assignment or destruction is performed on a string, causing its existing +// data member to be invalidated, it must not be freed if this static data +// member is being used. Otherwise, it counts as an attempt to free static +// (and not allocated) data, which is a memory error. +// +// Generally, due to C++ template magic, _S_empty_rep_storage will be marked +// as a coalesced symbol, meaning that the linker will combine multiple +// instances into a single one when generating output. +// +// If a string class is used by multiple shared libraries, a problem occurs. +// Each library will get its own copy of _S_empty_rep_storage. When strings +// are passed across a library boundary for alteration or destruction, memory +// errors will result. GNU libstdc++ contains a configuration option, +// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which +// disables the static data member optimization, but it's a good optimization +// and non-STL code is generally at the mercy of the system's STL +// configuration. 
Fully-dynamic strings are not the default for GNU +// libstdc++ itself or for the libstdc++ installations on the systems we care +// about, such as Mac OS X and relevant flavors of Linux. +// +// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 . +// +// To avoid problems, string classes need to be explicitly instantiated only +// once, in exactly one library. All other string users see it via an "extern" +// declaration. This is precisely how GNU libstdc++ handles +// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring). +// +// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2), +// in which the linker does not fully coalesce symbols when dead code +// stripping is enabled. This bug causes the memory errors described above +// to occur even when a std::basic_string<> does not cross shared library +// boundaries, such as in statically-linked executables. +// +// TODO(mark): File this bug with Apple and update this note with a bug number. + +extern template class std::basic_string<char16, base::string16_char_traits>; + +typedef std::basic_string<char16, base::string16_char_traits> string16; + +extern std::ostream& operator<<(std::ostream& out, const string16& str); + +#endif // !WIN32 + +#endif // BASE_STRING16_H_ diff --git a/googleurl/build/README.txt b/googleurl/build/README.txt new file mode 100644 index 0000000..eab011a --- /dev/null +++ b/googleurl/build/README.txt @@ -0,0 +1,4 @@ +This directory includes solution and project files for compiling with +Visual Studio 2005 on Windows. + +The base checkout directory must be named 'googleurl'. 
diff --git a/googleurl/build/base.vcproj b/googleurl/build/base.vcproj new file mode 100644 index 0000000..0e923cf --- /dev/null +++ b/googleurl/build/base.vcproj @@ -0,0 +1,151 @@ +<?xml version="1.0" encoding="Windows-1252"?> +<VisualStudioProject + ProjectType="Visual C++" + Version="8.00" + Name="base" + ProjectGUID="{ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}" + RootNamespace="base" + Keyword="Win32Proj" + > + <Platforms> + <Platform + Name="Win32" + /> + </Platforms> + <ToolFiles> + </ToolFiles> + <Configurations> + <Configuration + Name="Debug|Win32" + ConfigurationType="4" + InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops" + > + <Tool + Name="VCPreBuildEventTool" + /> + <Tool + Name="VCCustomBuildTool" + /> + <Tool + Name="VCXMLDataGeneratorTool" + /> + <Tool + Name="VCWebServiceProxyGeneratorTool" + /> + <Tool + Name="VCMIDLTool" + /> + <Tool + Name="VCCLCompilerTool" + /> + <Tool + Name="VCManagedResourceCompilerTool" + /> + <Tool + Name="VCResourceCompilerTool" + /> + <Tool + Name="VCPreLinkEventTool" + /> + <Tool + Name="VCLibrarianTool" + /> + <Tool + Name="VCALinkTool" + /> + <Tool + Name="VCXDCMakeTool" + /> + <Tool + Name="VCBscMakeTool" + /> + <Tool + Name="VCFxCopTool" + /> + <Tool + Name="VCPostBuildEventTool" + /> + </Configuration> + <Configuration + Name="Release|Win32" + ConfigurationType="4" + InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops" + > + <Tool + Name="VCPreBuildEventTool" + /> + <Tool + Name="VCCustomBuildTool" + /> + <Tool + Name="VCXMLDataGeneratorTool" + /> + <Tool + Name="VCWebServiceProxyGeneratorTool" + /> + <Tool + Name="VCMIDLTool" + /> + <Tool + Name="VCCLCompilerTool" + /> + <Tool + Name="VCManagedResourceCompilerTool" + /> + <Tool + Name="VCResourceCompilerTool" + /> + <Tool + Name="VCPreLinkEventTool" + /> + 
<Tool + Name="VCLibrarianTool" + /> + <Tool + Name="VCALinkTool" + /> + <Tool + Name="VCXDCMakeTool" + /> + <Tool + Name="VCBscMakeTool" + /> + <Tool + Name="VCFxCopTool" + /> + <Tool + Name="VCPostBuildEventTool" + /> + </Configuration> + </Configurations> + <References> + </References> + <Files> + <File + RelativePath="..\base\basictypes.h" + > + </File> + <File + RelativePath="..\base\logging.cc" + > + </File> + <File + RelativePath="..\base\logging.h" + > + </File> + <File + RelativePath="..\base\README.txt" + > + </File> + <File + RelativePath="..\base\scoped_ptr.h" + > + </File> + <File + RelativePath="..\base\string16.h" + > + </File> + </Files> + <Globals> + </Globals> +</VisualStudioProject> diff --git a/googleurl/build/common.vsprops b/googleurl/build/common.vsprops new file mode 100644 index 0000000..ede28e9 --- /dev/null +++ b/googleurl/build/common.vsprops @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="Windows-1252"?> +<VisualStudioPropertySheet + ProjectType="Visual C++" + Version="8.00" + Name="common" + OutputDirectory="$(SolutionDir)$(ConfigurationName)" + IntermediateDirectory="$(SolutionDir)$(ConfigurationName)\obj\$(ProjectName)" + CharacterSet="1" + > + <Tool + Name="VCCLCompilerTool" + AdditionalIncludeDirectories="$(SolutionDir)..\..;$(SolutionDir).." 
+ PreprocessorDefinitions="_WIN32_WINNT=0x0501;WINVER=0x0501;WIN32;_WINDOWS" + MinimalRebuild="false" + BufferSecurityCheck="true" + EnableFunctionLevelLinking="true" + WarningLevel="3" + WarnAsError="true" + Detect64BitPortabilityProblems="true" + DebugInformationFormat="3" + /> +</VisualStudioPropertySheet> diff --git a/googleurl/build/debug.vsprops b/googleurl/build/debug.vsprops new file mode 100644 index 0000000..d2aa43f --- /dev/null +++ b/googleurl/build/debug.vsprops @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="Windows-1252"?> +<VisualStudioPropertySheet + ProjectType="Visual C++" + Version="8.00" + Name="debug" + > + <Tool + Name="VCCLCompilerTool" + Optimization="0" + PreprocessorDefinitions="_DEBUG" + BasicRuntimeChecks="3" + RuntimeLibrary="1" + /> + <Tool + Name="VCLinkerTool" + LinkIncremental="2" + /> +</VisualStudioPropertySheet> diff --git a/googleurl/build/googleurl.sln b/googleurl/build/googleurl.sln new file mode 100644 index 0000000..347810d --- /dev/null +++ b/googleurl/build/googleurl.sln @@ -0,0 +1,32 @@ + +Microsoft Visual Studio Solution File, Format Version 9.00 +# Visual Studio 2005 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "googleurl", "googleurl.vcproj", "{EF5E94AB-B646-4E5B-A058-52EF07B8351C}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "base", "base.vcproj", "{ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{D8E84C85-89D3-4B8D-9A3A-C44B63C3383A}" + ProjectSection(SolutionItems) = preProject + ..\LICENSE.txt = ..\LICENSE.txt + ..\README.txt = ..\README.txt + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Debug|Win32.ActiveCfg = Debug|Win32 + 
{EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Debug|Win32.Build.0 = Debug|Win32 + {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Release|Win32.ActiveCfg = Release|Win32 + {EF5E94AB-B646-4E5B-A058-52EF07B8351C}.Release|Win32.Build.0 = Release|Win32 + {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Debug|Win32.ActiveCfg = Debug|Win32 + {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Debug|Win32.Build.0 = Debug|Win32 + {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Release|Win32.ActiveCfg = Release|Win32 + {ACAC8E18-F003-4881-9BA0-C7718AC5FFD5}.Release|Win32.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/googleurl/build/googleurl.vcproj b/googleurl/build/googleurl.vcproj new file mode 100644 index 0000000..71b3123 --- /dev/null +++ b/googleurl/build/googleurl.vcproj @@ -0,0 +1,239 @@ +<?xml version="1.0" encoding="Windows-1252"?> +<VisualStudioProject + ProjectType="Visual C++" + Version="8.00" + Name="googleurl" + ProjectGUID="{EF5E94AB-B646-4E5B-A058-52EF07B8351C}" + RootNamespace="googleurl" + Keyword="Win32Proj" + > + <Platforms> + <Platform + Name="Win32" + /> + </Platforms> + <ToolFiles> + </ToolFiles> + <Configurations> + <Configuration + Name="Debug|Win32" + ConfigurationType="4" + InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\debug.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops" + > + <Tool + Name="VCPreBuildEventTool" + /> + <Tool + Name="VCCustomBuildTool" + /> + <Tool + Name="VCXMLDataGeneratorTool" + /> + <Tool + Name="VCWebServiceProxyGeneratorTool" + /> + <Tool + Name="VCMIDLTool" + /> + <Tool + Name="VCCLCompilerTool" + /> + <Tool + Name="VCManagedResourceCompilerTool" + /> + <Tool + Name="VCResourceCompilerTool" + /> + <Tool + Name="VCPreLinkEventTool" + /> + <Tool + Name="VCLibrarianTool" + /> + <Tool + Name="VCALinkTool" + /> + <Tool + Name="VCXDCMakeTool" + /> + <Tool + Name="VCBscMakeTool" + /> + <Tool + 
Name="VCFxCopTool" + /> + <Tool + Name="VCPostBuildEventTool" + /> + </Configuration> + <Configuration + Name="Release|Win32" + ConfigurationType="4" + InheritedPropertySheets="$(SolutionDir)..\build\common.vsprops;$(SolutionDir)..\build\release.vsprops;$(SolutionDir)../third_party/icu/build/using_icu.vsprops" + > + <Tool + Name="VCPreBuildEventTool" + /> + <Tool + Name="VCCustomBuildTool" + /> + <Tool + Name="VCXMLDataGeneratorTool" + /> + <Tool + Name="VCWebServiceProxyGeneratorTool" + /> + <Tool + Name="VCMIDLTool" + /> + <Tool + Name="VCCLCompilerTool" + /> + <Tool + Name="VCManagedResourceCompilerTool" + /> + <Tool + Name="VCResourceCompilerTool" + /> + <Tool + Name="VCPreLinkEventTool" + /> + <Tool + Name="VCLibrarianTool" + /> + <Tool + Name="VCALinkTool" + /> + <Tool + Name="VCXDCMakeTool" + /> + <Tool + Name="VCBscMakeTool" + /> + <Tool + Name="VCFxCopTool" + /> + <Tool + Name="VCPostBuildEventTool" + /> + </Configuration> + </Configurations> + <References> + </References> + <Files> + <File + RelativePath="..\src\gurl.cc" + > + </File> + <File + RelativePath="..\src\gurl.h" + > + </File> + <File + RelativePath=".\README.txt" + > + </File> + <File + RelativePath="..\src\url_canon.h" + > + </File> + <File + RelativePath="..\src\url_canon_etc.cc" + > + </File> + <File + RelativePath="..\src\url_canon_fileurl.cc" + > + </File> + <File + RelativePath="..\src\url_canon_host.cc" + > + </File> + <File + RelativePath="..\src\url_canon_icu.cc" + > + </File> + <File + RelativePath="..\src\url_canon_icu.h" + > + </File> + <File + RelativePath="..\src\url_canon_internal.cc" + > + </File> + <File + RelativePath="..\src\url_canon_internal.h" + > + </File> + <File + RelativePath="..\src\url_canon_internal_file.h" + > + </File> + <File + RelativePath="..\src\url_canon_ip.cc" + > + </File> + <File + RelativePath="..\src\url_canon_ip.h" + > + </File> + <File + RelativePath="..\src\url_canon_mailtourl.cc" + > + </File> + <File + RelativePath="..\src\url_canon_path.cc" + > + 
</File> + <File + RelativePath="..\src\url_canon_pathurl.cc" + > + </File> + <File + RelativePath="..\src\url_canon_query.cc" + > + </File> + <File + RelativePath="..\src\url_canon_relative.cc" + > + </File> + <File + RelativePath="..\src\url_canon_stdstring.h" + > + </File> + <File + RelativePath="..\src\url_canon_stdurl.cc" + > + </File> + <File + RelativePath="..\src\url_file.h" + > + </File> + <File + RelativePath="..\src\url_parse.cc" + > + </File> + <File + RelativePath="..\src\url_parse.h" + > + </File> + <File + RelativePath="..\src\url_parse_file.cc" + > + </File> + <File + RelativePath="..\src\url_parse_internal.h" + > + </File> + <File + RelativePath="..\src\url_util.cc" + > + </File> + <File + RelativePath="..\src\url_util.h" + > + </File> + </Files> + <Globals> + </Globals> +</VisualStudioProject> diff --git a/googleurl/build/release.vsprops b/googleurl/build/release.vsprops new file mode 100644 index 0000000..2e59356 --- /dev/null +++ b/googleurl/build/release.vsprops @@ -0,0 +1,23 @@ +<?xml version="1.0" encoding="Windows-1252"?> +<VisualStudioPropertySheet + ProjectType="Visual C++" + Version="8.00" + Name="release" + > + <Tool + Name="VCCLCompilerTool" + WholeProgramOptimization="true" + PreprocessorDefinitions="NDEBUG" + /> + <Tool + Name="VCLibrarianTool" + AdditionalOptions="/ltcg" + /> + <Tool + Name="VCLinkerTool" + LinkIncremental="1" + OptimizeReferences="2" + EnableCOMDATFolding="2" + LinkTimeCodeGeneration="1" + /> +</VisualStudioPropertySheet> diff --git a/googleurl/src/gurl.cc b/googleurl/src/gurl.cc new file mode 100644 index 0000000..2dab0b2 --- /dev/null +++ b/googleurl/src/gurl.cc @@ -0,0 +1,450 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifdef WIN32 +#include <windows.h> +#else +#include <pthread.h> +#endif + +#include <algorithm> + +#include "googleurl/src/gurl.h" + +#include "base/logging.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_util.h" + +namespace { + +// External template that can handle initialization of either character type. 
+// The input spec is given, and the canonical version will be placed in +// |*canonical|, along with the parsing of the canonical spec in |*parsed|. +template<typename STR> +bool InitCanonical(const STR& input_spec, + std::string* canonical, + url_parse::Parsed* parsed) { + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + canonical->reserve(input_spec.size() + 32); + url_canon::StdStringCanonOutput output(canonical); + bool success = url_util::Canonicalize( + input_spec.data(), static_cast<int>(input_spec.length()), + NULL, &output, parsed); + + output.Complete(); // Must be done before using string. + return success; +} + +// Lazily created singletons backing EmptyStringForGURL() and +// GURL::EmptyGURL(); both start out NULL. +static std::string* empty_string = NULL; +static GURL* empty_gurl = NULL; + +#ifdef WIN32 + +// Returns a static reference to an empty string for returning a reference +// when there is no underlying string. +const std::string& EmptyStringForGURL() { + // Avoid static object construction/destruction on startup/shutdown. + if (!empty_string) { + // Create the string. Be careful that we don't break in the case that this + // is being called from multiple threads. Statics are not threadsafe. + std::string* new_empty_string = new std::string; + if (InterlockedCompareExchangePointer( + reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) { + // The old value was non-NULL, so no replacement was done. Another + // thread did the initialization out from under us. + delete new_empty_string; + } + } + return *empty_string; +} + +#else + +static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT; +static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT; + +// pthread_once callback that allocates the shared empty string. +void EmptyStringForGURLOnce(void) { + empty_string = new std::string; +} + +// Returns a reference to the shared empty string, creating it on first use. +const std::string& EmptyStringForGURL() { + // Avoid static object construction/destruction on startup/shutdown. 
+ pthread_once(&empty_string_once, EmptyStringForGURLOnce); + return *empty_string; +} + +#endif // WIN32 + +} // namespace + +GURL::GURL() : is_valid_(false) { +} + +GURL::GURL(const GURL& other) + : spec_(other.spec_), + is_valid_(other.is_valid_), + parsed_(other.parsed_) { +} + +GURL::GURL(const std::string& url_string) { + is_valid_ = InitCanonical(url_string, &spec_, &parsed_); +} + +GURL::GURL(const string16& url_string) { + is_valid_ = InitCanonical(url_string, &spec_, &parsed_); +} + +GURL::GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid) + : spec_(canonical_spec, canonical_spec_len), + is_valid_(is_valid), + parsed_(parsed) { +#ifndef NDEBUG + // For testing purposes, check that the parsed canonical URL is identical to + // what we would have produced. Skip the check for invalid URLs: they have no + // meaning and we can't always canonicalize them reproducibly. + if (is_valid_) { + GURL test_url(spec_); + + DCHECK(test_url.is_valid_ == is_valid_); + DCHECK(test_url.spec_ == spec_); + + DCHECK(test_url.parsed_.scheme == parsed_.scheme); + DCHECK(test_url.parsed_.username == parsed_.username); + DCHECK(test_url.parsed_.password == parsed_.password); + DCHECK(test_url.parsed_.host == parsed_.host); + DCHECK(test_url.parsed_.port == parsed_.port); + DCHECK(test_url.parsed_.path == parsed_.path); + DCHECK(test_url.parsed_.query == parsed_.query); + DCHECK(test_url.parsed_.ref == parsed_.ref); + } +#endif +} + +// Returns the canonical spec. For a non-empty invalid URL this asserts and +// returns the shared empty string instead. +const std::string& GURL::spec() const { + if (is_valid_ || spec_.empty()) + return spec_; + + DCHECK(false) << "Trying to get the spec of an invalid URL!"; + return EmptyStringForGURL(); +} + +GURL GURL::Resolve(const std::string& relative) const { + return ResolveWithCharsetConverter(relative, NULL); +} +GURL GURL::Resolve(const string16& relative) const { + return ResolveWithCharsetConverter(relative, NULL); +} + +// Note: code duplicated below (it's inconvenient to use a template here). 
+GURL GURL::ResolveWithCharsetConverter( + const std::string& relative, + url_canon::CharsetConverter* charset_converter) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + if (!url_util::ResolveRelative( + spec_.data(), static_cast<int>(spec_.length()), parsed_, + relative.data(), static_cast<int>(relative.length()), + charset_converter, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::ResolveWithCharsetConverter( + const string16& relative, + url_canon::CharsetConverter* charset_converter) const { + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + GURL result; + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + if (!url_util::ResolveRelative( + spec_.data(), static_cast<int>(spec_.length()), parsed_, + relative.data(), static_cast<int>(relative.length()), + charset_converter, &output, &result.parsed_)) { + // Error resolving, return an empty URL. + return GURL(); + } + + output.Complete(); + result.is_valid_ = true; + return result; +} + +// Note: code duplicated below (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents( + const url_canon::Replacements<char>& replacements) const { + GURL result; + + // Not allowed for invalid URLs. 
+ if (!is_valid_) + return GURL(); + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + result.is_valid_ = url_util::ReplaceComponents( + spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + return result; +} + +// Note: code duplicated above (it's inconvenient to use a template here). +GURL GURL::ReplaceComponents( + const url_canon::Replacements<char16>& replacements) const { + GURL result; + + // Not allowed for invalid URLs. + if (!is_valid_) + return GURL(); + + // Reserve enough room in the output for the input, plus some extra so that + // we have room if we have to escape a few things without reallocating. + result.spec_.reserve(spec_.size() + 32); + url_canon::StdStringCanonOutput output(&result.spec_); + + result.is_valid_ = url_util::ReplaceComponents( + spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, + NULL, &output, &result.parsed_); + + output.Complete(); + return result; +} + +GURL GURL::GetOrigin() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. + if (!is_valid_ || !IsStandard()) + return GURL(); + + url_canon::Replacements<char> replacements; + replacements.ClearUsername(); + replacements.ClearPassword(); + replacements.ClearPath(); + replacements.ClearQuery(); + replacements.ClearRef(); + + return ReplaceComponents(replacements); +} + +GURL GURL::GetWithEmptyPath() const { + // This doesn't make sense for invalid or nonstandard URLs, so return + // the empty URL. + if (!is_valid_ || !IsStandard()) + return GURL(); + + // We could optimize this since we know that the URL is canonical, and we are + // appending a canonical path, so avoiding re-parsing. 
+ GURL other(*this); + if (parsed_.path.len == 0) + return other; + + // Clear everything after the path. + other.parsed_.query.reset(); + other.parsed_.ref.reset(); + + // Set the path. The zero-length case returned above, so we can just + // overwrite the first character and resize. + other.spec_[other.parsed_.path.begin] = '/'; + other.parsed_.path.len = 1; + other.spec_.resize(other.parsed_.path.begin + 1); + return other; +} + +bool GURL::IsStandard() const { + return url_util::IsStandard(spec_.data(), static_cast<int>(spec_.length()), + parsed_.scheme); +} + +bool GURL::SchemeIs(const char* lower_ascii_scheme) const { + // With no scheme present, only a NULL |lower_ascii_scheme| matches. + if (parsed_.scheme.len <= 0) + return lower_ascii_scheme == NULL; + return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin, + spec_.data() + parsed_.scheme.end(), + lower_ascii_scheme); +} + +int GURL::IntPort() const { + if (parsed_.port.is_nonempty()) + return url_parse::ParsePort(spec_.data(), parsed_.port); + return url_parse::PORT_UNSPECIFIED; +} + +int GURL::EffectiveIntPort() const { + int int_port = IntPort(); + // Fall back to the scheme's default port when none is given explicitly. + if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard()) + return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin, + parsed_.scheme.len); + return int_port; +} + +std::string GURL::ExtractFileName() const { + url_parse::Component file_component; + url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component); + return ComponentString(file_component); +} + +std::string GURL::PathForRequest() const { + DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty"; + if (parsed_.ref.len >= 0) { + // Clip off the reference when it exists. The reference starts after the # + // sign, so we have to subtract one to also remove it. + return std::string(spec_, parsed_.path.begin, + parsed_.ref.begin - parsed_.path.begin - 1); + } + + // Use everything from the path to the end. 
+ return std::string(spec_, parsed_.path.begin); +} + +std::string GURL::HostNoBrackets() const { + // If host looks like an IPv6 literal, strip the square brackets. + url_parse::Component h(parsed_.host); + if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') { + h.begin++; + h.len -= 2; + } + return ComponentString(h); +} + +bool GURL::HostIsIPAddress() const { + if (!is_valid_ || spec_.empty()) + return false; + + // The canonicalized text is discarded; only |host_info| is consulted. + url_canon::RawCanonOutputT<char, 128> ignored_output; + url_canon::CanonHostInfo host_info; + url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, + &ignored_output, &host_info); + return host_info.IsIPAddress(); +} + +#ifdef WIN32 + +const GURL& GURL::EmptyGURL() { + // Avoid static object construction/destruction on startup/shutdown. + if (!empty_gurl) { + // Create the GURL. Be careful that we don't break in the case that this + // is being called from multiple threads. + GURL* new_empty_gurl = new GURL; + if (InterlockedCompareExchangePointer( + reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) { + // The old value was non-NULL, so no replacement was done. Another + // thread did the initialization out from under us. + delete new_empty_gurl; + } + } + return *empty_gurl; +} + +#else + +// pthread_once callback that allocates the shared empty GURL. +void EmptyGURLOnce(void) { + empty_gurl = new GURL; +} + +const GURL& GURL::EmptyGURL() { + // Avoid static object construction/destruction on startup/shutdown. + pthread_once(&empty_gurl_once, EmptyGURLOnce); + return *empty_gurl; +} + +#endif // WIN32 + +bool GURL::DomainIs(const char* lower_ascii_domain, + int domain_len) const { + // Return false if this URL is not valid, has no host, or domain is empty. + if (!is_valid_ || !parsed_.host.is_nonempty() || !domain_len) + return false; + + // Check whether the host name ends with a dot. If so, treat it + // the same as no-dot unless the input comparison domain ends + // with a dot. 
+ const char* last_pos = spec_.data() + parsed_.host.end() - 1; + int host_len = parsed_.host.len; + if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) { + last_pos--; + host_len--; + } + + // Return false if host's length is less than domain's length. + if (host_len < domain_len) + return false; + + // Check whether the tail of the host matches the given domain. + const char* start_pos = spec_.data() + parsed_.host.begin + + host_len - domain_len; + + if (!url_util::LowerCaseEqualsASCII(start_pos, + last_pos + 1, + lower_ascii_domain, + lower_ascii_domain + domain_len)) + return false; + + // Make sure the match begins at a dot boundary so that we got the + // right domain range. For example www.google.com has domain + // "google.com" but www.iamnotgoogle.com does not. + if ('.' != lower_ascii_domain[0] && host_len > domain_len && + '.' != *(start_pos - 1)) + return false; + + return true; +} + +// Exchanges the spec, validity flag and parsed components with |other|. +void GURL::Swap(GURL* other) { + spec_.swap(other->spec_); + std::swap(is_valid_, other->is_valid_); + std::swap(parsed_, other->parsed_); +} + diff --git a/googleurl/src/gurl.h b/googleurl/src/gurl.h new file mode 100644 index 0000000..36cd14c --- /dev/null +++ b/googleurl/src/gurl.h @@ -0,0 +1,372 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_GURL_H__ +#define GOOGLEURL_SRC_GURL_H__ + +#include <iostream> +#include <string> + +#include "base/string16.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_parse.h" + +class GURL { + public: + typedef url_canon::StdStringReplacements<std::string> Replacements; + typedef url_canon::StdStringReplacements<string16> ReplacementsW; + + // Creates an empty, invalid URL. + GURL(); + + // Copy construction is relatively inexpensive, with most of the time going + // to reallocating the string. It does not re-parse. + GURL(const GURL& other); + + // The narrow version requires the input be UTF-8. Invalid UTF-8 input will + // result in an invalid URL. + // + // The wide version should also take an encoding parameter so we know how to + // encode the query parameters. It is probably sufficient for the narrow + // version to assume the query parameter encoding should be the same as the + // input encoding. 
+ explicit GURL(const std::string& url_string /*, output_param_encoding*/); + explicit GURL(const string16& url_string /*, output_param_encoding*/); + + // Constructor for URLs that have already been parsed and canonicalized. This + // is used for conversions from KURL, for example. The caller must supply all + // information associated with the URL, which must be correct and consistent. + GURL(const char* canonical_spec, size_t canonical_spec_len, + const url_parse::Parsed& parsed, bool is_valid); + + // Returns true when this object represents a valid parsed URL. When not + // valid, other functions will still succeed, but you will not get canonical + // data out in the format you may be expecting. Instead, we keep something + // "reasonable looking" so that the user can see how it's busted if + // displayed to them. + bool is_valid() const { + return is_valid_; + } + + // Returns true if the URL is zero-length. Note that empty URLs are also + // invalid, and is_valid() will return false for them. This is provided + // because some users may want to treat the empty case differently. + bool is_empty() const { + return spec_.empty(); + } + + // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8, + // if the URL is valid. If the URL is not valid, this will assert and return + // the empty string (for safety in release builds, to keep them from being + // misused which might be a security problem). + // + // The URL will be ASCII except the reference fragment, which may be UTF-8. + // It is guaranteed to be valid UTF-8. + // + // The exception is for is_empty() URLs (which are !is_valid()); for those + // this will return the empty string without asserting. + // + // Use possibly_invalid_spec() below to get the unusable spec of an invalid + // URL. This separation is designed to prevent errors that may cause security + // problems that could result from the mistaken use of an invalid URL.
+ const std::string& spec() const; + + // Returns the potentially invalid spec for the URL. This spec MUST NOT be + // modified or sent over the network. It is designed to be displayed in error + // messages to the user, as the appearance of the spec may explain the error. + // If the spec is valid, the valid spec will be returned. + // + // The returned string is guaranteed to be valid UTF-8. + const std::string& possibly_invalid_spec() const { + return spec_; + } + + // Getter for the raw parsed structure. This allows callers to locate parts + // of the URL within the spec themselves. Most callers should consider using + // the individual component getters below. + // + // The returned parsed structure will reference into the raw spec, which may + // or may not be valid. If you are using this to index into the spec, BE + // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you + // don't do anything "important" with invalid specs. + const url_parse::Parsed& parsed_for_possibly_invalid_spec() const { + return parsed_; + } + + // Equality operators. Two GURLs are equal when their specs are identical. + bool operator==(const GURL& other) const { + return spec_ == other.spec_; + } + bool operator!=(const GURL& other) const { + return spec_ != other.spec_; + } + + // Allows GURL to be used as a key in STL (for example, a std::set or std::map). + bool operator<(const GURL& other) const { + return spec_ < other.spec_; + } + + // Resolves a URL that's possibly relative to this object's URL, and returns + // it. Absolute URLs are also handled according to the rules of URLs on web + // pages. + // + // It may be impossible to resolve the URLs properly. If the input is not + // "standard" (IsStandard() == false) and the input looks relative, we + // can't resolve it. In these cases, the result will be an empty, invalid + // GURL. + // + // The result may also be a nonempty, invalid URL if the input has some kind + // of encoding error.
In these cases, we will try to construct a "good" URL + // that may have meaning to the user, but it will be marked invalid. + // + // It is an error to resolve a URL relative to an invalid URL. The result + // will be the empty URL. + GURL Resolve(const std::string& relative) const; + GURL Resolve(const string16& relative) const; + + // Like Resolve() above but takes a character set encoder which will be used + // for any query text specified in the input. The charset converter parameter + // may be NULL, in which case it will be treated as UTF-8. + // + // TODO(brettw): These should be replaced with versions that take something + // more friendly than a raw CharsetConverter (maybe like an ICU character set + // name). + GURL ResolveWithCharsetConverter( + const std::string& relative, + url_canon::CharsetConverter* charset_converter) const; + GURL ResolveWithCharsetConverter( + const string16& relative, + url_canon::CharsetConverter* charset_converter) const; + + // Creates a new GURL by replacing the current URL's components with the + // supplied versions. See the Replacements class in url_canon.h for more. + // + // These are not particularly quick, so avoid doing mutations when possible. + // Prefer the 8-bit version when possible. + // + // It is an error to replace components of an invalid URL. The result will + // be the empty URL. + // + // Note that we use the more general url_canon::Replacements type to give + // callers extra flexibility rather than our override. + GURL ReplaceComponents( + const url_canon::Replacements<char>& replacements) const; + GURL ReplaceComponents( + const url_canon::Replacements<char16>& replacements) const; + + // A helper function that is equivalent to replacing the path with a slash + // and clearing out everything after that. We sometimes need to know just the + // scheme and the authority. 
If this URL is not a standard URL (it doesn't + // have the regular authority and path sections), then the result will be + // an empty, invalid GURL. Note that this *does* work for file: URLs, which + // some callers may want to filter out before calling this. + // + // It is an error to get an empty path on an invalid URL. The result + // will be the empty URL. + GURL GetWithEmptyPath() const; + + // A helper function to return a GURL containing just the scheme, host, + // and port from a URL. Equivalent to clearing any username and password, + // replacing the path with a slash, and clearing everything after that. If + // this URL is not a standard URL, then the result will be an empty, + // invalid GURL. If the URL has neither username nor password, this + // degenerates to GetWithEmptyPath(). + // + // It is an error to get the origin of an invalid URL. The result + // will be the empty URL. + GURL GetOrigin() const; + + // Returns true if the scheme for the current URL is a known "standard" + // scheme or there is a "://" after it. Standard schemes have an authority + // and a path section. This includes file:, which some callers may want to + // filter out explicitly by calling SchemeIsFile. + bool IsStandard() const; + + // Returns true if the given parameter (should be lower-case ASCII to match + // the canonicalized scheme) is the scheme for this URL. This call is more + // efficient than getting the scheme and comparing it because no copies or + // object constructions are done. + bool SchemeIs(const char* lower_ascii_scheme) const; + + // We often need to know if this is a file URL. File URLs are "standard", but + // are often treated separately by some programs. + bool SchemeIsFile() const { + return SchemeIs("file"); + } + + // If the scheme indicates a secure connection + bool SchemeIsSecure() const { + return SchemeIs("https"); + } + + // Returns true if the hostname is an IP address. 
Note: this function isn't + // as cheap as a simple getter because it re-parses the hostname to verify. + // This currently identifies only IPv4 addresses (bug 822685). + // NOTE(review): the IPAddress unit test also expects true for a bracketed + // IPv6 literal such as "[2001:db8::1]", so the IPv4-only remark above may + // be stale -- confirm. + bool HostIsIPAddress() const; + + // Getters for various components of the URL. The returned string will be + // empty if the component is empty or is not present. + std::string scheme() const { // Not including the colon. See also SchemeIs. + return ComponentString(parsed_.scheme); + } + std::string username() const { + return ComponentString(parsed_.username); + } + std::string password() const { + return ComponentString(parsed_.password); + } + // Note that this may be a hostname, an IPv4 address, or an IPv6 literal + // surrounded by square brackets, like "[2001:db8::1]". To exclude these + // brackets, use HostNoBrackets() below. + std::string host() const { + return ComponentString(parsed_.host); + } + std::string port() const { // Empty if no port is given. See also IntPort. + return ComponentString(parsed_.port); + } + std::string path() const { // Including first slash following host + return ComponentString(parsed_.path); + } + std::string query() const { // Stuff following '?' + return ComponentString(parsed_.query); + } + std::string ref() const { // Stuff following '#' + return ComponentString(parsed_.ref); + } + + // Existence querying. These functions will return true if the corresponding + // URL component exists in this URL. Note that existence is different than + // being nonempty. http://www.google.com/? has a query that just happens to + // be empty, and has_query() will return true. + bool has_scheme() const { + return parsed_.scheme.len >= 0; + } + bool has_username() const { + return parsed_.username.len >= 0; + } + bool has_password() const { + return parsed_.password.len >= 0; + } + bool has_host() const { + // Note that hosts are special, absence of host means length 0.
+ return parsed_.host.len > 0; + } + bool has_port() const { + return parsed_.port.len >= 0; + } + bool has_path() const { + // Note that "http://www.google.com/" has a path, the path is "/". This can + // return false only for invalid or nonstandard URLs. + return parsed_.path.len >= 0; + } + bool has_query() const { + return parsed_.query.len >= 0; + } + bool has_ref() const { + return parsed_.ref.len >= 0; + } + + // Returns a parsed version of the port. Can also be any of the special + // values defined in Parsed for ExtractPort. + int IntPort() const; + + // Returns the port number of the url, or the default port number. + // If the scheme has no concept of port (or unknown default) returns + // PORT_UNSPECIFIED. + int EffectiveIntPort() const; + + // Extracts the filename portion of the path and returns it. The filename + // is everything after the last slash in the path. This may be empty. + std::string ExtractFileName() const; + + // Returns the path that should be sent to the server. This is the path, + // parameter, and query portions of the URL. It is guaranteed to be ASCII. + std::string PathForRequest() const; + + // Returns the host, excluding the square brackets surrounding IPv6 address + // literals. This can be useful for passing to getaddrinfo(). + std::string HostNoBrackets() const; + + // Returns true if this URL's host matches or is in the same domain as + // the given input string. For example if this URL was "www.google.com", + // this would match "com", "google.com", and "www.google.com" + // (input domain should be lower-case ASCII to match the canonicalized + // host). This call is more efficient than getting the host and checking + // whether host has the specific domain or not because no copies or + // object constructions are done. + // + // This overload takes an explicit domain_len, so lower_ascii_domain is + // not required to be terminated with a NULL character.
+ bool DomainIs(const char* lower_ascii_domain, int domain_len) const; + + // If function DomainIs only has parameter lower_ascii_domain, which means + // domain string should be terminate with NULL character. + bool DomainIs(const char* lower_ascii_domain) const { + return DomainIs(lower_ascii_domain, + static_cast<int>(strlen(lower_ascii_domain))); + } + + // Swaps the contents of this GURL object with the argument without doing + // any memory allocations. + void Swap(GURL* other); + + // Returns a reference to a singleton empty GURL. This object is for callers + // who return references but don't have anything to return in some cases. + // This function may be called from any thread. + static const GURL& EmptyGURL(); + + private: + // Returns the substring of the input identified by the given component. + std::string ComponentString(const url_parse::Component& comp) const { + if (comp.len <= 0) + return std::string(); + return std::string(spec_, comp.begin, comp.len); + } + + // The actual text of the URL, in canonical ASCII form. + std::string spec_; + + // Set when the given URL is valid. Otherwise, we may still have a spec and + // components, but they may not identify valid resources (for example, an + // invalid port number, invalid characters in the scheme, etc.). + bool is_valid_; + + // Identified components of the canonical spec. + url_parse::Parsed parsed_; + + // TODO bug 684583: Add encoding for query params. +}; + +// Stream operator so GURL can be used in assertion statements. +inline std::ostream& operator<<(std::ostream& out, const GURL& url) { + return out << url.possibly_invalid_spec(); +} + +#endif // GOOGLEURL_SRC_GURL_H__ diff --git a/googleurl/src/gurl_test_main.cc b/googleurl/src/gurl_test_main.cc new file mode 100644 index 0000000..9a7c9f4 --- /dev/null +++ b/googleurl/src/gurl_test_main.cc @@ -0,0 +1,97 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "build/build_config.h" + +#if defined(OS_WIN) +#include <windows.h> +#endif + +#include <string> + +#include "testing/gtest/include/gtest/gtest.h" +#include "unicode/putil.h" +#include "unicode/udata.h" + +// Possible values of ICU_UTIL_DATA_IMPL. FILE is 0 on purpose: an undefined +// ICU_UTIL_DATA_IMPL evaluates to 0 in #if, so unknown platforms keep using +// the file-based path exactly as before. +#define ICU_UTIL_DATA_FILE 0 +#define ICU_UTIL_DATA_SHARED 1 +#define ICU_UTIL_DATA_STATIC 2 + +#ifndef ICU_UTIL_DATA_IMPL + +#if defined(OS_WIN) +#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_SHARED +#elif defined(OS_MACOSX) +#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_STATIC +#elif defined(OS_LINUX) +#define ICU_UTIL_DATA_IMPL ICU_UTIL_DATA_FILE +#endif + +#endif // ICU_UTIL_DATA_IMPL + +#if defined(OS_WIN) +#define ICU_UTIL_DATA_SYMBOL "icudt" U_ICU_VERSION_SHORT "_dat" +#define ICU_UTIL_DATA_SHARED_MODULE_NAME "icudt" U_ICU_VERSION_SHORT ".dll" +#endif + +bool InitializeICU() { +#if (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_SHARED) + // We expect to find the ICU data module alongside the current module. + // Because the module name is ASCII-only, "A" API should be safe. + HMODULE module = LoadLibraryA(ICU_UTIL_DATA_SHARED_MODULE_NAME); + if (!module) + return false; + + FARPROC addr = GetProcAddress(module, ICU_UTIL_DATA_SYMBOL); + if (!addr) + return false; + + UErrorCode err = U_ZERO_ERROR; + udata_setCommonData(reinterpret_cast<void*>(addr), &err); + return err == U_ZERO_ERROR; +#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_STATIC) + // Mac bundles the ICU data in. + return true; +#elif (ICU_UTIL_DATA_IMPL == ICU_UTIL_DATA_FILE) + // We expect to find the ICU data module alongside the current module. + u_setDataDirectory("."); + // Only look for the packaged data file; + // the default behavior is to look for individual files.
+ UErrorCode err = U_ZERO_ERROR; + udata_setFileAccess(UDATA_ONLY_PACKAGES, &err); + return err == U_ZERO_ERROR; +#endif +} + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + + InitializeICU(); + + return RUN_ALL_TESTS(); +} diff --git a/googleurl/src/gurl_unittest.cc b/googleurl/src/gurl_unittest.cc new file mode 100644 index 0000000..4e81de6 --- /dev/null +++ b/googleurl/src/gurl_unittest.cc @@ -0,0 +1,433 @@ +// Copyright 2007 Google Inc. All Rights Reserved. +// Author: brettw@google.com (Brett Wilson) + +#include "googleurl/src/gurl.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_test_utils.h" +#include "testing/gtest/include/gtest/gtest.h" + +// Some implementations of base/basictypes.h may define ARRAYSIZE. +// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro +// which is in our version of basictypes.h. +#ifndef ARRAYSIZE +#define ARRAYSIZE ARRAYSIZE_UNSAFE +#endif + +using url_test_utils::WStringToUTF16; +using url_test_utils::ConvertUTF8ToUTF16; + +namespace { + +template<typename CHAR> +void SetupReplacement(void (url_canon::Replacements<CHAR>::*func)(const CHAR*, + const url_parse::Component&), + url_canon::Replacements<CHAR>* replacements, + const CHAR* str) { + if (str) { + url_parse::Component comp; + if (str[0]) + comp.len = static_cast<int>(strlen(str)); + (replacements->*func)(str, comp); + } +} + +} // namespace + +// Different types of URLs should be handled differently by url_util, and +// handed off to different canonicalizers. +TEST(GURLTest, Types) { + struct TypeTest { + const char* src; + const char* expected; + } type_cases[] = { + // URLs with "://" should be treated as standard and have a hostname, even + // when the scheme is unknown. + {"something:///HOSTNAME.com/", "something://hostname.com/"}, + // In the reverse, lacking a "://" means a path URL so no canonicalization + // should happen. 
+ {"something:HOSTNAME.com/", "something:HOSTNAME.com/"}, + {"something:/HOSTNAME.com/", "something:/HOSTNAME.com/"}, +#ifdef WIN32 + // URLs that look like absolute Windows drive specs. + {"c:\\foo.txt", "file:///C:/foo.txt"}, + {"Z|foo.txt", "file:///Z:/foo.txt"}, + {"\\\\server\\foo.txt", "file://server/foo.txt"}, + {"//server/foo.txt", "file://server/foo.txt"}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE(type_cases); i++) { + GURL gurl(type_cases[i].src); + EXPECT_STREQ(type_cases[i].expected, gurl.spec().c_str()); + } +} + +// Test the basic creation and querying of components in a GURL. We assume +// the parser is already tested and works, so we are mostly interested if the +// object does the right thing with the results. +TEST(GURLTest, Components) { + GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref")); + EXPECT_TRUE(url.is_valid()); + EXPECT_TRUE(url.SchemeIs("http")); + EXPECT_FALSE(url.SchemeIsFile()); + + // This is the narrow version of the URL, which should match the wide input. 
+ EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url.spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("user", url.username()); + EXPECT_EQ("pass", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("99", url.port()); + EXPECT_EQ(99, url.IntPort()); + EXPECT_EQ("/foo;bar", url.path()); + EXPECT_EQ("q=a", url.query()); + EXPECT_EQ("ref", url.ref()); +} + +TEST(GURLTest, Empty) { + GURL url; + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("", url.spec()); + + EXPECT_EQ("", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("", url.host()); + EXPECT_EQ("", url.port()); + EXPECT_EQ(url_parse::PORT_UNSPECIFIED, url.IntPort()); + EXPECT_EQ("", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Copy) { + GURL url(WStringToUTF16(L"http://user:pass@google.com:99/foo;bar?q=a#ref")); + + GURL url2(url); + EXPECT_TRUE(url2.is_valid()); + + EXPECT_EQ("http://user:pass@google.com:99/foo;bar?q=a#ref", url2.spec()); + EXPECT_EQ("http", url2.scheme()); + EXPECT_EQ("user", url2.username()); + EXPECT_EQ("pass", url2.password()); + EXPECT_EQ("google.com", url2.host()); + EXPECT_EQ("99", url2.port()); + EXPECT_EQ(99, url2.IntPort()); + EXPECT_EQ("/foo;bar", url2.path()); + EXPECT_EQ("q=a", url2.query()); + EXPECT_EQ("ref", url2.ref()); + + // Copying of invalid URL should be invalid + GURL invalid; + GURL invalid2(invalid); + EXPECT_FALSE(invalid2.is_valid()); + EXPECT_EQ("", invalid2.spec()); + EXPECT_EQ("", invalid2.scheme()); + EXPECT_EQ("", invalid2.username()); + EXPECT_EQ("", invalid2.password()); + EXPECT_EQ("", invalid2.host()); + EXPECT_EQ("", invalid2.port()); + EXPECT_EQ(url_parse::PORT_UNSPECIFIED, invalid2.IntPort()); + EXPECT_EQ("", invalid2.path()); + EXPECT_EQ("", invalid2.query()); + EXPECT_EQ("", invalid2.ref()); +} + +// Given an invalid URL, we should still get most of the components. 
+TEST(GURLTest, Invalid) { + GURL url("http:google.com:foo"); + EXPECT_FALSE(url.is_valid()); + EXPECT_EQ("http://google.com:foo/", url.possibly_invalid_spec()); + + EXPECT_EQ("http", url.scheme()); + EXPECT_EQ("", url.username()); + EXPECT_EQ("", url.password()); + EXPECT_EQ("google.com", url.host()); + EXPECT_EQ("foo", url.port()); + EXPECT_EQ(url_parse::PORT_INVALID, url.IntPort()); + EXPECT_EQ("/", url.path()); + EXPECT_EQ("", url.query()); + EXPECT_EQ("", url.ref()); +} + +TEST(GURLTest, Resolve) { + // The tricky cases for relative URL resolving are tested in the + // canonicalizer unit test. Here, we just test that the GURL integration + // works properly. + struct ResolveCase { + const char* base; + const char* relative; + bool expected_valid; + const char* expected; + } resolve_cases[] = { + {"http://www.google.com/", "foo.html", true, "http://www.google.com/foo.html"}, + {"http://www.google.com/", "http://images.google.com/foo.html", true, "http://images.google.com/foo.html"}, + {"http://www.google.com/blah/bloo?c#d", "../../../hello/./world.html?a#b", true, "http://www.google.com/hello/world.html?a#b"}, + {"http://www.google.com/foo#bar", "#com", true, "http://www.google.com/foo#com"}, + {"http://www.google.com/", "Https:images.google.com", true, "https://images.google.com/"}, + // Unknown schemes with a "://" should be treated as standard. + {"somescheme://foo/", "bar", true, "somescheme://foo/bar"}, + // Unknown schemes with no "://" are not standard. + {"data:blahblah", "http://google.com/", true, "http://google.com/"}, + {"data:blahblah", "http:google.com", true, "http://google.com/"}, + {"data:/blahblah", "file.html", false, ""}, + }; + + for (size_t i = 0; i < ARRAYSIZE(resolve_cases); i++) { + // 8-bit code path. + GURL input(resolve_cases[i].base); + GURL output = input.Resolve(resolve_cases[i].relative); + EXPECT_EQ(resolve_cases[i].expected_valid, output.is_valid()); + EXPECT_EQ(resolve_cases[i].expected, output.spec()); + + // Wide code path. 
+ GURL inputw(ConvertUTF8ToUTF16(resolve_cases[i].base)); + // Resolve against the base built from UTF-16 so that the string16 + // constructor is exercised together with the string16 Resolve(). + GURL outputw = + inputw.Resolve(ConvertUTF8ToUTF16(resolve_cases[i].relative)); + EXPECT_EQ(resolve_cases[i].expected_valid, outputw.is_valid()); + EXPECT_EQ(resolve_cases[i].expected, outputw.spec()); + } +} + +TEST(GURLTest, GetOrigin) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello,world\");", ""}, + {"http://user:pass@www.google.com:21/blah#baz", "http://www.google.com:21/"}, + {"http://user@www.google.com", "http://www.google.com/"}, + {"http://:pass@www.google.com", "http://www.google.com/"}, + {"http://:@www.google.com", "http://www.google.com/"}, + }; + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + GURL origin = url.GetOrigin(); + EXPECT_EQ(cases[i].expected, origin.spec()); + } +} + +TEST(GURLTest, GetWithEmptyPath) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "http://www.google.com/"}, + {"javascript:window.alert(\"hello, world\");", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "http://www.google.com/"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + GURL empty_path = url.GetWithEmptyPath(); + EXPECT_EQ(cases[i].expected, empty_path.spec()); + } +} + +TEST(GURLTest, Replacements) { + // The url canonicalizer replacement test will handle most of these cases. + // The most important thing to do here is to check that the proper + // canonicalizer gets called based on the scheme of the input.
+ struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; + } replace_cases[] = { + {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"}, + {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"}, + {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"}, +#ifdef WIN32 + {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"}, +#endif + }; + + for (size_t i = 0; i < ARRAYSIZE(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + GURL url(cur.base); + GURL::Replacements repl; + SetupReplacement(&GURL::Replacements::SetScheme, &repl, cur.scheme); + SetupReplacement(&GURL::Replacements::SetUsername, &repl, cur.username); + SetupReplacement(&GURL::Replacements::SetPassword, &repl, cur.password); + SetupReplacement(&GURL::Replacements::SetHost, &repl, cur.host); + SetupReplacement(&GURL::Replacements::SetPort, &repl, cur.port); + SetupReplacement(&GURL::Replacements::SetPath, &repl, cur.path); + SetupReplacement(&GURL::Replacements::SetQuery, &repl, cur.query); + SetupReplacement(&GURL::Replacements::SetRef, &repl, cur.ref); + GURL output = url.ReplaceComponents(repl); + + EXPECT_EQ(replace_cases[i].expected, output.spec()); + } +} + +TEST(GURLTest, PathForRequest) { + struct TestCase { + const char* input; + const char* expected; + } cases[] = { + {"http://www.google.com", "/"}, + {"http://www.google.com/", "/"}, + {"http://www.google.com/foo/bar.html?baz=22", "/foo/bar.html?baz=22"}, + {"http://www.google.com/foo/bar.html#ref", "/foo/bar.html"}, + {"http://www.google.com/foo/bar.html?query#ref", 
"/foo/bar.html?query"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + std::string path_request = url.PathForRequest(); + EXPECT_EQ(cases[i].expected, path_request); + } +} + +TEST(GURLTest, EffectiveIntPort) { + struct PortTest { + const char* spec; + int expected_int_port; + } port_tests[] = { + // http + {"http://www.google.com/", 80}, + {"http://www.google.com:80/", 80}, + {"http://www.google.com:443/", 443}, + + // https + {"https://www.google.com/", 443}, + {"https://www.google.com:443/", 443}, + {"https://www.google.com:80/", 80}, + + // ftp + {"ftp://www.google.com/", 21}, + {"ftp://www.google.com:21/", 21}, + {"ftp://www.google.com:80/", 80}, + + // gopher + {"gopher://www.google.com/", 70}, + {"gopher://www.google.com:70/", 70}, + {"gopher://www.google.com:80/", 80}, + + // file - no port + {"file://www.google.com/", url_parse::PORT_UNSPECIFIED}, + {"file://www.google.com:443/", url_parse::PORT_UNSPECIFIED}, + + // data - no port + {"data:www.google.com:90", url_parse::PORT_UNSPECIFIED}, + {"data:www.google.com", url_parse::PORT_UNSPECIFIED}, + }; + + for (size_t i = 0; i < ARRAYSIZE(port_tests); i++) { + GURL url(port_tests[i].spec); + EXPECT_EQ(port_tests[i].expected_int_port, url.EffectiveIntPort()); + } +} + +TEST(GURLTest, IPAddress) { + struct IPTest { + const char* spec; + bool expected_ip; + } ip_tests[] = { + {"http://www.google.com/", false}, + {"http://192.168.9.1/", true}, + {"http://192.168.9.1.2/", false}, + {"http://192.168.m.1/", false}, + {"http://2001:db8::1/", false}, + {"http://[2001:db8::1]/", true}, + {"", false}, + {"some random input!", false}, + }; + + for (size_t i = 0; i < ARRAYSIZE(ip_tests); i++) { + GURL url(ip_tests[i].spec); + EXPECT_EQ(ip_tests[i].expected_ip, url.HostIsIPAddress()); + } +} + +TEST(GURLTest, HostNoBrackets) { + struct TestCase { + const char* input; + const char* expected_host; + const char* expected_plainhost; + } cases[] = { + {"http://www.google.com", 
"www.google.com", "www.google.com"}, + {"http://[2001:db8::1]/", "[2001:db8::1]", "2001:db8::1"}, + {"http://[::]/", "[::]", "::"}, + + // Don't require a valid URL, but don't crash either. + {"http://[]/", "[]", ""}, + {"http://[x]/", "[x]", "x"}, + {"http://[x/", "[x", "[x"}, + {"http://x]/", "x]", "x]"}, + {"http://[/", "[", "["}, + {"http://]/", "]", "]"}, + {"", "", ""}, + }; + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + GURL url(cases[i].input); + EXPECT_EQ(cases[i].expected_host, url.host()); + EXPECT_EQ(cases[i].expected_plainhost, url.HostNoBrackets()); + } +} + +TEST(GURLTest, DomainIs) { + const char google_domain[] = "google.com"; + + GURL url_1("http://www.google.com:99/foo"); + EXPECT_TRUE(url_1.DomainIs(google_domain)); + + GURL url_2("http://google.com:99/foo"); + EXPECT_TRUE(url_2.DomainIs(google_domain)); + + GURL url_3("http://google.com./foo"); + EXPECT_TRUE(url_3.DomainIs(google_domain)); + + GURL url_4("http://google.com/foo"); + EXPECT_FALSE(url_4.DomainIs("google.com.")); + + GURL url_5("http://google.com./foo"); + EXPECT_TRUE(url_5.DomainIs("google.com.")); + + GURL url_6("http://www.google.com./foo"); + EXPECT_TRUE(url_6.DomainIs(".com.")); + + GURL url_7("http://www.balabala.com/foo"); + EXPECT_FALSE(url_7.DomainIs(google_domain)); + + GURL url_8("http://www.google.com.cn/foo"); + EXPECT_FALSE(url_8.DomainIs(google_domain)); + + GURL url_9("http://www.iamnotgoogle.com/foo"); + EXPECT_FALSE(url_9.DomainIs(google_domain)); + + GURL url_10("http://www.iamnotgoogle.com../foo"); + EXPECT_FALSE(url_10.DomainIs(".com")); +} + +// Newlines should be stripped from inputs. +TEST(GURLTest, Newlines) { + // Constructor. + GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n "); + EXPECT_EQ("http://www.google.com/asdf", url_1.spec()); + + // Relative path resolver. + GURL url_2 = url_1.Resolve(" \n /fo\to\r "); + EXPECT_EQ("http://www.google.com/foo", url_2.spec()); + + // Note that newlines are NOT stripped from ReplaceComponents. 
+} + +TEST(GURLTest, IsStandard) { + GURL a("http:foo/bar"); + EXPECT_TRUE(a.IsStandard()); + + GURL b("foo:bar/baz"); + EXPECT_FALSE(b.IsStandard()); + + GURL c("foo://bar/baz"); + EXPECT_TRUE(c.IsStandard()); +} diff --git a/googleurl/src/url_canon.h b/googleurl/src/url_canon.h new file mode 100644 index 0000000..143574d --- /dev/null +++ b/googleurl/src/url_canon.h @@ -0,0 +1,871 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#ifndef GOOGLEURL_SRC_URL_CANON_H__ +#define GOOGLEURL_SRC_URL_CANON_H__ + +#include <memory.h> +#include <stdlib.h> + +#include "base/string16.h" +#include "googleurl/src/url_parse.h" + +namespace url_canon { + +// Canonicalizer output ------------------------------------------------------- + +// Base class for the canonicalizer output, this maintains a buffer and +// supports simple resizing and append operations on it. +// +// It is VERY IMPORTANT that no virtual function calls be made on the common +// code path. We only have two virtual function calls, the destructor and a +// resize function that is called when the existing buffer is not big enough. +// The derived class is then in charge of setting up our buffer which we will +// manage. +template<typename T> +class CanonOutputT { + public: + CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) { + } + virtual ~CanonOutputT() { + } + + // Implemented to resize the buffer. This function should update the buffer + // pointer to point to the new buffer, and any old data up to |cur_len_| in + // the buffer must be copied over. + // + // The new size |sz| must be larger than buffer_len_. + virtual void Resize(int sz) = 0; + + // Accessor for returning a character at a given position. The input offset + // must be in the valid range. + inline char at(int offset) const { + return buffer_[offset]; + } + + // Sets the character at the given position. The given position MUST be less + // than the length(). + inline void set(int offset, int ch) { + buffer_[offset] = ch; + } + + // Returns the number of characters currently in the buffer. + inline int length() const { + return cur_len_; + } + + // Returns the current capacity of the buffer. The length() is the number of + // characters that have been declared to be written, but the capacity() is + // the number that can be written without reallocation. 
If the caller must + // write many characters at once, it can make sure there is enough capacity, + // write the data, then use set_size() to declare the new length(). + int capacity() const { + return buffer_len_; + } + + // Called by the user of this class to get the output. The output will NOT + // be NULL-terminated. Call length() to get the + // length. + const T* data() const { + return buffer_; + } + T* data() { + return buffer_; + } + + // Shortens the URL to the new length. Used for "backing up" when processing + // relative paths. This can also be used if an external function writes a lot + // of data to the buffer (when using the "Raw" version below) beyond the end, + // to declare the new length. + // + // This MUST NOT be used to expand the size of the buffer beyond capacity(). + void set_length(int new_len) { + cur_len_ = new_len; + } + + // This is the most performance critical function, since it is called for + // every character. + void push_back(T ch) { + // In VC2005, putting this common case first speeds up execution + // dramatically because this branch is predicted as taken. + if (cur_len_ < buffer_len_) { + buffer_[cur_len_] = ch; + cur_len_++; + return; + } + + // Grow the buffer to hold at least one more item. Hopefully we won't have + // to do this very often. + if (!Grow(1)) + return; + + // Actually do the insertion. + buffer_[cur_len_] = ch; + cur_len_++; + } + + // Appends the given string to the output. + void Append(const T* str, int str_len) { + if (cur_len_ + str_len > buffer_len_) { + if (!Grow(cur_len_ + str_len - buffer_len_)) + return; + } + for (int i = 0; i < str_len; i++) + buffer_[cur_len_ + i] = str[i]; + cur_len_ += str_len; + } + + protected: + // Grows the given buffer so that it can fit at least |min_additional| + // characters. Returns true if the buffer could be resized, false on OOM. + bool Grow(int min_additional) { + static const int kMinBufferLen = 16; + int new_len = (buffer_len_ == 0) ? 
kMinBufferLen : buffer_len_; + do { + if (new_len >= (1 << 30)) // Prevent overflow below. + return false; + new_len *= 2; + } while (new_len < buffer_len_ + min_additional); + Resize(new_len); + return true; + } + + T* buffer_; + int buffer_len_; + + // Used characters in the buffer. + int cur_len_; +}; + +// Simple implementation of the CanonOutput using new[]. This class +// also supports a static buffer so if it is allocated on the stack, most +// URLs can be canonicalized with no heap allocations. +template<typename T, int fixed_capacity = 1024> +class RawCanonOutputT : public CanonOutputT<T> { + public: + RawCanonOutputT() : CanonOutputT<T>() { + this->buffer_ = fixed_buffer_; + this->buffer_len_ = fixed_capacity; + } + virtual ~RawCanonOutputT() { + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + } + + virtual void Resize(int sz) { + T* new_buf = new T[sz]; + memcpy(new_buf, this->buffer_, + sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz)); + if (this->buffer_ != fixed_buffer_) + delete[] this->buffer_; + this->buffer_ = new_buf; + this->buffer_len_ = sz; + } + + protected: + T fixed_buffer_[fixed_capacity]; +}; + +// Normally, all canonicalization output is in narrow characters. We support +// the templates so it can also be used internally if a wide buffer is +// required. +typedef CanonOutputT<char> CanonOutput; +typedef CanonOutputT<char16> CanonOutputW; + +template<int fixed_capacity> +class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {}; +template<int fixed_capacity> +class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; + +// Character set converter ---------------------------------------------------- +// +// Converts query strings into a custom encoding. The embedder can supply an +// implementation of this class to interface with their own character set +// conversion libraries. +// +// Embedders will want to see the unit test for the ICU version. 
// Abstract interface for converting UTF-16 query text into the embedder's
// chosen output encoding.
class CharsetConverter {
 public:
  CharsetConverter() {}
  virtual ~CharsetConverter() {}

  // Converts the given input string from UTF-16 to whatever output format the
  // converter supports. This is used only for the query encoding conversion,
  // which does not fail. Instead, the converter should insert "invalid
  // character" characters in the output for invalid sequences, and do the
  // best it can.
  //
  // If the input contains a character not representable in the output
  // character set, the converter should append the HTML entity sequence in
  // decimal, (such as "&#20320;") with escaping of the ampersand, number
  // sign, and semicolon (in the previous example it would be
  // "%26%2320320%3B"). This rule is based on what IE does in this situation.
  virtual void ConvertFromUTF16(const char16* input,
                                int input_len,
                                CanonOutput* output) = 0;
};

// Whitespace -----------------------------------------------------------------

// Searches for whitespace that should be removed from the middle of URLs, and
// removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces
// are preserved, which is what most browsers do. A pointer to the output will
// be returned, and the length of that output will be in |output_len|.
//
// This should be called before parsing if whitespace removal is desired (which
// it normally is when you are canonicalizing).
//
// If no whitespace is removed, this function will not use the buffer and will
// return a pointer to the input, to avoid the extra copy. If modification is
// required, the given |buffer| will be used and the returned pointer will
// point to the beginning of the buffer.
//
// Therefore, callers should not use the buffer, since it may actually be
// empty; use the returned pointer and |*output_len| instead.
// 8-bit and 16-bit overloads.
const char* RemoveURLWhitespace(const char* input, int input_len,
                                CanonOutputT<char>* buffer,
                                int* output_len);
const char16* RemoveURLWhitespace(const char16* input, int input_len,
                                  CanonOutputT<char16>* buffer,
                                  int* output_len);

// IDN ------------------------------------------------------------------------

// Converts the Unicode input representing a hostname to ASCII using IDN rules.
// The output must fall in the ASCII range, but will be encoded in UTF-16.
//
// On success, the output will be filled with the ASCII host name and it will
// return true. Unlike most other canonicalization functions, this assumes that
// the output is empty. The beginning of the host will be at offset 0, and
// the length of the output will be set to the length of the new host name.
//
// On error, returns false. The output in this case is undefined.
bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);

// Piece-by-piece canonicalizers ----------------------------------------------
//
// These individual canonicalizers append the canonicalized versions of the
// corresponding URL component to the given output (a CanonOutput). The spec
// and the previously-identified range of that component are the input. The
// range of the canonicalized component will be written to the output
// component.
//
// These functions all append to the output so they can be chained. Make sure
// the output is empty when you start.
//
// These functions return boolean values indicating success. On failure, they
// will attempt to write something reasonable to the output so that, if
// displayed to the user, they will recognise it as something that's messed up.
// Nothing more should ever be done with these invalid URLs, however.

// Scheme: Appends the scheme and colon to the URL. The output component will
// indicate the range of characters up to but not including the colon.
//
// Canonical URLs always have a scheme. If the scheme is not present in the
// input, this will just write the colon to indicate an empty scheme. Does not
// append slashes which will be needed before any authority components for most
// URLs.
//
// The 8-bit version requires UTF-8 encoding.
bool CanonicalizeScheme(const char* spec,
                        const url_parse::Component& scheme,
                        CanonOutput* output,
                        url_parse::Component* out_scheme);
bool CanonicalizeScheme(const char16* spec,
                        const url_parse::Component& scheme,
                        CanonOutput* output,
                        url_parse::Component* out_scheme);

// User info: username/password. If present, this will add the delimiters so
// the output will be "<username>:<password>@" or "<username>@". Empty
// username/password pairs, or empty passwords, will get converted to
// nonexistent in the canonical version.
//
// The components for the username and password refer to ranges in the
// respective source strings. Usually, these will be the same string, which
// is legal as long as the two components don't overlap.
//
// The 8-bit version requires UTF-8 encoding.
bool CanonicalizeUserInfo(const char* username_source,
                          const url_parse::Component& username,
                          const char* password_source,
                          const url_parse::Component& password,
                          CanonOutput* output,
                          url_parse::Component* out_username,
                          url_parse::Component* out_password);
bool CanonicalizeUserInfo(const char16* username_source,
                          const url_parse::Component& username,
                          const char16* password_source,
                          const url_parse::Component& password,
                          CanonOutput* output,
                          url_parse::Component* out_username,
                          url_parse::Component* out_password);


// This structure holds detailed state exported from the IP/Host canonicalizers.
// Additional fields may be added as callers require them.
struct CanonHostInfo {
  // Starts out classified as NEUTRAL with zero IPv4 components and a
  // default-constructed |out_host|.
  CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}

  // Convenience function to test if family is an IP address (IPV4 or IPV6).
  bool IsIPAddress() const { return family == IPV4 || family == IPV6; }

  // This field summarizes how the input was classified by the canonicalizer.
  enum Family {
    NEUTRAL,   // - Doesn't resemble an IP address. As far as the IP
               //   canonicalizer is concerned, it should be treated as a
               //   hostname.
    BROKEN,    // - Almost an IP, but was not canonicalized. This could be an
               //   IPv4 address where truncation occurred, or something
               //   containing the special characters :[] which did not parse
               //   as an IPv6 address. Never attempt to connect to this
               //   address, because it might actually succeed!
    IPV4,      // - Successfully canonicalized as an IPv4 address.
    IPV6,      // - Successfully canonicalized as an IPv6 address.
  };
  Family family;

  // If |family| is IPV4, then this is the number of nonempty dot-separated
  // components in the input text, from 1 to 4. If |family| is not IPV4,
  // this value is undefined.
  int num_ipv4_components;

  // Location of host within the canonicalized output.
  // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6.
  // CanonicalizeHostVerbose() always sets it.
  url_parse::Component out_host;
};


// Host.
//
// The 8-bit version requires UTF-8 encoding. Use this version when you only
// need to know whether canonicalization succeeded.
bool CanonicalizeHost(const char* spec,
                      const url_parse::Component& host,
                      CanonOutput* output,
                      url_parse::Component* out_host);
bool CanonicalizeHost(const char16* spec,
                      const url_parse::Component& host,
                      CanonOutput* output,
                      url_parse::Component* out_host);

// Extended version of CanonicalizeHost, which returns additional information.
// Use this when you need to know whether the hostname was an IP address.
// A successful return is indicated by host_info->family != BROKEN. See the
// definition of CanonHostInfo above for details.
// 8-bit and 16-bit overloads; the result is reported through |*host_info|
// rather than a return value.
void CanonicalizeHostVerbose(const char* spec,
                             const url_parse::Component& host,
                             CanonOutput* output,
                             CanonHostInfo* host_info);
void CanonicalizeHostVerbose(const char16* spec,
                             const url_parse::Component& host,
                             CanonOutput* output,
                             CanonHostInfo* host_info);


// IP addresses.
//
// Tries to interpret the given host name as an IPv4 or IPv6 address. If it is
// an IP address, it will canonicalize it as such, appending it to |output|.
// Additional status information is returned via the |*host_info| parameter.
// See the definition of CanonHostInfo above for details.
//
// This is called AUTOMATICALLY from the host canonicalizer, which ensures that
// the input is unescaped and name-prepped, etc. It should not normally be
// necessary or wise to call this directly.
void CanonicalizeIPAddress(const char* spec,
                           const url_parse::Component& host,
                           CanonOutput* output,
                           CanonHostInfo* host_info);
void CanonicalizeIPAddress(const char16* spec,
                           const url_parse::Component& host,
                           CanonOutput* output,
                           CanonHostInfo* host_info);

// Port: this function will add the colon for the port if a port is present.
// The caller can pass url_parse::PORT_UNSPECIFIED as the
// default_port_for_scheme argument if there is no default port.
//
// The 8-bit version requires UTF-8 encoding.
bool CanonicalizePort(const char* spec,
                      const url_parse::Component& port,
                      int default_port_for_scheme,
                      CanonOutput* output,
                      url_parse::Component* out_port);
bool CanonicalizePort(const char16* spec,
                      const url_parse::Component& port,
                      int default_port_for_scheme,
                      CanonOutput* output,
                      url_parse::Component* out_port);

// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
// if the scheme is unknown.
int DefaultPortForScheme(const char* scheme, int scheme_len);

// Path. If the input does not begin in a slash (including if the input is
// empty), we'll prepend a slash to the path to make it canonical.
//
// The 8-bit version assumes UTF-8 encoding, but does not verify the validity
// of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid
// characters, etc.). Normally, URLs will come in as UTF-16, so this isn't
// an issue. Somebody giving us an 8-bit path is responsible for generating
// the path that the server expects (we'll escape high-bit characters), so
// if something is invalid, it's their problem.
bool CanonicalizePath(const char* spec,
                      const url_parse::Component& path,
                      CanonOutput* output,
                      url_parse::Component* out_path);
bool CanonicalizePath(const char16* spec,
                      const url_parse::Component& path,
                      CanonOutput* output,
                      url_parse::Component* out_path);

// Canonicalizes the input as a file path. This is like CanonicalizePath except
// that it also handles Windows drive specs. For example, the path can begin
// with "c|\" and it will get properly canonicalized to "C:/".
// The string will be appended to |*output| and |*out_path| will be updated.
//
// The 8-bit version requires UTF-8 encoding.
bool FileCanonicalizePath(const char* spec,
                          const url_parse::Component& path,
                          CanonOutput* output,
                          url_parse::Component* out_path);
bool FileCanonicalizePath(const char16* spec,
                          const url_parse::Component& path,
                          CanonOutput* output,
                          url_parse::Component* out_path);

// Query: Prepends the ? if needed.
//
// The 8-bit version requires the input to be UTF-8 encoded. Incorrectly
// encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode
// "invalid character." This function cannot fail; we always just try to do
// our best for crazy input here since web pages can set it themselves.
//
// This will convert the given input into the output encoding that the given
// character set converter object provides. The converter will only be called
// if necessary; for ASCII input, no conversion is needed.
//
// The converter can be NULL. In this case, the output encoding will be UTF-8.
// 8-bit and 16-bit overloads; |converter| may be NULL.
void CanonicalizeQuery(const char* spec,
                       const url_parse::Component& query,
                       CharsetConverter* converter,
                       CanonOutput* output,
                       url_parse::Component* out_query);
void CanonicalizeQuery(const char16* spec,
                       const url_parse::Component& query,
                       CharsetConverter* converter,
                       CanonOutput* output,
                       url_parse::Component* out_query);

// Ref: Prepends the # if needed. The output will be UTF-8 (this is the only
// canonicalizer that does not produce ASCII output). The output is
// guaranteed to be valid UTF-8.
//
// This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use
// the "Unicode replacement character" for the confusing bits and copy the rest.
//
// NOTE(review): the parameters are named |path| / |out_path| here although
// this declaration is documented as handling the ref; presumably they denote
// the ref component -- confirm against the definition.
void CanonicalizeRef(const char* spec,
                     const url_parse::Component& path,
                     CanonOutput* output,
                     url_parse::Component* out_path);
void CanonicalizeRef(const char16* spec,
                     const url_parse::Component& path,
                     CanonOutput* output,
                     url_parse::Component* out_path);

// Full canonicalizer ---------------------------------------------------------
//
// These functions replace any string contents, rather than append as above.
// See the above piece-by-piece functions for information specific to
// canonicalizing individual components.
//
// The output will be ASCII except the reference fragment, which may be UTF-8.
//
// The 8-bit versions require UTF-8 encoding.

// Use for standard URLs with authorities and paths.
bool CanonicalizeStandardURL(const char* spec,
                             int spec_len,
                             const url_parse::Parsed& parsed,
                             CharsetConverter* query_converter,
                             CanonOutput* output,
                             url_parse::Parsed* new_parsed);
bool CanonicalizeStandardURL(const char16* spec,
                             int spec_len,
                             const url_parse::Parsed& parsed,
                             CharsetConverter* query_converter,
                             CanonOutput* output,
                             url_parse::Parsed* new_parsed);

// Use for file URLs.
// 8-bit and 16-bit overloads.
bool CanonicalizeFileURL(const char* spec,
                         int spec_len,
                         const url_parse::Parsed& parsed,
                         CharsetConverter* query_converter,
                         CanonOutput* output,
                         url_parse::Parsed* new_parsed);
bool CanonicalizeFileURL(const char16* spec,
                         int spec_len,
                         const url_parse::Parsed& parsed,
                         CharsetConverter* query_converter,
                         CanonOutput* output,
                         url_parse::Parsed* new_parsed);

// Use for path URLs such as javascript. This does not modify the path in any
// way, for example, by escaping it.
bool CanonicalizePathURL(const char* spec,
                         int spec_len,
                         const url_parse::Parsed& parsed,
                         CanonOutput* output,
                         url_parse::Parsed* new_parsed);
bool CanonicalizePathURL(const char16* spec,
                         int spec_len,
                         const url_parse::Parsed& parsed,
                         CanonOutput* output,
                         url_parse::Parsed* new_parsed);

// Use for mailto URLs. This "canonicalizes" the url into a path and query
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
// the query encoding if there is a query. This is because a mailto URL is
// really intended for an external mail program, and the encoding of a page,
// etc. which would influence a query encoding normally are irrelevant.
bool CanonicalizeMailtoURL(const char* spec,
                           int spec_len,
                           const url_parse::Parsed& parsed,
                           CanonOutput* output,
                           url_parse::Parsed* new_parsed);
bool CanonicalizeMailtoURL(const char16* spec,
                           int spec_len,
                           const url_parse::Parsed& parsed,
                           CanonOutput* output,
                           url_parse::Parsed* new_parsed);

// Part replacer --------------------------------------------------------------

// Internal structure used for storing separate strings for each component.
// The basic canonicalization functions use this structure internally so that
// component replacement (different strings for different components) can be
// treated on the same code path as regular canonicalization (the same string
// for each component).
//
// A url_parse::Parsed structure usually goes along with this. Those
// components identify offsets within these strings, so that they can all be
// in the same string, or spread arbitrarily across different ones.
//
// This structure does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
// modified.
template<typename CHAR>
struct URLComponentSource {
  // Constructor normally used by callers wishing to replace components. This
  // will make them all NULL, which is no replacement. The caller would then
  // override the components they want to replace.
  URLComponentSource()
      : scheme(NULL),
        username(NULL),
        password(NULL),
        host(NULL),
        port(NULL),
        path(NULL),
        query(NULL),
        ref(NULL) {
  }

  // Constructor normally used internally to initialize all the components to
  // point to the same spec.
  explicit URLComponentSource(const CHAR* default_value)
      : scheme(default_value),
        username(default_value),
        password(default_value),
        host(default_value),
        port(default_value),
        path(default_value),
        query(default_value),
        ref(default_value) {
  }

  // One source-string pointer per URL component (not owned).
  const CHAR* scheme;
  const CHAR* username;
  const CHAR* password;
  const CHAR* host;
  const CHAR* port;
  const CHAR* path;
  const CHAR* query;
  const CHAR* ref;
};

// This structure encapsulates information on modifying a URL. Each component
// may either be left unchanged, replaced, or deleted.
//
// By default, each component is unchanged. For those components that should be
// modified, call either Set* or Clear* to modify it.
//
// The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT
// IN SCOPE BY THE CALLER for as long as this object exists!
//
// Prefer the 8-bit replacement version if possible since it is more efficient.
+template<typename CHAR> +class Replacements { + public: + Replacements() { + } + + // Scheme + void SetScheme(const CHAR* s, const url_parse::Component& comp) { + sources_.scheme = s; + components_.scheme = comp; + } + // Note: we don't have a ClearScheme since this doesn't make any sense. + bool IsSchemeOverridden() const { return sources_.scheme != NULL; } + + // Username + void SetUsername(const CHAR* s, const url_parse::Component& comp) { + sources_.username = s; + components_.username = comp; + } + void ClearUsername() { + sources_.username = Placeholder(); + components_.username = url_parse::Component(); + } + bool IsUsernameOverridden() const { return sources_.username != NULL; } + + // Password + void SetPassword(const CHAR* s, const url_parse::Component& comp) { + sources_.password = s; + components_.password = comp; + } + void ClearPassword() { + sources_.password = Placeholder(); + components_.password = url_parse::Component(); + } + bool IsPasswordOverridden() const { return sources_.password != NULL; } + + // Host + void SetHost(const CHAR* s, const url_parse::Component& comp) { + sources_.host = s; + components_.host = comp; + } + void ClearHost() { + sources_.host = Placeholder(); + components_.host = url_parse::Component(); + } + bool IsHostOverridden() const { return sources_.host != NULL; } + + // Port + void SetPort(const CHAR* s, const url_parse::Component& comp) { + sources_.port = s; + components_.port = comp; + } + void ClearPort() { + sources_.port = Placeholder(); + components_.port = url_parse::Component(); + } + bool IsPortOverridden() const { return sources_.port != NULL; } + + // Path + void SetPath(const CHAR* s, const url_parse::Component& comp) { + sources_.path = s; + components_.path = comp; + } + void ClearPath() { + sources_.path = Placeholder(); + components_.path = url_parse::Component(); + } + bool IsPathOverridden() const { return sources_.path != NULL; } + + // Query + void SetQuery(const CHAR* s, const 
url_parse::Component& comp) { + sources_.query = s; + components_.query = comp; + } + void ClearQuery() { + sources_.query = Placeholder(); + components_.query = url_parse::Component(); + } + bool IsQueryOverridden() const { return sources_.query != NULL; } + + // Ref + void SetRef(const CHAR* s, const url_parse::Component& comp) { + sources_.ref = s; + components_.ref = comp; + } + void ClearRef() { + sources_.ref = Placeholder(); + components_.ref = url_parse::Component(); + } + bool IsRefOverridden() const { return sources_.ref != NULL; } + + // Getters for the itnernal data. See the variables below for how the + // information is encoded. + const URLComponentSource<CHAR>& sources() const { return sources_; } + const url_parse::Parsed& components() const { return components_; } + + private: + // Returns a pointer to a static empty string that is used as a placeholder + // to indicate a component should be deleted (see below). + const CHAR* Placeholder() { + static const CHAR empty_string = 0; + return &empty_string; + } + + // We support three states: + // + // Action | Source Component + // -----------------------+-------------------------------------------------- + // Don't change component | NULL (unused) + // Replace component | (replacement string) (replacement component) + // Delete component | (non-NULL) (invalid component: (0,-1)) + // + // We use a pointer to the empty string for the source when the component + // should be deleted. + URLComponentSource<CHAR> sources_; + url_parse::Parsed components_; +}; + +// The base must be an 8-bit canonical URL. 
// Overloads differ only in the character type of the replacement strings;
// |base| is always 8-bit.
bool ReplaceStandardURL(const char* base,
                        const url_parse::Parsed& base_parsed,
                        const Replacements<char>& replacements,
                        CharsetConverter* query_converter,
                        CanonOutput* output,
                        url_parse::Parsed* new_parsed);
bool ReplaceStandardURL(const char* base,
                        const url_parse::Parsed& base_parsed,
                        const Replacements<char16>& replacements,
                        CharsetConverter* query_converter,
                        CanonOutput* output,
                        url_parse::Parsed* new_parsed);

// Replacing some parts of a file URL is not permitted. Everything except
// the host, path, query, and ref will be ignored.
bool ReplaceFileURL(const char* base,
                    const url_parse::Parsed& base_parsed,
                    const Replacements<char>& replacements,
                    CharsetConverter* query_converter,
                    CanonOutput* output,
                    url_parse::Parsed* new_parsed);
bool ReplaceFileURL(const char* base,
                    const url_parse::Parsed& base_parsed,
                    const Replacements<char16>& replacements,
                    CharsetConverter* query_converter,
                    CanonOutput* output,
                    url_parse::Parsed* new_parsed);

// Path URLs can only have the scheme and path replaced. All other components
// will be ignored.
bool ReplacePathURL(const char* base,
                    const url_parse::Parsed& base_parsed,
                    const Replacements<char>& replacements,
                    CanonOutput* output,
                    url_parse::Parsed* new_parsed);
bool ReplacePathURL(const char* base,
                    const url_parse::Parsed& base_parsed,
                    const Replacements<char16>& replacements,
                    CanonOutput* output,
                    url_parse::Parsed* new_parsed);

// Mailto URLs can only have the scheme, path, and query replaced.
// All other components will be ignored.
// Overloads differ only in the character type of the replacement strings;
// |base| is always 8-bit.
bool ReplaceMailtoURL(const char* base,
                      const url_parse::Parsed& base_parsed,
                      const Replacements<char>& replacements,
                      CanonOutput* output,
                      url_parse::Parsed* new_parsed);
bool ReplaceMailtoURL(const char* base,
                      const url_parse::Parsed& base_parsed,
                      const Replacements<char16>& replacements,
                      CanonOutput* output,
                      url_parse::Parsed* new_parsed);

// Relative URL ---------------------------------------------------------------

// Given an input URL or URL fragment |fragment|, determines if it is a
// relative or absolute URL and places the result into |*is_relative|. If it is
// relative, the relevant portion of the URL will be placed into
// |*relative_component| (there may have been trimmed whitespace, for example).
// This value is passed to ResolveRelativeURL. If the input is not relative,
// this value is UNDEFINED (it may be changed by the function).
//
// Returns true on success (we successfully determined the URL is relative or
// not). Failure means that the combination of URLs doesn't make any sense.
//
// The base URL should always be canonical, therefore is ASCII.
bool IsRelativeURL(const char* base,
                   const url_parse::Parsed& base_parsed,
                   const char* fragment,
                   int fragment_len,
                   bool is_base_hierarchical,
                   bool* is_relative,
                   url_parse::Component* relative_component);
bool IsRelativeURL(const char* base,
                   const url_parse::Parsed& base_parsed,
                   const char16* fragment,
                   int fragment_len,
                   bool is_base_hierarchical,
                   bool* is_relative,
                   url_parse::Component* relative_component);

// Given a canonical parsed source URL, a URL fragment known to be relative,
// and the identified relevant portion of the relative URL (computed by
// IsRelativeURL), this produces a new parsed canonical URL in |output| and
// |out_parsed|.
//
// It also requires a flag indicating whether the base URL is a file: URL
// which triggers additional logic.
+// +// The base URL should be canonical and have a host (may be empty for file +// URLs) and a path. If it doesn't have these, we can't resolve relative +// URLs off of it and will return the base as the output with an error flag. +// Becausee it is canonical is should also be ASCII. +// +// The query charset converter follows the same rules as CanonicalizeQuery. +// +// Returns true on success. On failure, the output will be "something +// reasonable" that will be consistent and valid, just probably not what +// was intended by the web page author or caller. +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_H__ diff --git a/googleurl/src/url_canon_etc.cc b/googleurl/src/url_canon_etc.cc new file mode 100644 index 0000000..672b187 --- /dev/null +++ b/googleurl/src/url_canon_etc.cc @@ -0,0 +1,391 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Canonicalizers for random bits that aren't big enough for their own files. + +#include <string.h> + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +// Returns true if the given character should be removed from the middle of a +// URL. +inline bool IsRemovableURLWhitespace(int ch) { + return ch == '\r' || ch == '\n' || ch == '\t'; +} + +// Backend for RemoveURLWhitespace (see declaration in url_canon.h). +// It sucks that we have to do this, since this takes about 13% of the total URL +// canonicalization time. +template<typename CHAR> +const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, + CanonOutputT<CHAR>* buffer, + int* output_len) { + // Fast verification that there's nothing that needs removal. This is the 99% + // case, so we want it to be fast and don't care about impacting the speed + // when we do find whitespace. 
+ int found_whitespace = false; + for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) + continue; + found_whitespace = true; + break; + } + + if (!found_whitespace) { + // Didn't find any whitespace, we don't need to do anything. We can just + // return the input as the output. + *output_len = input_len; + return input; + } + + // Remove the whitespace into the new buffer and return it. + for (int i = 0; i < input_len; i++) { + if (!IsRemovableURLWhitespace(input[i])) + buffer->push_back(input[i]); + } + *output_len = buffer->length(); + return buffer->data(); +} + +// Contains the canonical version of each possible input letter in the scheme +// (basically, lower-cased). The corresponding entry will be 0 if the letter +// is not allowed in a scheme. +const char kSchemeCanonical[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , +// @ A B C D E F G H I J K L M N O + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, +// ` a b c d e f g h i j k l m n o + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; + +// This could be a table lookup as well by setting the high bit for each +// valid character, but it's only called once per URL, and it makes the lookup +// table easier to read not having extra stuff in it. 
+inline bool IsSchemeFirstChar(unsigned char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +template<typename CHAR, typename UCHAR> +bool DoScheme(const CHAR* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme) { + if (scheme.len <= 0) { + // Scheme is unspecified or empty, convert to empty by appending a colon. + *out_scheme = url_parse::Component(output->length(), 0); + output->push_back(':'); + return true; + } + + // The output scheme starts from the current position. + out_scheme->begin = output->length(); + + bool success = true; + int end = scheme.end(); + for (int i = scheme.begin; i < end; i++) { + UCHAR ch = static_cast<UCHAR>(spec[i]); + char replacement = 0; + if (ch < 0x80) { + if (i == scheme.begin) { + // Need to do a special check for the first letter of the scheme. + if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) + replacement = kSchemeCanonical[ch]; + } else { + replacement = kSchemeCanonical[ch]; + } + } + + if (replacement) { + output->push_back(replacement); + } else if (ch == '%') { + // Canonicalizing the scheme multiple times should lead to the same + // result. Since invalid characters will be escaped, we need to preserve + // the percent to avoid multiple escaping. The scheme will be invalid. + success = false; + output->push_back('%'); + } else { + // Invalid character, store it but mark this scheme as invalid. + success = false; + + // This will escape the output and also handle encoding issues. + // Ignore the return value since we already failed. + AppendUTF8EscapedChar(spec, &i, end, output); + } + } + + // The output scheme ends with the the current position, before appending + // the colon. + out_scheme->len = output->length() - out_scheme->begin; + output->push_back(':'); + return success; +} + +// The username and password components reference ranges in the corresponding +// *_spec strings. 
Typically, these specs will be the same (we're +// canonicalizing a single source string), but may be different when +// replacing components. +template<typename CHAR, typename UCHAR> +bool DoUserInfo(const CHAR* username_spec, + const url_parse::Component& username, + const CHAR* password_spec, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password) { + if (username.len <= 0 && password.len <= 0) { + // Common case: no user info. We strip empty username/passwords. + *out_username = url_parse::Component(); + *out_password = url_parse::Component(); + return true; + } + + // Write the username. + out_username->begin = output->length(); + if (username.len > 0) { + // This will escape characters not valid for the username. + AppendStringOfType(&username_spec[username.begin], username.len, + CHAR_USERINFO, output); + } + out_username->len = output->length() - out_username->begin; + + // When there is a password, we need the separator. Note that we strip + // empty but specified passwords. + if (password.len > 0) { + output->push_back(':'); + out_password->begin = output->length(); + AppendStringOfType(&password_spec[password.begin], password.len, + CHAR_USERINFO, output); + out_password->len = output->length() - out_password->begin; + } else { + *out_password = url_parse::Component(); + } + + output->push_back('@'); + return true; +} + +// Helper functions for converting port integers to strings. +inline void WritePortInt(char* output, int output_len, int port) { + _itoa_s(port, output, output_len, 10); +} +inline void WritePortInt(char16* output, int output_len, int port) { + _itow_s(port, output, output_len, 10); +} + +// This function will prepend the colon if there will be a port. 
+template<typename CHAR, typename UCHAR> +bool DoPort(const CHAR* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port) { + int port_num = url_parse::ParsePort(spec, port); + if (port_num == url_parse::PORT_UNSPECIFIED || + port_num == default_port_for_scheme) { + *out_port = url_parse::Component(); + return true; // Leave port empty. + } + + if (port_num == url_parse::PORT_INVALID) { + // Invalid port: We'll copy the text from the input so the user can see + // what the error was, and mark the URL as invalid by returning false. + output->push_back(':'); + out_port->begin = output->length(); + AppendInvalidNarrowString(spec, port.begin, port.end(), output); + out_port->len = output->length() - out_port->begin; + return false; + } + + // Convert port number back to an integer. Max port value is 5 digits, and + // the Parsed::ExtractPort will have made sure the integer is in range. + const int buf_size = 6; + char buf[buf_size]; + WritePortInt(buf, buf_size, port_num); + + // Append the port number to the output, preceeded by a colon. + output->push_back(':'); + out_port->begin = output->length(); + for (int i = 0; i < buf_size && buf[i]; i++) + output->push_back(buf[i]); + + out_port->len = output->length() - out_port->begin; + return true; +} + +template<typename CHAR, typename UCHAR> +void DoCanonicalizeRef(const CHAR* spec, + const url_parse::Component& ref, + CanonOutput* output, + url_parse::Component* out_ref) { + if (ref.len < 0) { + // Common case of no ref. + *out_ref = url_parse::Component(); + return; + } + + // Append the ref separator. Note that we need to do this even when the ref + // is empty but present. + output->push_back('#'); + out_ref->begin = output->length(); + + // Now iterate through all the characters, converting to UTF-8 and validating. + int end = ref.end(); + for (int i = ref.begin; i < end; i++) { + if (spec[i] == 0) { + // IE just strips NULLs, so we do too. 
+ continue; + } else if (static_cast<UCHAR>(spec[i]) < 0x20) { + // Unline IE seems to, we escape control characters. This will probably + // make the reference fragment unusable on a web page, but people + // shouldn't be using control characters in their anchor names. + AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); + } else if (static_cast<UCHAR>(spec[i]) < 0x80) { + // Normal ASCII characters are just appended. + output->push_back(static_cast<char>(spec[i])); + } else { + // Non-ASCII characters are appended unescaped, but only when they are + // valid. Invalid Unicode characters are replaced with the "invalid + // character" as IE seems to. + unsigned code_point; + if (!ReadUTFChar(spec, &i, end, &code_point)) + AppendUTF8Value(kUnicodeReplacementCharacter, output); + else + AppendUTF8Value(code_point, output); + } + } + + out_ref->len = output->length() - out_ref->begin; +} + +} // namespace + +const char* RemoveURLWhitespace(const char* input, int input_len, + CanonOutputT<char>* buffer, + int* output_len) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len); +} + +const char16* RemoveURLWhitespace(const char16* input, int input_len, + CanonOutputT<char16>* buffer, + int* output_len) { + return DoRemoveURLWhitespace(input, input_len, buffer, output_len); +} + +char CanonicalSchemeChar(char16 ch) { + if (ch >= 0x80) + return 0; // Non-ASCII is not supported by schemes. 
+ return kSchemeCanonical[ch]; +} + +bool CanonicalizeScheme(const char* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme) { + return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); +} + +bool CanonicalizeScheme(const char16* spec, + const url_parse::Component& scheme, + CanonOutput* output, + url_parse::Component* out_scheme) { + return DoScheme<char16, char16>(spec, scheme, output, out_scheme); +} + +bool CanonicalizeUserInfo(const char* username_source, + const url_parse::Component& username, + const char* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password) { + return DoUserInfo<char, unsigned char>( + username_source, username, password_source, password, + output, out_username, out_password); +} + +bool CanonicalizeUserInfo(const char16* username_source, + const url_parse::Component& username, + const char16* password_source, + const url_parse::Component& password, + CanonOutput* output, + url_parse::Component* out_username, + url_parse::Component* out_password) { + return DoUserInfo<char16, char16>( + username_source, username, password_source, password, + output, out_username, out_password); +} + +bool CanonicalizePort(const char* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port) { + return DoPort<char, unsigned char>(spec, port, + default_port_for_scheme, + output, out_port); +} + +bool CanonicalizePort(const char16* spec, + const url_parse::Component& port, + int default_port_for_scheme, + CanonOutput* output, + url_parse::Component* out_port) { + return DoPort<char16, char16>(spec, port, default_port_for_scheme, + output, out_port); +} + +void CanonicalizeRef(const char* spec, + const url_parse::Component& ref, + CanonOutput* output, + url_parse::Component* out_ref) { + DoCanonicalizeRef<char, unsigned 
char>(spec, ref, output, out_ref); +} + +void CanonicalizeRef(const char16* spec, + const url_parse::Component& ref, + CanonOutput* output, + url_parse::Component* out_ref) { + DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_fileurl.cc b/googleurl/src/url_canon_fileurl.cc new file mode 100644 index 0000000..97023eb --- /dev/null +++ b/googleurl/src/url_canon_fileurl.cc @@ -0,0 +1,215 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions for canonicalizing "file:" URLs. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + +#ifdef WIN32 + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. If there is not a drive +// spec, it won't do anything. The index of the next character in the input +// spec is returned (after the colon when a drive spec is found, the begin +// offset if one is not). +template<typename CHAR> +int FileDoDriveSpec(const CHAR* spec, int begin, int end, + CanonOutput* output) { + // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo, + // (with backslashes instead of slashes as well). + int num_slashes = url_parse::CountConsecutiveSlashes(spec, begin, end); + int after_slashes = begin + num_slashes; + + if (!url_parse::DoesBeginWindowsDriveSpec(spec, after_slashes, end)) + return begin; // Haven't consumed any characters + + // A drive spec is the start of a path, so we need to add a slash for the + // authority terminator (typically the third slash). + output->push_back('/'); + + // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid + // and that it is followed by a colon/pipe. 
+ + // Normalize Windows drive letters to uppercase + if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z') + output->push_back(spec[after_slashes] - 'a' + 'A'); + else + output->push_back(static_cast<char>(spec[after_slashes])); + + // Normalize the character following it to a colon rather than pipe. + output->push_back(':'); + return after_slashes + 2; +} + +#endif // WIN32 + +template<typename CHAR, typename UCHAR> +bool DoFileCanonicalizePath(const CHAR* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + // Copies and normalizes the "c:" at the beginning, if present. + out_path->begin = output->length(); + int after_drive; +#ifdef WIN32 + after_drive = FileDoDriveSpec(spec, path.begin, path.end(), output); +#else + after_drive = path.begin; +#endif + + // Copies the rest of the path, starting from the slash following the + // drive colon (if any, Windows only), or the first slash of the path. + bool success = true; + if (after_drive < path.end()) { + // Use the regular path canonicalizer to canonicalize the rest of the + // path. Give it a fake output component to write into. DoCanonicalizeFile + // will compute the full path component. + url_parse::Component sub_path = + url_parse::MakeRange(after_drive, path.end()); + url_parse::Component fake_output_path; + success = CanonicalizePath(spec, sub_path, output, &fake_output_path); + } else { + // No input path, canonicalize to a slash. + output->push_back('/'); + } + + out_path->len = output->length() - out_path->begin; + return success; +} + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + // Things we don't set in file: URLs. 
+ new_parsed->username = url_parse::Component(); + new_parsed->password = url_parse::Component(); + new_parsed->port = url_parse::Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). + new_parsed->scheme.begin = output->length(); + output->Append("file://", 7); + new_parsed->scheme.len = 4; + + // Append the host. For many file URLs, this will be empty. For UNC, this + // will be present. + // TODO(brettw) This doesn't do any checking for host name validity. We + // should probably handle validity checking of UNC hosts differently than + // for regular IP hosts. + bool success = CanonicalizeHost(source.host, parsed.host, + output, &new_parsed->host); + success &= DoFileCanonicalizePath<CHAR, UCHAR>(source.path, parsed.path, + output, &new_parsed->path); + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + + // Ignore failure for refs since the URL can probably still be loaded. 
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace + +bool CanonicalizeFileURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeFileURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, query_converter, + output, new_parsed); +} + +bool CanonicalizeFileURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeFileURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, query_converter, + output, new_parsed); +} + +bool FileCanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + return DoFileCanonicalizePath<char, unsigned char>(spec, path, + output, out_path); +} + +bool FileCanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + return DoFileCanonicalizePath<char16, char16>(spec, path, + output, out_path); +} + +bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeFileURL<char, unsigned char>( + source, parsed, query_converter, output, new_parsed); +} + +bool ReplaceFileURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + 
url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeFileURL<char, unsigned char>( + source, parsed, query_converter, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_host.cc b/googleurl/src/url_canon_host.cc new file mode 100644 index 0000000..6642004 --- /dev/null +++ b/googleurl/src/url_canon_host.cc @@ -0,0 +1,401 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/logging.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +// For reference, here's what IE supports: +// Key: 0 (disallowed: failure if present in the input) +// + (allowed either escaped or unescaped, and unmodified) +// U (allowed escaped or unescaped but always unescaped if present in +// escaped form) +// E (allowed escaped or unescaped but always escaped if present in +// unescaped form) +// % (only allowed escaped in the input, will be unmodified). +// I left blank alpha numeric characters. +// +// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f +// ----------------------------------------------- +// 0 0 E E E E E E E E E E E E E E E +// 1 E E E E E E E E E E E E E E E E +// 2 E + E E + E + + + + + + + U U 0 +// 3 % % E + E 0 <-- Those are : ; < = > ? +// 4 % +// 5 U 0 U U U <-- Those are [ \ ] ^ _ +// 6 E <-- That's ` +// 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) +// +// NOTE: I didn't actually test all the control characters. Some may be +// disallowed in the input, but they are all accepted escaped except for 0. +// I also didn't test if characters affecting HTML parsing are allowed +// unescaped, eg. (") or (#), which would indicate the beginning of the path. +// Surprisingly, space is accepted in the input and always escaped. 
+ +// This table lists the canonical version of all characters we allow in the +// input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar +// value to indicate that this character should be escaped. We are a little more +// restrictive than IE, but less restrictive than Firefox. +// +// Note that we disallow the % character. We will allow it when part of an +// escape sequence, of course, but this disallows "%25". Even though IE allows +// it, allowing it would put us in a funny state. If there was an invalid +// escape sequence like "%zz", we'll add "%25zz" to the output and fail. +// Allowing percents means we'll succeed a second time, so validity would change +// based on how many times you run the canonicalizer. We prefer to always report +// the same vailidity, so reject this. +const unsigned char kEsc = 0xff; +const unsigned char kHostCharLookup[0x80] = { +// 00-1f: all are invalid + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// ' ' ! " # $ % & ' ( ) * + , - . / + kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 , +// @ A B C D E F G H I J K L M N O + kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// P Q R S T U V W X Y Z [ \ ] ^ _ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_', +// ` a b c d e f g h i j k l m n o + kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +// p q r s t u v w x y z { | } ~ + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 }; + +const int kTempHostBufferLen = 1024; +typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer; +typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW; + +// Scans a host name and fills in the output flags according to what we find. 
+// |has_non_ascii| will be true if there are any non-7-bit characters, and
+// |has_escaped| will be true if there is a percent sign.
+template<typename CHAR, typename UCHAR>
+void ScanHostname(const CHAR* spec, const url_parse::Component& host,
+                  bool* has_non_ascii, bool* has_escaped) {
+  int end = host.end();
+  *has_non_ascii = false;
+  *has_escaped = false;
+  for (int i = host.begin; i < end; i++) {
+    // Compare as unsigned so that a plain (possibly signed) char holding a
+    // high-bit byte is still detected as non-ASCII.
+    if (static_cast<UCHAR>(spec[i]) >= 0x80)
+      *has_non_ascii = true;
+    else if (spec[i] == '%')
+      *has_escaped = true;
+  }
+}
+
+// Canonicalizes a host name that is entirely 8-bit characters (even though
+// the type holding them may be 16 bits). Escaped characters will be
+// unescaped. Non-7-bit characters (for example, UTF-8) will be passed
+// unchanged.
+//
+// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
+// the output.
+//
+// This function is used in two situations:
+//
+//  * When the caller knows there is no non-ASCII or percent escaped
+//    characters. This is what DoHost does. The result will be a completely
+//    canonicalized host since we know nothing weird can happen (escaped
+//    characters could be unescaped to non-7-bit, so they have to be treated
+//    with suspicion at this point). It does not use the |has_non_ascii| flag.
+//
+//  * When the caller has an 8-bit string that may need unescaping.
+//    DoComplexHost calls us in this situation to do unescaping and
+//    validation. After this, it may do other IDN operations depending on the
+//    value of the |*has_non_ascii| flag.
+//
+// The return value indicates if the output is a potentially valid host name.
+template<typename INCHAR, typename OUTCHAR>
+bool DoSimpleHost(const INCHAR* host,
+                  int host_len,
+                  CanonOutputT<OUTCHAR>* output,
+                  bool* has_non_ascii) {
+  *has_non_ascii = false;
+
+  bool success = true;
+  for (int i = 0; i < host_len; ++i) {
+    unsigned int source = host[i];
+    if (source == '%') {
+      // Unescape first, if possible. The decoded byte is received in a real
+      // unsigned char and then widened. Aliasing the first byte of |source|
+      // instead would store the value in the high-order byte on big-endian
+      // targets.
+      unsigned char unescaped = 0;
+      if (!DecodeEscaped(host, &i, host_len, &unescaped)) {
+        // Invalid escaped character. There is nothing that can make this
+        // host valid. We append an escaped percent so the URL looks reasonable
+        // and mark as failed.
+        AppendEscapedChar('%', output);
+        success = false;
+        continue;
+      }
+      source = unescaped;
+    }
+
+    if (source < 0x80) {
+      // We have ASCII input, we can use our lookup table.
+      unsigned char replacement = kHostCharLookup[source];
+      if (!replacement) {
+        // Invalid character, add it as percent-escaped and mark as failed.
+        AppendEscapedChar(source, output);
+        success = false;
+      } else if (replacement == kEsc) {
+        // This character is valid but should be escaped.
+        AppendEscapedChar(source, output);
+      } else {
+        // Common case, the given character is valid in a hostname, the lookup
+        // table tells us the canonical representation of that character (lower
+        // cased).
+        output->push_back(replacement);
+      }
+    } else {
+      // It's a non-ascii char. Just push it to the output.
+      // In case where we have char16 input, and char output it's safe to
+      // cast char16->char only if input string was converted to ASCII.
+      output->push_back(static_cast<OUTCHAR>(source));
+      *has_non_ascii = true;
+    }
+  }
+
+  return success;
+}
+
+// Canonicalizes a host that requires IDN conversion. Returns true on success.
+bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
+  // We need to escape URL before doing IDN conversion, since Punycode strings
+  // cannot be escaped after they are created. The return value is
+  // deliberately not checked here; the ASCII result is validated below.
+  RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
+  bool has_non_ascii;
+  DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
+
+  StackBufferW wide_output;
+  if (!IDNToASCII(url_escaped_host.data(),
+                  url_escaped_host.length(),
+                  &wide_output)) {
+    // Some error, give up. This will write some reasonable looking
+    // representation of the string to the output.
+    AppendInvalidNarrowString(src, 0, src_len, output);
+    return false;
+  }
+
+  // Now we check the ASCII output like a normal host. It will also handle
+  // unescaping. Although we unescaped everything before this function call, if
+  // somebody does %00 as fullwidth, ICU will convert this to ASCII.
+  bool success = DoSimpleHost(wide_output.data(),
+                              wide_output.length(),
+                              output, &has_non_ascii);
+  DCHECK(!has_non_ascii);
+  return success;
+}
+
+// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
+// UTF-16. The has_escaped flag should be set if the input string requires
+// unescaping.
+bool DoComplexHost(const char* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  // Save the current position in the output. We may write stuff and rewind it
+  // below, so we need to know where to rewind to.
+  int begin_length = output->length();
+
+  // Points to the UTF-8 data we want to convert. This will either be the
+  // input or the unescaped version written to |*output| if necessary.
+  const char* utf8_source;
+  int utf8_source_len;
+  if (has_escaped) {
+    // Unescape before converting to UTF-16 for IDN. We write this into the
+    // output because it most likely does not require IDNization, and we can
+    // save another huge stack buffer. It will be replaced below if it requires
+    // IDN. This will also update our non-ASCII flag so we know whether the
+    // unescaped input requires IDN.
+    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
+      // Error with some escape sequence. We'll call the current output
+      // complete. DoSimpleHost will have written some "reasonable" output.
+      return false;
+    }
+
+    // Unescaping may have left us with ASCII input, in which case the
+    // unescaped version we wrote to output is complete.
+    if (!has_non_ascii) {
+      return true;
+    }
+
+    // Save the pointer into the data that was just converted (it may be
+    // appended to other data in the output buffer).
+    utf8_source = &output->data()[begin_length];
+    utf8_source_len = output->length() - begin_length;
+  } else {
+    // We don't need to unescape, use input for IDNization later. (We know the
+    // input has non-ASCII, or the simple version would have been called
+    // instead of us.)
+    utf8_source = host;
+    utf8_source_len = host_len;
+  }
+
+  // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
+  // Above, we may have used the output to write the unescaped values to, so
+  // we have to rewind it to where we started after we convert it to UTF-16.
+  StackBufferW utf16;
+  if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
+    // In this error case, the input may or may not be the output. Copy it
+    // aside first so rewinding |output| cannot clobber what we are reading.
+    StackBuffer utf8;
+    for (int i = 0; i < utf8_source_len; i++)
+      utf8.push_back(utf8_source[i]);
+    output->set_length(begin_length);
+    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
+    return false;
+  }
+  output->set_length(begin_length);
+
+  // This will call DoSimpleHost which will do normal ASCII canonicalization
+  // and also check for IP addresses in the output.
+  return DoIDNHost(utf16.data(), utf16.length(), output);
+}
+
+// UTF-16 convert host to its ASCII version. The set up is already ready for
+// the backend, so we just pass through. The has_escaped flag should be set if
+// the input string requires unescaping.
+bool DoComplexHost(const char16* host, int host_len,
+                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
+  if (has_escaped) {
+    // Yikes, we have escaped characters with wide input. The escaped
+    // characters should be interpreted as UTF-8. To solve this problem,
+    // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
+    //
+    // We don't bother to optimize the conversion in the ASCII case (which
+    // *could* just be a copy) and use the UTF-8 path, because it should be
+    // very rare that host names have escaped characters, and it is relatively
+    // fast to do the conversion anyway.
+    StackBuffer utf8;
+    if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
+      AppendInvalidNarrowString(host, 0, host_len, output);
+      return false;
+    }
+
+    // Once we convert to UTF-8, we can use the 8-bit version of the complex
+    // host handling code above.
+    return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
+                         has_escaped, output);
+  }
+
+  // No unescaping necessary, we can safely pass the input to ICU. This
+  // function will only get called if we either have escaped or non-ascii
+  // input, so it's safe to just use ICU now. Even if the input is ASCII,
+  // this function will do the right thing (just slower than we could).
+  return DoIDNHost(host, host_len, output);
+}
+
+// Canonicalizes the |host| component of |spec| into |output| and fills in
+// |host_info|: |family| is NEUTRAL for an empty host, BROKEN when
+// canonicalization fails, and otherwise whatever CanonicalizeIPAddress
+// reports. |out_host| is set to the range written to |output|.
+template<typename CHAR, typename UCHAR>
+void DoHost(const CHAR* spec,
+            const url_parse::Component& host,
+            CanonOutput* output,
+            CanonHostInfo* host_info) {
+  if (host.len <= 0) {
+    // Empty hosts don't need anything.
+    host_info->family = CanonHostInfo::NEUTRAL;
+    host_info->out_host = url_parse::Component();
+    return;
+  }
+
+  bool has_non_ascii, has_escaped;
+  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
+
+  // Keep track of output's initial length, so we can rewind later.
+  const int output_begin = output->length();
+
+  bool success;
+  if (!has_non_ascii && !has_escaped) {
+    success = DoSimpleHost(&spec[host.begin], host.len,
+                           output, &has_non_ascii);
+    DCHECK(!has_non_ascii);
+  } else {
+    success = DoComplexHost(&spec[host.begin], host.len,
+                            has_non_ascii, has_escaped, output);
+  }
+
+  if (!success) {
+    // Canonicalization failed. Set BROKEN to notify the caller.
+    host_info->family = CanonHostInfo::BROKEN;
+  } else {
+    // After all the other canonicalization, check if we ended up with an IP
+    // address. IP addresses are small, so writing into this temporary buffer
+    // should not cause an allocation.
+    RawCanonOutput<64> canon_ip;
+    CanonicalizeIPAddress(output->data(),
+                          url_parse::MakeRange(output_begin, output->length()),
+                          &canon_ip, host_info);
+
+    // If we got an IPv4/IPv6 address, copy the canonical form back to the
+    // real buffer. Otherwise, it's a hostname or broken IP, in which case
+    // we just leave it in place.
+    if (host_info->IsIPAddress()) {
+      output->set_length(output_begin);
+      output->Append(canon_ip.data(), canon_ip.length());
+    }
+  }
+
+  host_info->out_host = url_parse::MakeRange(output_begin, output->length());
+}
+
+}  // namespace
+
+// 8-bit entry point. Returns false only when the host was marked BROKEN.
+bool CanonicalizeHost(const char* spec,
+                      const url_parse::Component& host,
+                      CanonOutput* output,
+                      url_parse::Component* out_host) {
+  CanonHostInfo host_info;
+  DoHost<char, unsigned char>(spec, host, output, &host_info);
+  *out_host = host_info.out_host;
+  return (host_info.family != CanonHostInfo::BROKEN);
+}
+
+// UTF-16 entry point. Returns false only when the host was marked BROKEN.
+bool CanonicalizeHost(const char16* spec,
+                      const url_parse::Component& host,
+                      CanonOutput* output,
+                      url_parse::Component* out_host) {
+  CanonHostInfo host_info;
+  DoHost<char16, char16>(spec, host, output, &host_info);
+  *out_host = host_info.out_host;
+  return (host_info.family != CanonHostInfo::BROKEN);
+}
+
+// Same as CanonicalizeHost, but hands the full CanonHostInfo to the caller.
+void CanonicalizeHostVerbose(const char* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo *host_info) {
+  DoHost<char, unsigned char>(spec, host, output, host_info);
+}
+
+void CanonicalizeHostVerbose(const char16* spec,
+                             const url_parse::Component& host,
+                             CanonOutput* output,
+                             CanonHostInfo *host_info) {
+  DoHost<char16, char16>(spec, host, output, host_info);
+}
+
+}  // namespace url_canon
diff --git a/googleurl/src/url_canon_icu.cc b/googleurl/src/url_canon_icu.cc
new file mode 100644
index 0000000..b06808c
--- /dev/null
+++ b/googleurl/src/url_canon_icu.cc
@@ -0,0 +1,207 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ICU integration functions.
+
+#include <stdlib.h>
+#include <string.h>
+#include <unicode/ucnv.h>
+#include <unicode/ucnv_cb.h>
+#include <unicode/uidna.h>
+
+#include "googleurl/src/url_canon_icu.h"
+#include "googleurl/src/url_canon_internal.h"  // for _itoa_s
+
+#include "base/logging.h"
+
+namespace url_canon {
+
+namespace {
+
+// Called when converting a character that can not be represented, this will
+// append an escaped version of the numerical character reference for that code
+// point. It is of the form "&#1234;" and we will escape the non-digits to
+// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
+//
+// Only the UCNV_UNASSIGNED reason is handled; for any other reason |*err| is
+// left untouched and nothing is written.
+void appendURLEscapedChar(const void* context,
+                          UConverterFromUnicodeArgs* from_args,
+                          const UChar* code_units,
+                          int32_t length,
+                          UChar32 code_point,
+                          UConverterCallbackReason reason,
+                          UErrorCode* err) {
+  if (reason == UCNV_UNASSIGNED) {
+    // Clear the error so the conversion continues after our substitution.
+    *err = U_ZERO_ERROR;
+
+    const static int prefix_len = 6;
+    const static char prefix[prefix_len + 1] = "%26%23";  // "&#" percent-escaped
+    ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err);
+
+    DCHECK(code_point < 0x110000);
+    char number[8];  // Max Unicode code point is 7 digits.
+    _itoa_s(code_point, number, 10);
+    int number_len = static_cast<int>(strlen(number));
+    ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err);
+
+    const static int postfix_len = 3;
+    const static char postfix[postfix_len + 1] = "%3B";  // ";" percent-escaped
+    ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err);
+  }
+}
+
+// A class for scoping the installation of the invalid character callback.
+// The previous callback and context are saved on construction and restored
+// on destruction.
+class AppendHandlerInstaller {
+ public:
+  // The owner of this object must ensure that the converter is alive for the
+  // duration of this object's lifetime.
+  explicit AppendHandlerInstaller(UConverter* converter)
+      : converter_(converter) {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0,
+                          &old_callback_, &old_context_, &err);
+  }
+
+  ~AppendHandlerInstaller() {
+    UErrorCode err = U_ZERO_ERROR;
+    ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err);
+  }
+
+ private:
+  UConverter* converter_;
+
+  UConverterFromUCallback old_callback_;
+  const void* old_context_;
+};
+
+}  // namespace
+
+ICUCharsetConverter::ICUCharsetConverter(UConverter* converter)
+    : converter_(converter) {
+}
+
+// Converts |input| with the ICU converter and appends the result to |output|,
+// growing |output| and retrying when ICU reports a buffer overflow.
+void ICUCharsetConverter::ConvertFromUTF16(const char16* input,
+                                           int input_len,
+                                           CanonOutput* output) {
+  // Install our error handler. It will be called for character that can not
+  // be represented in the destination character set.
+  AppendHandlerInstaller handler(converter_);
+
+  int begin_offset = output->length();
+  int dest_capacity = output->capacity() - begin_offset;
+
+  do {
+    UErrorCode err = U_ZERO_ERROR;
+    // Re-read the data pointer on every pass; Resize() below may move it.
+    char* dest = &output->data()[begin_offset];
+    int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity,
+                                            input, input_len, &err);
+    if (err != U_BUFFER_OVERFLOW_ERROR) {
+      output->set_length(begin_offset + required_capacity);
+      return;
+    }
+
+    // Output didn't fit, expand
+    dest_capacity = required_capacity;
+    output->Resize(begin_offset + dest_capacity);
+  } while (true);
+}
+
+// Converts the Unicode input representing a hostname to ASCII using IDN rules.
+// The output must be ASCII, but is represented as wide characters.
+//
+// On success, the output will be filled with the ASCII host name and it will
+// return true. Unlike most other canonicalization functions, this assumes that
+// the output is empty. The beginning of the host will be at offset 0, and
+// the length of the output will be set to the length of the new host name.
+//
+// On error, this will return false. The output in this case is undefined.
+bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output) {
+  DCHECK(output->length() == 0);  // Output buffer is assumed empty.
+  while (true) {
+    // Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
+    // the spec (which do exist). This does not present any risk and is a
+    // little more future proof.
+    UErrorCode err = U_ZERO_ERROR;
+    int num_converted = uidna_IDNToASCII(src, src_len, output->data(),
+                                         output->capacity(),
+                                         UIDNA_ALLOW_UNASSIGNED, NULL, &err);
+    // Accept ICU warnings as success. In particular, a result that exactly
+    // fills the buffer is reported as U_STRING_NOT_TERMINATED_WARNING; the
+    // output is length-delimited, so the missing terminator is harmless.
+    if (U_SUCCESS(err)) {
+      output->set_length(num_converted);
+      return true;
+    }
+    if (err != U_BUFFER_OVERFLOW_ERROR)
+      return false;  // Unknown error, give up.
+
+    // Not enough room in our buffer, expand.
+    output->Resize(output->capacity() * 2);
+  }
+}
+
+// Reads one UTF-8 code point starting at |*begin|. On return |*begin| indexes
+// the last byte consumed. Returns false and stores
+// kUnicodeReplacementCharacter when the decoded value is not a valid Unicode
+// character.
+bool ReadUTFChar(const char* str, int* begin, int length,
+                 unsigned* code_point_out) {
+  int code_point;  // Avoids warning when U8_NEXT writes -1 to it.
+  U8_NEXT(str, *begin, length, code_point);
+  *code_point_out = static_cast<unsigned>(code_point);
+
+  // The ICU macro above moves to the next char, we want to point to the last
+  // char consumed.
+  (*begin)--;
+
+  // Validate the decoded value.
+  if (U_IS_UNICODE_CHAR(code_point))
+    return true;
+  *code_point_out = kUnicodeReplacementCharacter;
+  return false;
+}
+
+// Reads one UTF-16 code point starting at |*begin|. On return |*begin| indexes
+// the last code unit consumed. Returns false and stores
+// kUnicodeReplacementCharacter for unpaired surrogates or invalid code points.
+bool ReadUTFChar(const char16* str, int* begin, int length,
+                 unsigned* code_point) {
+  if (U16_IS_SURROGATE(str[*begin])) {
+    if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length ||
+        !U16_IS_TRAIL(str[*begin + 1])) {
+      // Invalid surrogate pair.
+      *code_point = kUnicodeReplacementCharacter;
+      return false;
+    } else {
+      // Valid surrogate pair.
+      *code_point = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
+      (*begin)++;
+    }
+  } else {
+    // Not a surrogate, just one 16-bit word.
+    *code_point = str[*begin];
+  }
+
+  if (U_IS_UNICODE_CHAR(*code_point))
+    return true;
+
+  // Invalid code point.
+  *code_point = kUnicodeReplacementCharacter;
+  return false;
+}
+
+}  // namespace url_canon
diff --git a/googleurl/src/url_canon_icu.h b/googleurl/src/url_canon_icu.h
new file mode 100644
index 0000000..3980663
--- /dev/null
+++ b/googleurl/src/url_canon_icu.h
@@ -0,0 +1,63 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ICU integration functions.
+
+#ifndef GOOGLEURL_SRC_URL_CANON_ICU_H__
+#define GOOGLEURL_SRC_URL_CANON_ICU_H__
+
+#include "googleurl/src/url_canon.h"
+
+typedef struct UConverter UConverter;
+
+namespace url_canon {
+
+// A CharsetConverter implementation that bridges the canonicalizer to ICU's
+// conversion routines.
+class ICUCharsetConverter : public CharsetConverter {
+ public:
+  // Wraps an already-existing ICU character set converter. The converter is
+  // NOT owned by this object; the creator must keep it alive for as long as
+  // this object is.
+  ICUCharsetConverter(UConverter* converter);
+
+  virtual ~ICUCharsetConverter() {}
+
+  virtual void ConvertFromUTF16(const char16* input,
+                                int input_len,
+                                CanonOutput* output);
+
+ private:
+  // The ICU converter, not owned by this class.
+  UConverter* converter_;
+};
+
+}  // namespace url_canon
+
+#endif  // GOOGLEURL_SRC_URL_CANON_ICU_H__
diff --git a/googleurl/src/url_canon_internal.cc b/googleurl/src/url_canon_internal.cc
new file mode 100644
index 0000000..6b776bc
--- /dev/null
+++ b/googleurl/src/url_canon_internal.cc
@@ -0,0 +1,427 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <cstdio>
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+
+#include "googleurl/src/url_canon_internal.h"
+
+namespace url_canon {
+
+namespace {
+
+// Appends |length| code units of |source| to |output|. 7-bit characters that
+// are of the given |type| are copied through and the rest are
+// percent-escaped; anything >= 0x80 is decoded as a UTF character and written
+// as escaped UTF-8.
+template<typename CHAR, typename UCHAR>
+void DoAppendStringOfType(const CHAR* source, int length,
+                          SharedCharTypes type,
+                          CanonOutput* output) {
+  int pos = 0;
+  while (pos < length) {
+    const UCHAR unit = static_cast<UCHAR>(source[pos]);
+    if (unit < 0x80) {
+      // Plain 7-bit character: copy it, escaping when the type table says so.
+      const unsigned char ascii = static_cast<unsigned char>(unit);
+      if (IsCharOfType(ascii, type))
+        output->push_back(ascii);
+      else
+        AppendEscapedChar(ascii, output);
+    } else {
+      // ReadUTFChar fills the code point with kUnicodeReplacementCharacter
+      // when the input is invalid, which is what we want. It also leaves
+      // |pos| on the last unit it consumed.
+      unsigned decoded;
+      ReadUTFChar(source, &pos, length, &decoded);
+      AppendUTF8EscapedValue(decoded, output);
+    }
+    ++pos;
+  }
+}
+
+// This function assumes the input values are all contained in 8-bit,
+// although it allows any type. It has no return value; invalid input is
+// written to the output in escaped form.
+template<typename CHAR, typename UCHAR>
+void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
+                                 CanonOutput* output) {
+  for (int pos = begin; pos < end; ++pos) {
+    const UCHAR unit = static_cast<UCHAR>(spec[pos]);
+    if (unit >= 0x80) {
+      // Handle UTF-8/16 encodings. This call will correctly handle the error
+      // case by appending the invalid character. It may advance |pos| past
+      // the extra units it consumes.
+      AppendUTF8EscapedChar(spec, &pos, end, output);
+      continue;
+    }
+
+    // This function is for error handling, so we escape all control
+    // characters and spaces, but not anything else since we lack
+    // context to do something more specific.
+    const bool needs_escape = (unit <= ' ') || (unit == 0x7f);
+    if (needs_escape)
+      AppendEscapedChar(static_cast<unsigned char>(unit), output);
+    else
+      output->push_back(static_cast<char>(unit));
+  }
+}
+
+// Overrides one component, see the url_canon::Replacements structure for
+// what the various combinations of source pointer and component mean.
+void DoOverrideComponent(const char* override_source,
+                         const url_parse::Component& override_component,
+                         const char** dest,
+                         url_parse::Component* dest_component) {
+  // A null source means "no override": leave the destination untouched.
+  if (!override_source)
+    return;
+
+  *dest = override_source;
+  *dest_component = override_component;
+}
+
+// Similar to DoOverrideComponent except that it takes a UTF-16 input and does
+// not actually set the output character pointer.
+//
+// The input is converted to UTF-8 at the end of the given buffer as a temporary
+// holding place. The component identifying the portion of the buffer used in
+// the |utf8_buffer| will be specified in |*dest_component|.
+//
+// This will not actually set any |dest| pointer like DoOverrideComponent
+// does because all of the pointers will point into the |utf8_buffer|, which
+// may get resized while we're overriding a subsequent component. Instead, the
+// caller should use the beginning of the |utf8_buffer| as the string pointer
+// for all components once all overrides have been prepared.
+bool PrepareUTF16OverrideComponent( + const char16* override_source, + const url_parse::Component& override_component, + CanonOutput* utf8_buffer, + url_parse::Component* dest_component) { + bool success = true; + if (override_source) { + if (!override_component.is_valid()) { + // Non-"valid" component (means delete), so we need to preserve that. + *dest_component = url_parse::Component(); + } else { + // Convert to UTF-8. + dest_component->begin = utf8_buffer->length(); + success = ConvertUTF16ToUTF8(&override_source[override_component.begin], + override_component.len, utf8_buffer); + dest_component->len = utf8_buffer->length() - dest_component->begin; + } + } + return success; +} + +} // namespace + +// See the header file for this array's declaration. +const unsigned char kSharedCharTypeTable[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00 - 0x0f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1f + 0, // 0x20 ' ' (escape spaces in queries) + CHAR_QUERY | CHAR_USERINFO, // 0x21 ! + 0, // 0x22 " + 0, // 0x23 # (invalid in query since it marks the ref) + CHAR_QUERY | CHAR_USERINFO, // 0x24 $ + CHAR_QUERY | CHAR_USERINFO, // 0x25 % + CHAR_QUERY | CHAR_USERINFO, // 0x26 & + CHAR_QUERY | CHAR_USERINFO, // 0x27 ' + CHAR_QUERY | CHAR_USERINFO, // 0x28 ( + CHAR_QUERY | CHAR_USERINFO, // 0x29 ) + CHAR_QUERY | CHAR_USERINFO, // 0x2a * + CHAR_QUERY | CHAR_USERINFO, // 0x2b + + CHAR_QUERY | CHAR_USERINFO, // 0x2c , + CHAR_QUERY | CHAR_USERINFO, // 0x2d - + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x2e . 
+ CHAR_QUERY, // 0x2f / + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x30 0 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x31 1 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x32 2 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x33 3 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x34 4 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x35 5 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x36 6 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT, // 0x37 7 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC, // 0x38 8 + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC, // 0x39 9 + CHAR_QUERY, // 0x3a : + CHAR_QUERY, // 0x3b ; + 0, // 0x3c < (Try to prevent certain types of XSS.) + CHAR_QUERY, // 0x3d = + 0, // 0x3e > (Try to prevent certain types of XSS.) + CHAR_QUERY, // 0x3f ? 
+ CHAR_QUERY, // 0x40 @ + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x41 A + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x42 B + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x43 C + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x44 D + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x45 E + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x46 F + CHAR_QUERY | CHAR_USERINFO, // 0x47 G + CHAR_QUERY | CHAR_USERINFO, // 0x48 H + CHAR_QUERY | CHAR_USERINFO, // 0x49 I + CHAR_QUERY | CHAR_USERINFO, // 0x4a J + CHAR_QUERY | CHAR_USERINFO, // 0x4b K + CHAR_QUERY | CHAR_USERINFO, // 0x4c L + CHAR_QUERY | CHAR_USERINFO, // 0x4d M + CHAR_QUERY | CHAR_USERINFO, // 0x4e N + CHAR_QUERY | CHAR_USERINFO, // 0x4f O + CHAR_QUERY | CHAR_USERINFO, // 0x50 P + CHAR_QUERY | CHAR_USERINFO, // 0x51 Q + CHAR_QUERY | CHAR_USERINFO, // 0x52 R + CHAR_QUERY | CHAR_USERINFO, // 0x53 S + CHAR_QUERY | CHAR_USERINFO, // 0x54 T + CHAR_QUERY | CHAR_USERINFO, // 0x55 U + CHAR_QUERY | CHAR_USERINFO, // 0x56 V + CHAR_QUERY | CHAR_USERINFO, // 0x57 W + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x58 X + CHAR_QUERY | CHAR_USERINFO, // 0x59 Y + CHAR_QUERY | CHAR_USERINFO, // 0x5a Z + CHAR_QUERY, // 0x5b [ + CHAR_QUERY, // 0x5c '\' + CHAR_QUERY, // 0x5d ] + CHAR_QUERY, // 0x5e ^ + CHAR_QUERY | CHAR_USERINFO, // 0x5f _ + CHAR_QUERY, // 0x60 ` + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x61 a + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x62 b + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x63 c + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x64 d + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x65 e + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX, // 0x66 f + CHAR_QUERY | CHAR_USERINFO, // 0x67 g + CHAR_QUERY | CHAR_USERINFO, // 0x68 h + CHAR_QUERY | CHAR_USERINFO, // 0x69 i + CHAR_QUERY | CHAR_USERINFO, // 0x6a j + CHAR_QUERY | CHAR_USERINFO, // 0x6b k + CHAR_QUERY | CHAR_USERINFO, // 0x6c l + 
CHAR_QUERY | CHAR_USERINFO, // 0x6d m + CHAR_QUERY | CHAR_USERINFO, // 0x6e n + CHAR_QUERY | CHAR_USERINFO, // 0x6f o + CHAR_QUERY | CHAR_USERINFO, // 0x70 p + CHAR_QUERY | CHAR_USERINFO, // 0x71 q + CHAR_QUERY | CHAR_USERINFO, // 0x72 r + CHAR_QUERY | CHAR_USERINFO, // 0x73 s + CHAR_QUERY | CHAR_USERINFO, // 0x74 t + CHAR_QUERY | CHAR_USERINFO, // 0x75 u + CHAR_QUERY | CHAR_USERINFO, // 0x76 v + CHAR_QUERY | CHAR_USERINFO, // 0x77 w + CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x78 x + CHAR_QUERY | CHAR_USERINFO, // 0x79 y + CHAR_QUERY | CHAR_USERINFO, // 0x7a z + CHAR_QUERY, // 0x7b { + CHAR_QUERY, // 0x7c | + CHAR_QUERY, // 0x7d } + CHAR_QUERY | CHAR_USERINFO, // 0x7e ~ + 0, // 0x7f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9f + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xa0 - 0xaf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xb0 - 0xbf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xc0 - 0xcf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xd0 - 0xdf + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xe0 - 0xef + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff +}; + +const char kHexCharLookup[0x10] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', +}; + +const char kCharToHexLookup[8] = { + 0, // 0x00 - 0x1f + '0', // 0x20 - 0x3f: digits 0 - 9 are 0x30 - 0x39 + 'A' - 10, // 0x40 - 0x5f: letters A - F are 0x41 - 0x46 + 'a' - 10, // 0x60 - 0x7f: letters a - f are 0x61 - 0x66 + 0, // 0x80 - 0x9F + 0, // 0xA0 - 0xBF + 0, // 0xC0 - 0xDF + 0, // 0xE0 - 0xFF +}; + +const char16 kUnicodeReplacementCharacter = 0xfffd; + +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output) { + DoAppendStringOfType<char, unsigned char>(source, length, type, output); +} + +void AppendStringOfType(const char16* source, int length, + SharedCharTypes type, + CanonOutput* 
output) { + DoAppendStringOfType<char16, char16>(source, length, type, output); +} + +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output) { + DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output); +} + +void AppendInvalidNarrowString(const char16* spec, int begin, int end, + CanonOutput* output) { + DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output); +} + +bool ConvertUTF16ToUTF8(const char16* input, int input_len, + CanonOutput* output) { + bool success = true; + for (int i = 0; i < input_len; i++) { + unsigned code_point; + success &= ReadUTFChar(input, &i, input_len, &code_point); + AppendUTF8Value(code_point, output); + } + return success; +} + +bool ConvertUTF8ToUTF16(const char* input, int input_len, + CanonOutputT<char16>* output) { + bool success = true; + for (int i = 0; i < input_len; i++) { + unsigned code_point; + success &= ReadUTFChar(input, &i, input_len, &code_point); + AppendUTF16Value(code_point, output); + } + return success; +} + +void SetupOverrideComponents(const char* base, + const Replacements<char>& repl, + URLComponentSource<char>* source, + url_parse::Parsed* parsed) { + // Get the source and parsed structures of the things we are replacing. + const URLComponentSource<char>& repl_source = repl.sources(); + const url_parse::Parsed& repl_parsed = repl.components(); + + DoOverrideComponent(repl_source.scheme, repl_parsed.scheme, + &source->scheme, &parsed->scheme); + DoOverrideComponent(repl_source.username, repl_parsed.username, + &source->username, &parsed->username); + DoOverrideComponent(repl_source.password, repl_parsed.password, + &source->password, &parsed->password); + + // Our host should be empty if not present, so override the default setup. 
+ DoOverrideComponent(repl_source.host, repl_parsed.host, + &source->host, &parsed->host); + if (parsed->host.len == -1) + parsed->host.len = 0; + + DoOverrideComponent(repl_source.port, repl_parsed.port, + &source->port, &parsed->port); + DoOverrideComponent(repl_source.path, repl_parsed.path, + &source->path, &parsed->path); + DoOverrideComponent(repl_source.query, repl_parsed.query, + &source->query, &parsed->query); + DoOverrideComponent(repl_source.ref, repl_parsed.ref, + &source->ref, &parsed->ref); +} + +bool SetupUTF16OverrideComponents(const char* base, + const Replacements<char16>& repl, + CanonOutput* utf8_buffer, + URLComponentSource<char>* source, + url_parse::Parsed* parsed) { + bool success = true; + + // Get the source and parsed structures of the things we are replacing. + const URLComponentSource<char16>& repl_source = repl.sources(); + const url_parse::Parsed& repl_parsed = repl.components(); + + success &= PrepareUTF16OverrideComponent( + repl_source.scheme, repl_parsed.scheme, + utf8_buffer, &parsed->scheme); + success &= PrepareUTF16OverrideComponent( + repl_source.username, repl_parsed.username, + utf8_buffer, &parsed->username); + success &= PrepareUTF16OverrideComponent( + repl_source.password, repl_parsed.password, + utf8_buffer, &parsed->password); + success &= PrepareUTF16OverrideComponent( + repl_source.host, repl_parsed.host, + utf8_buffer, &parsed->host); + success &= PrepareUTF16OverrideComponent( + repl_source.port, repl_parsed.port, + utf8_buffer, &parsed->port); + success &= PrepareUTF16OverrideComponent( + repl_source.path, repl_parsed.path, + utf8_buffer, &parsed->path); + success &= PrepareUTF16OverrideComponent( + repl_source.query, repl_parsed.query, + utf8_buffer, &parsed->query); + success &= PrepareUTF16OverrideComponent( + repl_source.ref, repl_parsed.ref, + utf8_buffer, &parsed->ref); + + // PrepareUTF16OverrideComponent will not have set the data pointer since the + // buffer could be resized, invalidating the pointers. 
We set the data + // pointers for affected components now that the buffer is finalized. + if (repl_source.scheme) source->scheme = utf8_buffer->data(); + if (repl_source.username) source->username = utf8_buffer->data(); + if (repl_source.password) source->password = utf8_buffer->data(); + if (repl_source.host) source->host = utf8_buffer->data(); + if (repl_source.port) source->port = utf8_buffer->data(); + if (repl_source.path) source->path = utf8_buffer->data(); + if (repl_source.query) source->query = utf8_buffer->data(); + if (repl_source.ref) source->ref = utf8_buffer->data(); + + return success; +} + +#ifndef WIN32 + +int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) { + const char* format_str; + if (radix == 10) + format_str = "%d"; + else if (radix == 16) + format_str = "%x"; + else + return EINVAL; + + int written = snprintf(buffer, size_in_chars, format_str, value); + if (static_cast<size_t>(written) >= size_in_chars) { + // Output was truncated, or written was negative. + return EINVAL; + } + return 0; +} + +int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) { + if (radix != 10) + return EINVAL; + + // No more than 12 characters will be required for a 32-bit integer. + // Add an extra byte for the terminating null. + char temp[13]; + int written = snprintf(temp, sizeof(temp), "%d", value); + if (static_cast<size_t>(written) >= size_in_chars) { + // Output was truncated, or written was negative. + return EINVAL; + } + + for (int i = 0; i < written; ++i) { + buffer[i] = static_cast<char16>(temp[i]); + } + buffer[written] = '\0'; + return 0; +} + +#endif // !WIN32 + +} // namespace url_canon diff --git a/googleurl/src/url_canon_internal.h b/googleurl/src/url_canon_internal.h new file mode 100644 index 0000000..4b1e45a --- /dev/null +++ b/googleurl/src/url_canon_internal.h @@ -0,0 +1,460 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is intended to be included in another C++ file where the character +// types are defined. This allows us to write mostly generic code, but not have +// template bloat because everything is inlined when anybody calls any of our +// functions.
#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
#define GOOGLEURL_SRC_URL_CANON_INTERNAL_H__

#include <stdlib.h>

#include "googleurl/src/url_canon.h"

namespace url_canon {

// Character type handling -----------------------------------------------------

// Bits that identify different character types. These types identify different
// bits that are set for each 8-bit character in the kSharedCharTypeTable.
//
// Every enumerator is a distinct power of two, so one table entry can carry
// any combination of these flags.
enum SharedCharTypes {
  // Characters that do not require escaping in queries. Characters that do
  // not have this flag will be escaped, see url_canon_query.cc
  CHAR_QUERY = 1,

  // Valid in the username/password field.
  CHAR_USERINFO = 2,

  // Valid in an IPv4 address (digits plus dot and 'x' for hex).
  CHAR_IPV4 = 4,

  // Valid in an ASCII-representation of a hex digit (as in %-escaped).
  CHAR_HEX = 8,

  // Valid in an ASCII-representation of a decimal digit.
  CHAR_DEC = 16,

  // Valid in an ASCII-representation of an octal digit.
  CHAR_OCT = 32,
};

// This table contains the flags in SharedCharTypes for each 8-bit character.
// Some canonicalization functions have their own specialized lookup table.
// For those with simple requirements, we have collected the flags in one
// place so there are fewer lookup tables to load into the CPU cache.
//
// Using an unsigned char type has a small but measurable performance benefit
// over using a 32-bit number.
extern const unsigned char kSharedCharTypeTable[0x100];

// More readable wrappers around the character type lookup table.
+inline bool IsCharOfType(unsigned char c, SharedCharTypes type) { + return !!(kSharedCharTypeTable[c] & type); +} +inline bool IsQueryChar(unsigned char c) { + return IsCharOfType(c, CHAR_QUERY); +} +inline bool IsIPv4Char(unsigned char c) { + return IsCharOfType(c, CHAR_IPV4); +} +inline bool IsHexChar(unsigned char c) { + return IsCharOfType(c, CHAR_HEX); +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. +void AppendStringOfType(const char* source, int length, + SharedCharTypes type, + CanonOutput* output); +void AppendStringOfType(const char16* source, int length, + SharedCharTypes type, + CanonOutput* output); + +// Maps the hex numerical values 0x0 to 0xf to the corresponding ASCII digit +// that will be used to represent it. +extern const char kHexCharLookup[0x10]; + +// This lookup table allows fast conversion between ASCII hex letters and their +// corresponding numerical value. The 8-bit range is divided up into 8 +// regions of 0x20 characters each. Each of the three character types (numbers, +// uppercase, lowercase) falls into different regions of this range. The table +// contains the amount to subtract from characters in that range to get at +// the corresponding numerical value. +// +// See HexDigitToValue for the lookup. +extern const char kCharToHexLookup[8]; + +// Assumes the input is a valid hex digit! Call IsHexChar before using this. +inline unsigned char HexCharToValue(unsigned char c) { + return c - kCharToHexLookup[c / 0x20]; +} + +// Indicates if the given character is a dot or dot equivalent, returning the +// number of characters taken by it. This will be one for a literal dot, 3 for +// an escaped dot. If the character is not a dot, this will return 0. 
+template<typename CHAR> +inline int IsDot(const CHAR* spec, int offset, int end) { + if (spec[offset] == '.') { + return 1; + } else if (spec[offset] == '%' && offset + 3 <= end && + spec[offset + 1] == '2' && + (spec[offset + 2] == 'e' || spec[offset + 2] == 'E')) { + // Found "%2e" + return 3; + } + return 0; +} + +// Returns the canonicalized version of the input character according to scheme +// rules. This is implemented alongside the scheme canonicalizer, and is +// required for relative URL resolving to test for scheme equality. +// +// Returns 0 if the input character is not a valid scheme character. +char CanonicalSchemeChar(char16 ch); + +// Write a single character, escaped, to the output. This always escapes: it +// does no checking that thee character requires escaping. +// Escaping makes sense only 8 bit chars, so code works in all cases of +// input parameters (8/16bit). +template<typename UINCHAR, typename OUTCHAR> +inline void AppendEscapedChar(UINCHAR ch, + CanonOutputT<OUTCHAR>* output) { + output->push_back('%'); + output->push_back(kHexCharLookup[ch >> 4]); + output->push_back(kHexCharLookup[ch & 0xf]); +} + +// The character we'll substitute for undecodable or invalid characters. +extern const char16 kUnicodeReplacementCharacter; + +// UTF-8 functions ------------------------------------------------------------ + +// Reads one character in UTF-8 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. +// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-byte ASCII character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. 
+bool ReadUTFChar(const char* str, int* begin, int length, + unsigned* code_point_out); + +// Generic To-UTF-8 converter. This will call the given append method for each +// character that should be appended, with the given output method. Wrappers +// are provided below for escaped and non-escaped versions of this. +template<class Output, void Appender(unsigned char, Output*)> +inline void DoAppendUTF8(unsigned char_value, Output* output) { + if (char_value <= 0x7f) { + Appender(static_cast<unsigned char>(char_value), output); + } else if (char_value <= 0x7ff) { + // 110xxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xC0 | (char_value >> 6)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0xffff) { + // 1110xxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xe0 | (char_value >> 12)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x1fffff) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else if (char_value <= 0x10FFFF) { // Max unicode code point. 
+ // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + Appender(static_cast<unsigned char>(0xf8 | (char_value >> 24)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 18) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 12) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | ((char_value >> 6) & 0x3f)), + output); + Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), + output); + } else { + // Invalid UTF-8 character (>20 bits) + } +} + +// Helper used by AppendUTF8Value below. We use an unsigned parameter so there +// are no funny sign problems with the input, but then have to convert it to +// a regular char for appending. +inline void AppendCharToOutput(unsigned char ch, CanonOutput* output) { + output->push_back(static_cast<char>(ch)); +} + +// Writes the given character to the output as UTF-8. This does NO checking +// of the validity of the unicode characters; the caller should ensure that +// the value it is appending is valid to append. +inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); +} + +// Writes the given character to the output as UTF-8, escaping ALL +// characters (even when they are ASCII). This does NO checking of the +// validity of the unicode characters; the caller should ensure that the value +// it is appending is valid to append. +inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { + DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); +} + +// UTF-16 functions ----------------------------------------------------------- + +// Reads one character in UTF-16 starting at |*begin| in |str| and places +// the decoded value into |*code_point|. If the character is valid, we will +// return true. If invalid, we'll return false and put the +// kUnicodeReplacementCharacter into |*code_point|. 
+// +// |*begin| will be updated to point to the last character consumed so it +// can be incremented in a loop and will be ready for the next character. +// (for a single-16-bit-word character, it will not be changed). +// +// Implementation is in url_canon_icu.cc. +bool ReadUTFChar(const char16* str, int* begin, int length, + unsigned* code_point); + +// Equivalent to U16_APPEND_UNSAFE in ICU but uses our output method. +inline void AppendUTF16Value(unsigned code_point, + CanonOutputT<char16>* output) { + if (code_point > 0xffff) { + output->push_back(static_cast<char16>((code_point >> 10) + 0xd7c0)); + output->push_back(static_cast<char16>((code_point & 0x3ff) | 0xdc00)); + } else { + output->push_back(static_cast<char16>(code_point)); + } +} + +// Escaping functions --------------------------------------------------------- + +// Writes the given character to the output as UTF-8, escaped. Call this +// function only when the input is wide. Returns true on success. Failure +// means there was some problem with the encoding, we'll still try to +// update the |*begin| pointer and add a placeholder character to the +// output so processing can continue. +// +// We will append the character starting at ch[begin] with the buffer ch +// being |length|. |*begin| will be updated to point to the last character +// consumed (we may consume more than one for UTF-16) so that if called in +// a loop, incrementing the pointer will move to the next character. +// +// Every single output character will be escaped. This means that if you +// give it an ASCII character as input, it will be escaped. Some code uses +// this when it knows that a character is invalid according to its rules +// for validity. If you don't want escaping for ASCII characters, you will +// have to filter them out prior to calling this function. +// +// Assumes that ch[begin] is within range in the array, but does not assume +// that any following characters are. 
+inline bool AppendUTF8EscapedChar(const char16* str, int* begin, int length, + CanonOutput* output) { + // UTF-16 input. Readchar16 will handle invalid characters for us and give + // us the kUnicodeReplacementCharacter, so we don't have to do special + // checking after failure, just pass through the failure to the caller. + unsigned char_value; + bool success = ReadUTFChar(str, begin, length, &char_value); + AppendUTF8EscapedValue(char_value, output); + return success; +} + +// Handles UTF-8 input. See the wide version above for usage. +inline bool AppendUTF8EscapedChar(const char* str, int* begin, int length, + CanonOutput* output) { + // ReadUTF8Char will handle invalid characters for us and give us the + // kUnicodeReplacementCharacter, so we don't have to do special checking + // after failure, just pass through the failure to the caller. + unsigned ch; + bool success = ReadUTFChar(str, begin, length, &ch); + AppendUTF8EscapedValue(ch, output); + return success; +} + +// Given a '%' character at |*begin| in the string |spec|, this will decode +// the escaped value and put it into |*unescaped_value| on success (returns +// true). On failure, this will return false, and will not write into +// |*unescaped_value|. +// +// |*begin| will be updated to point to the last character of the escape +// sequence so that when called with the index of a for loop, the next time +// through it will point to the next character to be considered. On failure, +// |*begin| will be unchanged. +inline bool Is8BitChar(char c) { + return true; // this case is specialized to avoid a warning +} +inline bool Is8BitChar(char16 c) { + return c <= 255; +} + +template<typename CHAR> +inline bool DecodeEscaped(const CHAR* spec, int* begin, int end, + unsigned char* unescaped_value) { + if (*begin + 3 > end || + !Is8BitChar(spec[*begin + 1]) || !Is8BitChar(spec[*begin + 2])) { + // Invalid escape sequence because there's not enough room, or the + // digits are not ASCII. 
+ return false; + } + + unsigned char first = static_cast<unsigned char>(spec[*begin + 1]); + unsigned char second = static_cast<unsigned char>(spec[*begin + 2]); + if (!IsHexChar(first) || !IsHexChar(second)) { + // Invalid hex digits, fail. + return false; + } + + // Valid escape sequence. + *unescaped_value = (HexCharToValue(first) << 4) + HexCharToValue(second); + *begin += 2; + return true; +} + +// Appends the given substring to the output, escaping "some" characters that +// it feels may not be safe. It assumes the input values are all contained in +// 8-bit although it allows any type. +// +// This is used in error cases to append invalid output so that it looks +// approximately correct. Non-error cases should not call this function since +// the escaping rules are not guaranteed! +void AppendInvalidNarrowString(const char* spec, int begin, int end, + CanonOutput* output); +void AppendInvalidNarrowString(const char16* spec, int begin, int end, + CanonOutput* output); + +// Misc canonicalization helpers ---------------------------------------------- + +// Converts between UTF-8 and UTF-16, returning true on successful conversion. +// The output will be appended to the given canonicalizer output (so make sure +// it's empty if you want to replace). +// +// On invalid input, this will still write as much output as possible, +// replacing the invalid characters with the "invalid character". It will +// return false in the failure case, and the caller should not continue as +// normal. +bool ConvertUTF16ToUTF8(const char16* input, int input_len, + CanonOutput* output); +bool ConvertUTF8ToUTF16(const char* input, int input_len, + CanonOutputT<char16>* output); + +// Converts from UTF-16 to 8-bit using the character set converter. If the +// converter is NULL, this will use UTF-8. 
void ConvertUTF16ToQueryEncoding(const char16* input,
                                 const url_parse::Component& query,
                                 CharsetConverter* converter,
                                 CanonOutput* output);

// Applies the replacements to the given component source. The component source
// should be pre-initialized to the "old" base. That is, all pointers will
// point to the spec of the old URL, and all of the Parsed components will
// be indices into that string.
//
// The pointers and components in the |source| for all non-NULL strings in the
// |repl| (replacements) will be updated to reference those strings.
// Canonicalizing with the new |source| and |parsed| can then combine URL
// components from many different strings.
void SetupOverrideComponents(const char* base,
                             const Replacements<char>& repl,
                             URLComponentSource<char>* source,
                             url_parse::Parsed* parsed);

// Like the above 8-bit version, except that it additionally converts the
// UTF-16 input to UTF-8 before doing the overrides.
//
// The given utf8_buffer is used to store the converted components. They will
// be appended one after another, with the parsed structure identifying the
// appropriate substrings. This buffer is a parameter because the source has
// no storage, so the buffer must have the same lifetime as the source
// parameter owned by the caller.
//
// THE CALLER MUST NOT ADD TO THE |utf8_buffer| AFTER THIS CALL. Members of
// |source| will point into this buffer, which could be invalidated if
// additional data is added and the CanonOutput resizes its buffer.
//
// Returns true on success. False means that the input was not valid UTF-16,
// although we will have still done the override with "invalid characters" in
// place of errors.
bool SetupUTF16OverrideComponents(const char* base,
                                  const Replacements<char16>& repl,
                                  CanonOutput* utf8_buffer,
                                  URLComponentSource<char>* source,
                                  url_parse::Parsed* parsed);

// Implemented in url_canon_path.cc, these are required by the relative URL
// resolver as well, so we declare them here.
bool CanonicalizePartialPath(const char* spec,
                             const url_parse::Component& path,
                             int path_begin_in_output,
                             CanonOutput* output);
bool CanonicalizePartialPath(const char16* spec,
                             const url_parse::Component& path,
                             int path_begin_in_output,
                             CanonOutput* output);

#ifndef WIN32

// Implementations of Windows' int-to-string conversions
int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix);
int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix);

// Secure template overloads for these functions. The buffer size is deduced
// from the array type, so callers holding a real array cannot pass a
// mismatched length.
template<size_t N>
inline int _itoa_s(int value, char (&buffer)[N], int radix) {
  return _itoa_s(value, buffer, N, radix);
}

template<size_t N>
inline int _itow_s(int value, char16 (&buffer)[N], int radix) {
  return _itow_s(value, buffer, N, radix);
}

// _strtoui64 and strtoull behave the same
inline unsigned long long _strtoui64(const char* nptr,
                                     char** endptr, int base) {
  return strtoull(nptr, endptr, base);
}

#endif  // !WIN32

}  // namespace url_canon

#endif  // GOOGLEURL_SRC_URL_CANON_INTERNAL_H__
diff --git a/googleurl/src/url_canon_internal_file.h b/googleurl/src/url_canon_internal_file.h new file mode 100644 index 0000000..63a9c5b --- /dev/null +++ b/googleurl/src/url_canon_internal_file.h @@ -0,0 +1,157 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// As with url_canon_internal.h, this file is intended to be included in +// another C++ file where the template types are defined. This allows the +// programmer to use this to use these functions for their own strings +// types, without bloating the code by having inline templates used in +// every call site. +// +// *** This file must be included after url_canon_internal as we depend on some +// functions in it. *** + +#ifndef GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ +#define GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__ + +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +using namespace url_canon; + +// Given a pointer into the spec, this copies and canonicalizes the drive +// letter and colon to the output, if one is found. 
// If there is not a drive
// spec, it won't do anything. The index of the next character in the input
// spec is returned (after the colon when a drive spec is found, the begin
// offset if one is not).
template<typename CHAR>
static int FileDoDriveSpec(const CHAR* spec, int begin, int end,
                           CanonOutput* output) {
  // The path could be one of several things: /foo/bar, c:/foo/bar, /c:/foo,
  // (with backslashes instead of slashes as well).
  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
  int after_slashes = begin + num_slashes;

  if (!DoesBeginWindowsDriveSpec(spec, after_slashes, end))
    return begin;  // Haven't consumed any characters

  // DoesBeginWindowsDriveSpec will ensure that the drive letter is valid
  // and that it is followed by a colon/pipe.

  // Normalize Windows drive letters to uppercase
  if (spec[after_slashes] >= 'a' && spec[after_slashes] <= 'z')
    output->push_back(spec[after_slashes] - 'a' + 'A');
  else
    output->push_back(static_cast<char>(spec[after_slashes]));

  // Normalize the character following it to a colon rather than pipe, and
  // write the slash that separates the drive spec from the rest of the path.
  output->push_back(':');
  output->push_back('/');
  return after_slashes + 2;
}

// Writes the part of the path that follows the (optional) drive spec. The
// slash preceding it has already been written -- by FileDoDriveSpec after a
// drive spec, or by DoCanonicalizeFileURL at the start of the path -- so we
// only need to write everything following the slashes using the path
// canonicalizer.
template<typename CHAR, typename UCHAR>
static void FileDoPath(const CHAR* spec, int begin, int end,
                       CanonOutput* output) {
  // Normalize the number of slashes after the drive letter. The path
  // canonicalizer expects the input to begin in a slash already so
  // doesn't check. Skipping every leading slash here means input with no
  // slashes at all is handled the same way.
  int num_slashes = CountConsecutiveSlashes(spec, begin, end);
  int after_slashes = begin + num_slashes;

  // Now use the regular path canonicalizer to canonicalize the rest of the
  // path. We supply it with the path following the slashes. It won't prepend
  // a slash because it assumes any nonempty path already starts with one.
  // We explicitly filter out calls with no path here to prevent that case.
  ParsedURL::Component sub_path(after_slashes, end - after_slashes);
  if (sub_path.len > 0) {
    // Give it a fake output component to write into. DoCanonicalizeFile will
    // compute the full path component.
    ParsedURL::Component fake_output_path;
    URLCanonInternal<CHAR, UCHAR>::DoPath(
        spec, sub_path, output, &fake_output_path);
  }
}

// Canonicalizes a file: URL described by |source| and |parsed| into |output|,
// recording where each written component landed in |new_parsed|. Returns the
// combined success of the host, query and ref canonicalizers; the scheme and
// path steps do not contribute to the result.
template<typename CHAR, typename UCHAR>
static bool DoCanonicalizeFileURL(const URLComponentSource<CHAR>& source,
                                  const ParsedURL& parsed,
                                  CanonOutput* output,
                                  ParsedURL* new_parsed) {
  // Things we don't set in file: URLs.
  new_parsed->username = ParsedURL::Component(0, -1);
  new_parsed->password = ParsedURL::Component(0, -1);
  new_parsed->port = ParsedURL::Component(0, -1);

  // Scheme (known, so we don't bother running it through the more
  // complicated scheme canonicalizer).
  new_parsed->scheme.begin = output->length();
  output->push_back('f');
  output->push_back('i');
  output->push_back('l');
  output->push_back('e');
  new_parsed->scheme.len = output->length() - new_parsed->scheme.begin;
  output->push_back(':');

  // Write the separator for the host.
  output->push_back('/');
  output->push_back('/');

  // Append the host. For many file URLs, this will be empty. For UNC, this
  // will be present.
  // TODO(brettw) This doesn't do any checking for host name validity. We
  // should probably handle validity checking of UNC hosts differently than
  // for regular IP hosts.
  bool success = URLCanonInternal<CHAR, UCHAR>::DoHost(
      source.host, parsed.host, output, &new_parsed->host);

  // Write a separator for the start of the path. We'll ignore any slashes
  // already at the beginning of the path.
  new_parsed->path.begin = output->length();
  output->push_back('/');

  // Copies and normalizes the "c:" at the beginning, if present.
  int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
                                    parsed.path.end(), output);

  // Copies the rest of the path
  FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
  new_parsed->path.len = output->length() - new_parsed->path.begin;

  // Things following the path we can use the standard canonicalizers for.
  success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
      source.query, parsed.query, output, &new_parsed->query);
  success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
      source.ref, parsed.ref, output, &new_parsed->ref);

  return success;
}

#endif  // GOOGLEURL_SRC_URL_CANON_INTERNAL_FILE_H__
diff --git a/googleurl/src/url_canon_ip.cc b/googleurl/src/url_canon_ip.cc new file mode 100644 index 0000000..d84ff7d --- /dev/null +++ b/googleurl/src/url_canon_ip.cc @@ -0,0 +1,734 @@ +// Copyright 2009, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "googleurl/src/url_canon_ip.h"

#include <stdlib.h>

#include "base/basictypes.h"
#include "base/logging.h"
#include "googleurl/src/url_canon_internal.h"

namespace url_canon {

namespace {

// Converts one of the character types that represent a numerical base to the
// corresponding base. Any other character type yields 0.
int BaseForType(SharedCharTypes type) {
  switch (type) {
    case CHAR_HEX:
      return 16;
    case CHAR_DEC:
      return 10;
    case CHAR_OCT:
      return 8;
    default:
      return 0;
  }
}

// Splits |host| (a range inside |spec|) at each '.' and stores the resulting
// ranges in |components|. Returns false if the host cannot be an IPv4
// address: it contains a character that is not an IPv4 character, it has an
// empty component (other than the one produced by a single trailing dot), or
// it has more than four components. On success, any of the four slots that
// were not used are set to a default-constructed Component.
template<typename CHAR, typename UCHAR>
bool DoFindIPv4Components(const CHAR* spec,
                          const url_parse::Component& host,
                          url_parse::Component components[4]) {
  int cur_component = 0;  // Index of the component we're working on.
  int cur_component_begin = host.begin;  // Start of the current component.
  int end = host.end();
  // The loop deliberately has no condition: it runs one step past the last
  // character so that reaching |end| closes the final component, and it is
  // left only through the break/return statements below.
  for (int i = host.begin; /* nothing */; i++) {
    if (i == end || spec[i] == '.') {
      // Found the end of the current component.
      int component_len = i - cur_component_begin;
      components[cur_component] =
          url_parse::Component(cur_component_begin, component_len);

      // The next component starts after the dot.
      cur_component_begin = i + 1;
      cur_component++;

      // Don't allow empty components (two dots in a row), except we may
      // allow an empty component at the end (this would indicate that the
      // input ends in a dot). We also want to error if the component is
      // empty and it's the only component (cur_component == 1).
      if (component_len == 0 && (i != end || cur_component == 1))
        return false;

      if (i == end)
        break;  // End of the input.

      if (cur_component == 4) {
        // Anything else after the 4th component is an error unless it is a
        // dot that would otherwise be treated as the end of input.
        if (spec[i] == '.' && i + 1 == end)
          break;
        return false;
      }
    } else if (static_cast<UCHAR>(spec[i]) >= 0x80 ||
               !IsIPv4Char(static_cast<unsigned char>(spec[i]))) {
      // Invalid character for an IPv4 address.
      return false;
    }
  }

  // Fill in any unused components.
  while (cur_component < 4)
    components[cur_component++] = url_parse::Component();
  return true;
}

// Converts an IPv4 component to a 32-bit number, while checking for overflow.
//
// Possible return values:
// - IPV4    - The number was valid, and did not overflow.
// - BROKEN  - The input was numeric, but too large for a 32-bit field.
// - NEUTRAL - Input was not numeric.
//
// |*number| is written only when IPV4 is returned.
//
// The input is assumed to be ASCII. FindIPv4Components should have stripped
// out any input that is greater than 7 bits. The components are assumed
// to be non-empty.
template<typename CHAR>
CanonHostInfo::Family IPv4ComponentToNumber(
    const CHAR* spec,
    const url_parse::Component& component,
    uint32* number) {
  // Figure out the base
  SharedCharTypes base;
  int base_prefix_len = 0;  // Size of the prefix for this base.
  if (spec[component.begin] == '0') {
    // Either hex or octal, or a standalone zero.
    if (component.len == 1) {
      base = CHAR_DEC;
    } else if (spec[component.begin + 1] == 'X' ||
               spec[component.begin + 1] == 'x') {
      base = CHAR_HEX;
      base_prefix_len = 2;
    } else {
      base = CHAR_OCT;
      base_prefix_len = 1;
    }
  } else {
    base = CHAR_DEC;
  }

  // Extend the prefix to consume all leading zeros.
  while (base_prefix_len < component.len &&
         spec[component.begin + base_prefix_len] == '0')
    base_prefix_len++;

  // Put the component, minus any base prefix, into a NULL-terminated buffer so
  // we can call the standard library. Because leading zeros have already been
  // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
  // overflow check.
  const int kMaxComponentLen = 16;
  char buf[kMaxComponentLen + 1];  // digits + '\0'
  int dest_i = 0;
  for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
    // We know the input is 7-bit, so convert to narrow (if this is the wide
    // version of the template) by casting.
    char input = static_cast<char>(spec[i]);

    // Validate that this character is OK for the given base.
    if (!IsCharOfType(input, base))
      return CanonHostInfo::NEUTRAL;

    // Fill the buffer, if there's space remaining. This check allows us to
    // verify that all characters are numeric, even those that don't fit.
    if (dest_i < kMaxComponentLen)
      buf[dest_i++] = input;
  }

  buf[dest_i] = '\0';

  // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
  // number can overflow a 64-bit number in <= 16 characters).
  uint64 num = _strtoui64(buf, NULL, BaseForType(base));

  // Check for 32-bit overflow.
  if (num > kuint32max)
    return CanonHostInfo::BROKEN;

  // No overflow. Success!
  *number = static_cast<uint32>(num);
  return CanonHostInfo::IPV4;
}

// Writes the given address (with each character representing one dotted
// part of an IPv4 address) to the output, and updating |*out_host| to
// identify the added portion.
+void AppendIPv4Address(const unsigned char address[4], + CanonOutput* output, + url_parse::Component* out_host) { + out_host->begin = output->length(); + for (int i = 0; i < 4; i++) { + char str[16]; + _itoa_s(address[i], str, 10); + + for (int ch = 0; str[ch] != 0; ch++) + output->push_back(str[ch]); + + if (i != 3) + output->push_back('.'); + } + out_host->len = output->length() - out_host->begin; +} + +// See declaration of IPv4AddressToNumber for documentation. +template<typename CHAR> +CanonHostInfo::Family DoIPv4AddressToNumber(const CHAR* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components) { + // The identified components. Not all may exist. + url_parse::Component components[4]; + if (!FindIPv4Components(spec, host, components)) + return CanonHostInfo::NEUTRAL; + + // Convert existing components to digits. Values up to + // |existing_components| will be valid. + uint32 component_values[4]; + int existing_components = 0; + for (int i = 0; i < 4; i++) { + if (components[i].len <= 0) + continue; + CanonHostInfo::Family family = IPv4ComponentToNumber( + spec, components[i], &component_values[existing_components]); + + // Stop if we hit an invalid non-empty component. + if (family != CanonHostInfo::IPV4) + return family; + + existing_components++; + } + + // Use that sequence of numbers to fill out the 4-component IP address. + + // First, process all components but the last, while making sure each fits + // within an 8-bit field. + for (int i = 0; i < existing_components - 1; i++) { + if (component_values[i] > kuint8max) + return CanonHostInfo::BROKEN; + address[i] = static_cast<unsigned char>(component_values[i]); + } + + // Next, consume the last component to fill in the remaining bytes. 
+ uint32 last_value = component_values[existing_components - 1]; + for (int i = 3; i >= existing_components - 1; i--) { + address[i] = static_cast<unsigned char>(last_value); + last_value >>= 8; + } + + // If the last component has residual bits, report overflow. + if (last_value != 0) + return CanonHostInfo::BROKEN; + + // Tell the caller how many components we saw. + *num_ipv4_components = existing_components; + + // Success! + return CanonHostInfo::IPV4; +} + +// Return true if we've made a final IPV4/BROKEN decision, false if the result +// is NEUTRAL, and we could use a second opinion. +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeIPv4Address(const CHAR* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + unsigned char address[4]; + host_info->family = IPv4AddressToNumber( + spec, host, address, &host_info->num_ipv4_components); + + switch (host_info->family) { + case CanonHostInfo::IPV4: + // Definitely an IPv4 address. + AppendIPv4Address(address, output, &host_info->out_host); + return true; + case CanonHostInfo::BROKEN: + // Definitely broken. + return true; + default: + // Could be IPv6 or a hostname. + return false; + } +} + +// Helper class that describes the main components of an IPv6 input string. 
+// See the following examples to understand how it breaks up an input string: +// +// [Example 1]: input = "[::aa:bb]" +// ==> num_hex_components = 2 +// ==> hex_components[0] = Component(3,2) "aa" +// ==> hex_components[1] = Component(6,2) "bb" +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(0, -1) +// +// [Example 2]: input = "[1:2::3:4:5]" +// ==> num_hex_components = 5 +// ==> hex_components[0] = Component(1,1) "1" +// ==> hex_components[1] = Component(3,1) "2" +// ==> hex_components[2] = Component(6,1) "3" +// ==> hex_components[3] = Component(8,1) "4" +// ==> hex_components[4] = Component(10,1) "5" +// ==> index_of_contraction = 2 +// ==> ipv4_component = Component(0, -1) +// +// [Example 3]: input = "[::ffff:192.168.0.1]" +// ==> num_hex_components = 1 +// ==> hex_components[0] = Component(3,4) "ffff" +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(8, 11) "192.168.0.1" +// +// [Example 4]: input = "[1::]" +// ==> num_hex_components = 1 +// ==> hex_components[0] = Component(1,1) "1" +// ==> index_of_contraction = 1 +// ==> ipv4_component = Component(0, -1) +// +// [Example 5]: input = "[::192.168.0.1]" +// ==> num_hex_components = 0 +// ==> index_of_contraction = 0 +// ==> ipv4_component = Component(3, 11) "192.168.0.1" +// +struct IPv6Parsed { + // Zero-out the parse information. + void reset() { + num_hex_components = 0; + index_of_contraction = -1; + ipv4_component.reset(); + } + + // There can be up to 8 hex components (colon separated) in the literal. + url_parse::Component hex_components[8]; + + // The count of hex components present. Ranges from [0,8]. + int num_hex_components; + + // The index of the hex component that the "::" contraction precedes, or + // -1 if there is no contraction. + int index_of_contraction; + + // The range of characters which are an IPv4 literal. + url_parse::Component ipv4_component; +}; + +// Parse the IPv6 input string.
If parsing succeeded returns true and fills +// |parsed| with the information. If parsing failed (because the input is +// invalid) returns false. +template<typename CHAR, typename UCHAR> +bool DoParseIPv6(const CHAR* spec, + const url_parse::Component& host, + IPv6Parsed* parsed) { + // Zero-out the info. + parsed->reset(); + + if (!host.is_nonempty()) + return false; + + // The index for start and end of address range (no brackets). + int begin = host.begin; + int end = host.end(); + + int cur_component_begin = begin; // Start of the current component. + + // Scan through the input, searching for hex components, "::" contractions, + // and IPv4 components. + for (int i = begin; /* i <= end */; i++) { + bool is_colon = spec[i] == ':'; + bool is_contraction = is_colon && i < end - 1 && spec[i + 1] == ':'; + + // We reached the end of the current component if we encounter a colon + // (separator between hex components, or start of a contraction), or end of + // input. + if (is_colon || i == end) { + int component_len = i - cur_component_begin; + + // A component should not have more than 4 hex digits. + if (component_len > 4) + return false; + + // Don't allow empty components. + if (component_len == 0) { + // The exception is when contractions appear at beginning of the + // input or at the end of the input. + if (!((is_contraction && i == begin) || (i == end && + parsed->index_of_contraction == parsed->num_hex_components))) + return false; + } + + // Add the hex component we just found to running list. + if (component_len > 0) { + // Can't have more than 8 components! + if (parsed->num_hex_components >= 8) + return false; + + parsed->hex_components[parsed->num_hex_components++] = + url_parse::Component(cur_component_begin, component_len); + } + } + + if (i == end) + break; // Reached the end of the input, DONE. + + // We found a "::" contraction. + if (is_contraction) { + // There can be at most one contraction in the literal. 
+ if (parsed->index_of_contraction != -1) + return false; + parsed->index_of_contraction = parsed->num_hex_components; + ++i; // Consume the colon we peeked. + } + + if (is_colon) { + // Colons are separators between components, keep track of where the + // current component started (after this colon). + cur_component_begin = i + 1; + } else { + if (static_cast<UCHAR>(spec[i]) >= 0x80) + return false; // Not ASCII. + + if (!IsHexChar(static_cast<unsigned char>(spec[i]))) { + // Regular components are hex numbers. It is also possible for + // a component to be an IPv4 address in dotted form. + if (IsIPv4Char(static_cast<unsigned char>(spec[i]))) { + // Since an IPv4 address can only appear at the end, assume the rest + // of the string is an IPv4 address. (We will parse this separately + // later). + parsed->ipv4_component = url_parse::Component( + cur_component_begin, end - cur_component_begin); + break; + } else { + // The character was neither a hex digit, nor an IPv4 character. + return false; + } + } + } + } + + return true; +} + +// Verifies the parsed IPv6 information, checking that the various components +// add up to the right number of bits (hex components are 16 bits, while +// embedded IPv4 formats are 32 bits, and contractions are placeholders for +// 16 or more bits). Returns true if sizes match up, false otherwise. On +// success writes the length of the contraction (if any) to +// |out_num_bytes_of_contraction|. +bool CheckIPv6ComponentsSize(const IPv6Parsed& parsed, + int* out_num_bytes_of_contraction) { + // Each group of four hex digits contributes 16 bits. + int num_bytes_without_contraction = parsed.num_hex_components * 2; + + // If an IPv4 address was embedded at the end, it contributes 32 bits. + if (parsed.ipv4_component.is_valid()) + num_bytes_without_contraction += 4; + + // If there was a "::" contraction, its size is going to be: + // MAX([16bits], [128bits] - num_bytes_without_contraction).
+ int num_bytes_of_contraction = 0; + if (parsed.index_of_contraction != -1) { + num_bytes_of_contraction = 16 - num_bytes_without_contraction; + if (num_bytes_of_contraction < 2) + num_bytes_of_contraction = 2; + } + + // Check that the numbers add up. + if (num_bytes_without_contraction + num_bytes_of_contraction != 16) + return false; + + *out_num_bytes_of_contraction = num_bytes_of_contraction; + return true; +} + +// Converts a hex component into a number. This cannot fail since the caller has +// already verified that each character in the string was a hex digit, and +// that there were no more than 4 characters. +template<typename CHAR> +uint16 IPv6HexComponentToNumber(const CHAR* spec, + const url_parse::Component& component) { + DCHECK(component.len <= 4); + + // Copy the hex string into a C-string. + char buf[5]; + for (int i = 0; i < component.len; ++i) + buf[i] = static_cast<char>(spec[component.begin + i]); + buf[component.len] = '\0'; + + // Convert it to a number (overflow is not possible, since with 4 hex + // characters we can at most have a 16 bit number). + return static_cast<uint16>(_strtoui64(buf, NULL, 16)); +} + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +template<typename CHAR, typename UCHAR> +bool DoIPv6AddressToNumber(const CHAR* spec, + const url_parse::Component& host, + unsigned char address[16]) { + // Make sure the component is bounded by '[' and ']'. + int end = host.end(); + if (!host.is_nonempty() || spec[host.begin] != '[' || spec[end - 1] != ']') + return false; + + // Exclude the square brackets. + url_parse::Component ipv6_comp(host.begin + 1, host.len - 2); + + // Parse the IPv6 address -- identify where all the colon separated hex + // components are, the "::" contraction, and the embedded IPv4 address.
+ IPv6Parsed ipv6_parsed; + if (!DoParseIPv6<CHAR, UCHAR>(spec, ipv6_comp, &ipv6_parsed)) + return false; + + // Do some basic size checks to make sure that the address doesn't + // specify more than 128 bits or fewer than 128 bits. This also resolves + // how many zero bytes the "::" contraction represents. + int num_bytes_of_contraction; + if (!CheckIPv6ComponentsSize(ipv6_parsed, &num_bytes_of_contraction)) + return false; + + int cur_index_in_address = 0; + + // Loop through each hex component, and the contraction, in order. + for (int i = 0; i <= ipv6_parsed.num_hex_components; ++i) { + // Append the contraction if it appears before this component. + if (i == ipv6_parsed.index_of_contraction) { + for (int j = 0; j < num_bytes_of_contraction; ++j) + address[cur_index_in_address++] = 0; + } + // Append the hex component's value. + if (i != ipv6_parsed.num_hex_components) { + // Get the 16-bit value for this hex component. + uint16 number = IPv6HexComponentToNumber<CHAR>( + spec, ipv6_parsed.hex_components[i]); + // Append to |address|, in network byte order. + address[cur_index_in_address++] = (number & 0xFF00) >> 8; + address[cur_index_in_address++] = (number & 0x00FF); + } + } + + // If there was an IPv4 section, convert it into a 32-bit number and append + // it to |address|. + if (ipv6_parsed.ipv4_component.is_valid()) { + // We only allow the embedded IPv4 syntax to be used for "compat" and + // "mapped" formats: + // "mapped" ==> 0:0:0:0:0:ffff:<IPv4-literal> + // "compat" ==> 0:0:0:0:0:0000:<IPv4-literal> + for (int j = 0; j < 10; ++j) { + if (address[j] != 0) + return false; + } + if (!((address[10] == 0 && address[11] == 0) || + (address[10] == 0xFF && address[11] == 0xFF))) + return false; + + // Append the 32-bit number to |address|.
+ int ignored_num_ipv4_components; + if (CanonHostInfo::IPV4 != + IPv4AddressToNumber(spec, + ipv6_parsed.ipv4_component, + &address[cur_index_in_address], + &ignored_num_ipv4_components)) + return false; + } + + return true; +} + +// Searches for the longest sequence of zeros in |address|, and writes the +// range into |contraction_range|. The run of zeros must be longer than 16 bits, +// and if there is a tie the first is chosen. +void ChooseIPv6ContractionRange(const unsigned char address[16], + url_parse::Component* contraction_range) { + // The longest run of zeros in |address| seen so far. + url_parse::Component max_range; + + // The current run of zeros in |address| being iterated over. + url_parse::Component cur_range; + + for (int i = 0; i < 16; i += 2) { + // Test for 16 bits worth of zero. + bool is_zero = (address[i] == 0 && address[i + 1] == 0); + + if (is_zero) { + // Add the zero to the current range (or start a new one). + if (!cur_range.is_valid()) + cur_range = url_parse::Component(i, 0); + cur_range.len += 2; + } + + if (!is_zero || i == 14) { + // Just completed a run of zeros. If the run is greater than 16 bits, + // it is a candidate for the contraction. + if (cur_range.len > 2 && cur_range.len > max_range.len) { + max_range = cur_range; + } + cur_range.reset(); + } + } + *contraction_range = max_range; +} + +// Return true if we've made a final IPV6/BROKEN decision, false if the result +// is NEUTRAL, and we could use a second opinion. +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeIPv6Address(const CHAR* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + // Turn the IP address into a 128 bit number. + unsigned char address[16]; + if (!IPv6AddressToNumber(spec, host, address)) { + // If it's not an IPv6 address, scan for characters that should *only* + // exist in an IPv6 address.
+ for (int i = host.begin; i < host.end(); i++) { + switch (spec[i]) { + case '[': + case ']': + case ':': + host_info->family = CanonHostInfo::BROKEN; + return true; + } + } + + // No invalid characters. Could still be IPv4 or a hostname. + host_info->family = CanonHostInfo::NEUTRAL; + return false; + } + + host_info->out_host.begin = output->length(); + output->push_back('['); + + // We will now output the address according to the rules in: + // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4 + + // Start by finding where to place the "::" contraction (if any). + url_parse::Component contraction_range; + ChooseIPv6ContractionRange(address, &contraction_range); + + for (int i = 0; i <= 14;) { + // We check 2 bytes at a time, from bytes (0, 1) to (14, 15), inclusive. + DCHECK(i % 2 == 0); + if (i == contraction_range.begin && contraction_range.len > 0) { + // Jump over the contraction. + if (i == 0) + output->push_back(':'); + output->push_back(':'); + i = contraction_range.end(); + } else { + // Consume the next 16 bits from |address|. + int x = address[i] << 8 | address[i + 1]; + + i += 2; + + // Stringify the 16 bit number (at most requires 4 hex digits). + char str[5]; + _itoa_s(x, str, 16); + for (int ch = 0; str[ch] != 0; ++ch) + output->push_back(str[ch]); + + // Put a colon after each number, except the last. 
+ if (i < 16) + output->push_back(':'); + } + } + + output->push_back(']'); + host_info->out_host.len = output->length() - host_info->out_host.begin; + + host_info->family = CanonHostInfo::IPV6; + return true; +} + +} // namespace + +bool FindIPv4Components(const char* spec, + const url_parse::Component& host, + url_parse::Component components[4]) { + return DoFindIPv4Components<char, unsigned char>(spec, host, components); +} + +bool FindIPv4Components(const char16* spec, + const url_parse::Component& host, + url_parse::Component components[4]) { + return DoFindIPv4Components<char16, char16>(spec, host, components); +} + +void CanonicalizeIPAddress(const char* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address<char, unsigned char>( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address<char, unsigned char>( + spec, host, output, host_info)) + return; +} + +void CanonicalizeIPAddress(const char16* spec, + const url_parse::Component& host, + CanonOutput* output, + CanonHostInfo* host_info) { + if (DoCanonicalizeIPv4Address<char16, char16>( + spec, host, output, host_info)) + return; + if (DoCanonicalizeIPv6Address<char16, char16>( + spec, host, output, host_info)) + return; +} + +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber<char>(spec, host, address, num_ipv4_components); +} + +CanonHostInfo::Family IPv4AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components) { + return DoIPv4AddressToNumber<char16>( + spec, host, address, num_ipv4_components); +} + +bool IPv6AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber<char, unsigned char>(spec, host, address); +} + +bool 
IPv6AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[16]) { + return DoIPv6AddressToNumber<char16, char16>(spec, host, address); +} + + +} // namespace url_canon diff --git a/googleurl/src/url_canon_ip.h b/googleurl/src/url_canon_ip.h new file mode 100644 index 0000000..6ce069d --- /dev/null +++ b/googleurl/src/url_canon_ip.h @@ -0,0 +1,98 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef GOOGLEURL_SRC_URL_CANON_IP_H__ +#define GOOGLEURL_SRC_URL_CANON_IP_H__ + +#include "base/string16.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_parse.h" + +namespace url_canon { + +// Searches the host name for the portions of the IPv4 address. On success, +// each component will be placed into |components| and it will return true. +// It will return false if the host can not be separated as an IPv4 address +// or if there are any non-7-bit characters or other characters that can not +// be in an IP address. (This is important so we fail as early as possible for +// common non-IP hostnames.) +// +// Not all components may exist. If there are only 3 components, for example, +// the last one will have a length of -1 or 0 to indicate it does not exist. +// +// Note that many platforms' inet_addr will ignore everything after a space +// in certain circumstances if the stuff before the space looks like an IP +// address. IE6 is included in this. We do NOT handle this case. In many cases, +// the browser's canonicalization will get run before this which converts +// spaces to %20 (in the case of IE7) or rejects them (in the case of +// Mozilla), so this code path never gets hit. Our host canonicalization will +// notice these spaces and escape them, which will make IP address finding +// fail. This seems like better behavior than stripping after a space. +bool FindIPv4Components(const char* spec, + const url_parse::Component& host, + url_parse::Component components[4]); +bool FindIPv4Components(const char16* spec, + const url_parse::Component& host, + url_parse::Component components[4]); + +// Converts an IPv4 address to a 32-bit number (network byte order). +// +// Possible return values: +// IPV4 - IPv4 address was successfully parsed. +// BROKEN - Input was formatted like an IPv4 address, but overflow occurred +// during parsing. +// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
+// It might be an IPv6 address, or a hostname. +// +// On success, |num_ipv4_components| will be populated with the number of +// components in the IPv4 address. +CanonHostInfo::Family IPv4AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); +CanonHostInfo::Family IPv4AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[4], + int* num_ipv4_components); + +// Converts an IPv6 address to a 128-bit number (network byte order), returning +// true on success. False means that the input was not a valid IPv6 address. +// +// NOTE that |host| is expected to be surrounded by square brackets. +// i.e. "[::1]" rather than "::1". +bool IPv6AddressToNumber(const char* spec, + const url_parse::Component& host, + unsigned char address[16]); +bool IPv6AddressToNumber(const char16* spec, + const url_parse::Component& host, + unsigned char address[16]); + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_IP_H__ diff --git a/googleurl/src/url_canon_mailtourl.cc b/googleurl/src/url_canon_mailtourl.cc new file mode 100644 index 0000000..97868b8 --- /dev/null +++ b/googleurl/src/url_canon_mailtourl.cc @@ -0,0 +1,137 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions for canonicalizing "mailto:" URLs. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeMailtoURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + + // mailto: only uses {scheme, path, query} -- clear the rest. + new_parsed->username = url_parse::Component(); + new_parsed->password = url_parse::Component(); + new_parsed->host = url_parse::Component(); + new_parsed->port = url_parse::Component(); + new_parsed->ref = url_parse::Component(); + + // Scheme (known, so we don't bother running it through the more + // complicated scheme canonicalizer). 
+ new_parsed->scheme.begin = output->length(); + output->Append("mailto:", 7); + new_parsed->scheme.len = 6; + + bool success = true; + + // Path + if (parsed.path.is_valid()) { + new_parsed->path.begin = output->length(); + + // Copy the path using path URL's more lax escaping rules. + // We convert to UTF-8 and escape non-ASCII, but leave all + // ASCII characters alone. + int end = parsed.path.end(); + for (int i = parsed.path.begin; i < end; ++i) { + UCHAR uch = static_cast<UCHAR>(source.path[i]); + if (uch < 0x20 || uch >= 0x80) + success &= AppendUTF8EscapedChar(source.path, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + + new_parsed->path.len = output->length() - new_parsed->path.begin; + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query -- always use the default utf8 charset converter. + CanonicalizeQuery(source.query, parsed.query, NULL, + output, &new_parsed->query); + + return success; +} + +} // namespace + +bool CanonicalizeMailtoURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeMailtoURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, output, new_parsed); +} + +bool CanonicalizeMailtoURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeMailtoURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeMailtoURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +bool ReplaceMailtoURL(const 
char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeMailtoURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_path.cc b/googleurl/src/url_canon_path.cc new file mode 100644 index 0000000..98ca40b --- /dev/null +++ b/googleurl/src/url_canon_path.cc @@ -0,0 +1,380 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Canonicalization functions for the paths of URLs. + +#include "base/logging.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + +enum CharacterFlags { + // Pass through unchanged, whether escaped or unescaped. This doesn't + // actually set anything so you can't OR it to check, it's just to make the + // table below more clear when neither ESCAPE nor UNESCAPE is set. + PASS = 0, + + // This character requires special handling in DoPartialPath. Doing this test + // first allows us to filter out the common cases of regular characters that + // can be directly copied. + SPECIAL = 1, + + // This character must be escaped in the canonical output. Note that all + // escaped chars also have the "special" bit set so that the code that looks + // for this is triggered. Not valid with PASS or UNESCAPE. + ESCAPE_BIT = 2, + ESCAPE = ESCAPE_BIT | SPECIAL, + + // This character must be unescaped in canonical output. Not valid with + // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these + // characters unescaped, they should just be copied. + UNESCAPE = 4, + + // This character is disallowed in URLs. Note that the "special" bit is also + // set to trigger handling. + INVALID_BIT = 8, + INVALID = INVALID_BIT | SPECIAL, +}; + +// This table contains one of the above flag values.
Note some flags are more +// than one bits because they also turn on the "special" flag. Special is the +// only flag that may be combined with others. +// +// This table is designed to match exactly what IE does with the characters. +// +// Dot is even more special, and the escaped version is handled specially by +// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape" +// bit is never handled (we just need the "special") bit. +const unsigned char kPathCharLookup[0x100] = { +// NULL control chars... + INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// control chars... + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, +// ' ' ! " # $ % & ' ( ) * + , - . / + ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS, +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE, +// @ A B C D E F G H I J K L M N O + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// P Q R S T U V W X Y Z [ \ ] ^ _ + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE, +// ` a b c d e f g h i j k l m n o + ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE, +// p q r s t u v w x y z { | } ~ <NBSP> + UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE, +// ...all the high-bit characters are escaped + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, 
ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, + ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE}; + +enum DotDisposition { + // The given dot is just part of a filename and is not special. + NOT_A_DIRECTORY, + + // The given dot is the current directory. + DIRECTORY_CUR, + + // The given dot is the first of a double dot that should take us up one. + DIRECTORY_UP +}; + +// When the path resolver finds a dot, this function is called with the +// character following that dot to see what it is. The return value +// indicates what type this dot is (see above). This code handles the case +// where the dot is at the end of the input. +// +// |*consumed_len| will contain the number of characters in the input that +// express what we found. +// +// If the input is "../foo", |after_dot| = 1, |end| = 6, and +// at the end, |*consumed_len| = 2 for the "./" this function consumed. The +// original dot length should be handled by the caller. +template<typename CHAR> +DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot, + int end, int* consumed_len) { + if (after_dot == end) { + // Single dot at the end. 
+ *consumed_len = 0; + return DIRECTORY_CUR; + } + if (url_parse::IsURLSlash(spec[after_dot])) { + // Single dot followed by a slash. + *consumed_len = 1; // Consume the slash + return DIRECTORY_CUR; + } + + int second_dot_len = IsDot(spec, after_dot, end); + if (second_dot_len) { + int after_second_dot = after_dot + second_dot_len; + if (after_second_dot == end) { + // Double dot at the end. + *consumed_len = second_dot_len; + return DIRECTORY_UP; + } + if (url_parse::IsURLSlash(spec[after_second_dot])) { + // Double dot followed by a slash. + *consumed_len = second_dot_len + 1; + return DIRECTORY_UP; + } + } + + // The dots are followed by something else, not a directory. + *consumed_len = 0; + return NOT_A_DIRECTORY; +} + +// Rewinds the output to the previous slash. It is assumed that the output +// ends with a slash and this doesn't count (we call this when we are +// appending directory paths, so the previous path component has and ending +// slash). +// +// This will stop at the first slash (assumed to be at position +// |path_begin_in_output| and not go any higher than that. Some web pages +// do ".." too many times, so we need to handle that brokenness. +// +// It searches for a literal slash rather than including a backslash as well +// because it is run only on the canonical output. +// +// The output is guaranteed to end in a slash when this function completes. +void BackUpToPreviousSlash(int path_begin_in_output, + CanonOutput* output) { + DCHECK(output->length() > 0); + + int i = output->length() - 1; + DCHECK(output->at(i) == '/'); + if (i == path_begin_in_output) + return; // We're at the first slash, nothing to do. + + // Now back up (skipping the trailing slash) until we find another slash. + i--; + while (output->at(i) != '/' && i > path_begin_in_output) + i--; + + // Now shrink the output to just include that last slash we found. + output->set_length(i + 1); +} + +// Appends the given path to the output. 
It assumes that if the input path +// starts with a slash, it should be copied to the output. If no path has +// already been appended to the output (the case when not resolving +// relative URLs), the path should begin with a slash. +// +// If there are already path components (this mode is used when appending +// relative paths for resolving), it assumes that the output already has +// a trailing slash and that if the input begins with a slash, it should be +// copied to the output. +// +// We do not collapse multiple slashes in a row to a single slash. It seems +// no web browsers do this, and we don't want incompababilities, even though +// it would be correct for most systems. +template<typename CHAR, typename UCHAR> +bool DoPartialPath(const CHAR* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output) { + int end = path.end(); + + bool success = true; + for (int i = path.begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(spec[i]); + if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) { + // We only need to test wide input for having non-ASCII characters. For + // narrow input, we'll always just use the lookup table. We don't try to + // do anything tricky with decoding/validating UTF-8. This function will + // read one or two UTF-16 characters and append the output as UTF-8. This + // call will be removed in 8-bit mode. + success &= AppendUTF8EscapedChar(spec, &i, end, output); + } else { + // Normal ASCII character or 8-bit input, use the lookup table. + unsigned char out_ch = static_cast<unsigned char>(uch); + unsigned char flags = kPathCharLookup[out_ch]; + if (flags & SPECIAL) { + // Needs special handling of some sort. + int dotlen; + if ((dotlen = IsDot(spec, i, end)) > 0) { + // See if this dot was preceeded by a slash in the output. We + // assume that when canonicalizing paths, they will always + // start with a slash and not a dot, so we don't have to + // bounds check the output. 
+ // + // Note that we check this in the case of dots so we don't have to + // special case slashes. Since slashes are much more common than + // dots, this actually increases performance measurably (though + // slightly). + DCHECK(output->length() > path_begin_in_output); + if (output->length() > path_begin_in_output && + output->at(output->length() - 1) == '/') { + // Slash followed by a dot, check to see if this is means relative + int consumed_len; + switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end, + &consumed_len)) { + case NOT_A_DIRECTORY: + // Copy the dot to the output, it means nothing special. + output->push_back('.'); + i += dotlen - 1; + break; + case DIRECTORY_CUR: // Current directory, just skip the input. + i += dotlen + consumed_len - 1; + break; + case DIRECTORY_UP: + BackUpToPreviousSlash(path_begin_in_output, output); + i += dotlen + consumed_len - 1; + break; + } + } else { + // This dot is not preceeded by a slash, it is just part of some + // file name. + output->push_back('.'); + i += dotlen - 1; + } + + } else if (out_ch == '\\') { + // Convert backslashes to forward slashes + output->push_back('/'); + + } else if (out_ch == '%') { + // Handle escape sequences. + unsigned char unescaped_value; + if (DecodeEscaped(spec, &i, end, &unescaped_value)) { + // Valid escape sequence, see if we keep, reject, or unescape it. + char unescaped_flags = kPathCharLookup[unescaped_value]; + + if (unescaped_flags & UNESCAPE) { + // This escaped value shouldn't be escaped, copy it. + output->push_back(unescaped_value); + } else if (unescaped_flags & INVALID_BIT) { + // Invalid escaped character, copy it and remember the error. + output->push_back('%'); + output->push_back(static_cast<char>(spec[i - 1])); + output->push_back(static_cast<char>(spec[i])); + success = false; + } else { + // Valid escaped character but we should keep it escaped. 
We + // don't want to change the case of any hex letters in case + // the server is sensitive to that, so we just copy the two + // characters without checking (DecodeEscape will have advanced + // to the last character of the pair). + output->push_back('%'); + output->push_back(static_cast<char>(spec[i - 1])); + output->push_back(static_cast<char>(spec[i])); + } + } else { + // Invalid escape sequence. IE7 rejects any URLs with such + // sequences, while Firefox, IE6, and Safari all pass it through + // unchanged. We are more permissive unlike IE7. I don't think this + // can cause significant problems, if it does, we should change + // to be more like IE7. + output->push_back('%'); + } + + } else if (flags & INVALID_BIT) { + // For NULLs, etc. fail. + AppendEscapedChar(out_ch, output); + success = false; + + } else if (flags & ESCAPE_BIT) { + // This character should be escaped. + AppendEscapedChar(out_ch, output); + } + } else { + // Nothing special about this character, just append it. + output->push_back(out_ch); + } + } + } + return success; +} + +template<typename CHAR, typename UCHAR> +bool DoPath(const CHAR* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + bool success = true; + if (path.len > 0) { + out_path->begin = output->length(); + + // Write out an initial slash if the input has none. If we just parse a URL + // and then canonicalize it, it will of course have a slash already. This + // check is for the replacement and relative URL resolving cases of file + // URLs. + if (!url_parse::IsURLSlash(spec[path.begin])) + output->push_back('/'); + + success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output); + out_path->len = output->length() - out_path->begin; + } else { + // No input, canonical path is a slash. 
+ output->push_back('/'); + *out_path = url_parse::Component(); + } + return success; +} + +} // namespace + +bool CanonicalizePath(const char* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + return DoPath<char, unsigned char>(spec, path, output, out_path); +} + +bool CanonicalizePath(const char16* spec, + const url_parse::Component& path, + CanonOutput* output, + url_parse::Component* out_path) { + return DoPath<char16, char16>(spec, path, output, out_path); +} + +bool CanonicalizePartialPath(const char* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output) { + return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output, + output); +} + +bool CanonicalizePartialPath(const char16* spec, + const url_parse::Component& path, + int path_begin_in_output, + CanonOutput* output) { + return DoPartialPath<char16, char16>(spec, path, path_begin_in_output, + output); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_pathurl.cc b/googleurl/src/url_canon_pathurl.cc new file mode 100644 index 0000000..4a990c7 --- /dev/null +++ b/googleurl/src/url_canon_pathurl.cc @@ -0,0 +1,128 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions for canonicalizing "path" URLs. Not to be confused with the path +// of a URL, these are URLs that have no authority section, only a path. For +// example, "javascript:" and "data:". + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizePathURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + // Scheme: this will append the colon. + bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + // We assume there's no authority for path URLs. Note that hosts should never + // have -1 length. + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->host.reset(); + new_parsed->port.reset(); + + if (parsed.path.is_valid()) { + // Copy the path using path URL's more lax escaping rules (think for + // javascript:). We convert to UTF-8 and escape non-ASCII, but leave all + // ASCII characters alone. This helps readability of JavaStript. 
+ new_parsed->path.begin = output->length(); + int end = parsed.path.end(); + for (int i = parsed.path.begin; i < end; i++) { + UCHAR uch = static_cast<UCHAR>(source.path[i]); + if (uch < 0x20 || uch >= 0x80) + success &= AppendUTF8EscapedChar(source.path, &i, end, output); + else + output->push_back(static_cast<char>(uch)); + } + new_parsed->path.len = output->length() - new_parsed->path.begin; + } else { + // Empty path. + new_parsed->path.reset(); + } + + // Assume there's no query or ref. + new_parsed->query.reset(); + new_parsed->ref.reset(); + + return success; +} + +} // namespace + +bool CanonicalizePathURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizePathURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, output, new_parsed); +} + +bool CanonicalizePathURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizePathURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, output, new_parsed); +} + +bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizePathURL<char, unsigned char>( + source, parsed, output, new_parsed); +} + +bool ReplacePathURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizePathURL<char, unsigned char>( + source, 
parsed, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_query.cc b/googleurl/src/url_canon_query.cc new file mode 100644 index 0000000..cee8774 --- /dev/null +++ b/googleurl/src/url_canon_query.cc @@ -0,0 +1,189 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +// Query canonicalization in IE +// ---------------------------- +// IE is very permissive for query parameters specified in links on the page +// (in contrast to links that it constructs itself based on form data). It does +// not unescape any character. It does not reject any escape sequence (be they +// invalid like "%2y" or freaky like %00). +// +// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09), +// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier +// layer since they are removed from all portions of the URL). All other +// characters are passed unmodified. Invalid UTF-16 sequences are preserved as +// well, with each character in the input being converted to UTF-8. It is the +// server's job to make sense of this invalid query. +// +// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page) +// are converted to the invalid character and sent as unescaped UTF-8 (0xef, +// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these +// strings before the URL handler ever sees them. +// +// Our query canonicalization +// -------------------------- +// We escape all non-ASCII characters and control characters, like Firefox. +// This is more conformant to the URL spec, and there do not seem to be many +// problems relating to Firefox's behavior. +// +// Like IE, we will never unescape (although the application may want to try +// unescaping to present the user with a more understandable URL). We will +// replace all invalid sequences (including invalid UTF-16 sequences, which IE +// doesn't) with the "invalid character," and we will escape it. + +namespace url_canon { + +namespace { + +// Returns true if the characters starting at |begin| and going until |end| +// (non-inclusive) are all representable in 7-bits. 
+template<typename CHAR, typename UCHAR> +bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) { + int end = query.end(); + for (int i = query.begin; i < end; i++) { + if (static_cast<UCHAR>(spec[i]) >= 0x80) + return false; + } + return true; +} + +// Appends the given string to the output, escaping characters that do not +// match the given |type| in SharedCharTypes. This version will accept 8 or 16 +// bit characters, but assumes that they have only 7-bit values. It also assumes +// that all UTF-8 values are correct, so doesn't bother checking +template<typename CHAR> +void AppendRaw8BitQueryString(const CHAR* source, int length, + CanonOutput* output) { + for (int i = 0; i < length; i++) { + if (!IsQueryChar(static_cast<unsigned char>(source[i]))) + AppendEscapedChar(static_cast<unsigned char>(source[i]), output); + else // Doesn't need escaping. + output->push_back(static_cast<char>(source[i])); + } +} + +// Runs the converter on the given UTF-8 input. Since the converter expects +// UTF-16, we have to convert first. The converter must be non-NULL. +void RunConverter(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + // This function will replace any misencoded values with the invalid + // character. This is what we want so we don't have to check for error. + RawCanonOutputW<1024> utf16; + ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16); + converter->ConvertFromUTF16(utf16.data(), utf16.length(), output); +} + +// Runs the converter with the given UTF-16 input. We don't have to do +// anything, but this overriddden function allows us to use the same code +// for both UTF-8 and UTF-16 input. 
+void RunConverter(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + converter->ConvertFromUTF16(&spec[query.begin], query.len, output); +} + +template<typename CHAR, typename UCHAR> +void DoConvertToQueryEncoding(const CHAR* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + if (IsAllASCII<CHAR, UCHAR>(spec, query)) { + // Easy: the input can just appended with no character set conversions. + AppendRaw8BitQueryString(&spec[query.begin], query.len, output); + + } else { + // Harder: convert to the proper encoding first. + if (converter) { + // Run the converter to get an 8-bit string, then append it, escaping + // necessary values. + RawCanonOutput<1024> eight_bit; + RunConverter(spec, query, converter, &eight_bit); + AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output); + + } else { + // No converter, do our own UTF-8 conversion. + AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output); + } + } +} + +template<typename CHAR, typename UCHAR> +void DoCanonicalizeQuery(const CHAR* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query) { + if (query.len < 0) { + *out_query = url_parse::Component(); + return; + } + + output->push_back('?'); + out_query->begin = output->length(); + + DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output); + + out_query->len = output->length() - out_query->begin; +} + +} // namespace + +void CanonicalizeQuery(const char* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* out_query) { + DoCanonicalizeQuery<char, unsigned char>(spec, query, converter, + output, out_query); +} + +void CanonicalizeQuery(const char16* spec, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output, + url_parse::Component* 
out_query) { + DoCanonicalizeQuery<char16, char16>(spec, query, converter, + output, out_query); +} + +void ConvertUTF16ToQueryEncoding(const char16* input, + const url_parse::Component& query, + CharsetConverter* converter, + CanonOutput* output) { + DoConvertToQueryEncoding<char16, char16>(input, query, + converter, output); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_relative.cc b/googleurl/src/url_canon_relative.cc new file mode 100644 index 0000000..446b951 --- /dev/null +++ b/googleurl/src/url_canon_relative.cc @@ -0,0 +1,571 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Canonicalizer functions for working with and resolving relative URLs. + +#include "base/logging.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_canon { + +namespace { + +// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug +// 379034), whereas IE is case-insensetive. +// +// We choose to be more permissive like IE. We don't need to worry about +// unescaping or anything here: neither IE or Firefox allow this. We also +// don't have to worry about invalid scheme characters since we are comparing +// against the canonical scheme of the base. +// +// The base URL should always be canonical, therefore is ASCII. +template<typename CHAR> +bool AreSchemesEqual(const char* base, + const url_parse::Component& base_scheme, + const CHAR* cmp, + const url_parse::Component& cmp_scheme) { + if (base_scheme.len != cmp_scheme.len) + return false; + for (int i = 0; i < base_scheme.len; i++) { + // We assume the base is already canonical, so we don't have to + // canonicalize it. + if (CanonicalSchemeChar(cmp[cmp_scheme.begin + i]) != + base[base_scheme.begin + i]) + return false; + } + return true; +} + +#ifdef WIN32 + +// Here, we also allow Windows paths to be represented as "/C:/" so we can be +// consistent about URL paths beginning with slashes. 
This function is like +// DoesBeginWindowsDrivePath except that it also requires a slash at the +// beginning. +template<typename CHAR> +bool DoesBeginSlashWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + if (start_offset >= spec_len) + return false; + return url_parse::IsURLSlash(spec[start_offset]) && + url_parse::DoesBeginWindowsDriveSpec(spec, start_offset + 1, spec_len); +} + +#endif // WIN32 + +// See IsRelativeURL in the header file for usage. +template<typename CHAR> +bool DoIsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const CHAR* url, + int url_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component) { + *is_relative = false; // So we can default later to not relative. + + // Trim whitespace and construct a new range for the substring. + int begin = 0; + url_parse::TrimURL(url, &begin, &url_len); + if (begin >= url_len) { + // Empty URLs are relative, but do nothing. + *relative_component = url_parse::Component(begin, 0); + *is_relative = true; + return true; + } + +#ifdef WIN32 + // We special case paths like "C:\foo" so they can link directly to the + // file on Windows (IE compatability). The security domain stuff should + // prevent a link like this from actually being followed if its on a + // web page. + // + // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/" + // as relative, as this will just replace the path when the base scheme + // is a file and the answer will still be correct. + // + // We require strict backslashes when detecting UNC since two forward + // shashes should be treated a a relative URL with a hostname. + if (url_parse::DoesBeginWindowsDriveSpec(url, begin, url_len) || + url_parse::DoesBeginUNCPath(url, begin, url_len, true)) + return true; +#endif // WIN32 + + // See if we've got a scheme, if not, we know this is a relative URL. + // BUT: Just because we have a scheme, doesn't make it absolute. 
+ // "http:foo.html" is a relative URL with path "foo.html". If the scheme is + // empty, we treat it as relative (":foo") like IE does. + url_parse::Component scheme; + if (!url_parse::ExtractScheme(url, url_len, &scheme) || scheme.len == 0) { + // Don't allow relative URLs if the base scheme doesn't support it. + if (!is_base_hierarchical) + return false; + + *relative_component = url_parse::MakeRange(begin, url_len); + *is_relative = true; + return true; + } + + // If the scheme isn't valid, then it's relative. + int scheme_end = scheme.end(); + for (int i = scheme.begin; i < scheme_end; i++) { + if (!CanonicalSchemeChar(url[i])) { + *relative_component = url_parse::MakeRange(begin, url_len); + *is_relative = true; + return true; + } + } + + // If the scheme is not the same, then we can't count it as relative. + if (!AreSchemesEqual(base, base_parsed.scheme, url, scheme)) + return true; + + // When the scheme that they both share is not hierarchical, treat the + // incoming scheme as absolute (this way with the base of "data:foo", + // "data:bar" will be reported as absolute. + if (!is_base_hierarchical) + return true; + + // ExtractScheme guarantees that the colon immediately follows what it + // considers to be the scheme. CountConsecutiveSlashes will handle the + // case where the begin offset is the end of the input. + int colon_offset = scheme.end(); + int num_slashes = url_parse::CountConsecutiveSlashes(url, colon_offset + 1, + url_len); + + if (num_slashes == 0 || num_slashes == 1) { + // No slashes means it's a relative path like "http:foo.html". One slash + // is an absolute path. "http:/home/foo.html" + *is_relative = true; + *relative_component = url_parse::MakeRange(colon_offset + 1, url_len); + return true; + } + + // Two or more slashes after the scheme we treat as absolute. + return true; +} + +// Copies all characters in the range [begin, end) of |spec| to the output, +// up until and including the last slash. 
There should be a slash in the +// range, if not, nothing will be copied. +// +// The input is assumed to be canonical, so we search only for exact slashes +// and not backslashes as well. We also know that it's ASCII. +void CopyToLastSlash(const char* spec, + int begin, + int end, + CanonOutput* output) { + // Find the last slash. + int last_slash = -1; + for (int i = end - 1; i >= begin; i--) { + if (spec[i] == '/') { + last_slash = i; + break; + } + } + if (last_slash < 0) + return; // No slash. + + // Copy. + for (int i = begin; i <= last_slash; i++) + output->push_back(spec[i]); +} + +// Copies a single component from the source to the output. This is used +// when resolving relative URLs and a given component is unchanged. Since the +// source should already be canonical, we don't have to do anything special, +// and the input is ASCII. +void CopyOneComponent(const char* source, + const url_parse::Component& source_component, + CanonOutput* output, + url_parse::Component* output_component) { + if (source_component.len < 0) { + // This component is not present. + *output_component = url_parse::Component(); + return; + } + + output_component->begin = output->length(); + int source_end = source_component.end(); + for (int i = source_component.begin; i < source_end; i++) + output->push_back(source[i]); + output_component->len = output->length() - output_component->begin; +} + +#ifdef WIN32 + +// Called on Windows when the base URL is a file URL, this will copy the "C:" +// to the output, if there is a drive letter and if that drive letter is not +// being overridden by the relative URL. Otherwise, do nothing. +// +// It will return the index of the beginning of the next character in the +// base to be processed: if there is a "C:", the slash after it, or if +// there is no drive letter, the slash at the beginning of the path, or +// the end of the base. This can be used as the starting offset for further +// path processing. 
+template<typename CHAR> +int CopyBaseDriveSpecIfNecessary(const char* base_url, + int base_path_begin, + int base_path_end, + const CHAR* relative_url, + int path_start, + int relative_url_len, + CanonOutput* output) { + if (base_path_begin >= base_path_end) + return base_path_begin; // No path. + + // If the relative begins with a drive spec, don't do anything. The existing + // drive spec in the base will be replaced. + if (url_parse::DoesBeginWindowsDriveSpec(relative_url, + path_start, relative_url_len)) { + return base_path_begin; // Relative URL path is "C:/foo" + } + + // The path should begin with a slash (as all canonical paths do). We check + // if it is followed by a drive letter and copy it. + if (DoesBeginSlashWindowsDriveSpec(base_url, + base_path_begin, + base_path_end)) { + // Copy the two-character drive spec to the output. It will now look like + // "file:///C:" so the rest of it can be treated like a standard path. + output->push_back('/'); + output->push_back(base_url[base_path_begin + 1]); + output->push_back(base_url[base_path_begin + 2]); + return base_path_begin + 3; + } + + return base_path_begin; +} + +#endif // WIN32 + +// A subroutine of DoResolveRelativeURL, this resolves the URL knowning that +// the input is a relative path or less (qyuery or ref). +template<typename CHAR> +bool DoResolveRelativePath(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + bool success = true; + + // We know the authority section didn't change, copy it to the output. We + // also know we have a path so can copy up to there. + url_parse::Component path, query, ref; + url_parse::ParsePathInternal(relative_url, + relative_component, + &path, + &query, + &ref); + // Canonical URLs always have a path, so we can use that offset. 
+ output->Append(base_url, base_parsed.path.begin); + + if (path.len > 0) { + // The path is replaced or modified. + int true_path_begin = output->length(); + + // For file: URLs on Windows, we don't want to treat the drive letter and + // colon as part of the path for relative file resolution when the + // incoming URL does not provide a drive spec. We save the true path + // beginning so we can fix it up after we are done. + int base_path_begin = base_parsed.path.begin; +#ifdef WIN32 + if (base_is_file) { + base_path_begin = CopyBaseDriveSpecIfNecessary( + base_url, base_parsed.path.begin, base_parsed.path.end(), + relative_url, relative_component.begin, relative_component.end(), + output); + // Now the output looks like either "file://" or "file:///C:" + // and we can start appending the rest of the path. |base_path_begin| + // points to the character in the base that comes next. + } +#endif // WIN32 + + if (url_parse::IsURLSlash(relative_url[path.begin])) { + // Easy case: the path is an absolute path on the server, so we can + // just replace everything from the path on with the new versions. + // Since the input should be canonical hierarchical URL, we should + // always have a path. + success &= CanonicalizePath(relative_url, path, + output, &out_parsed->path); + } else { + // Relative path, replace the query, and reference. We take the + // original path with the file part stripped, and append the new path. + // The canonicalizer will take care of resolving ".." and "." + int path_begin = output->length(); + CopyToLastSlash(base_url, base_path_begin, base_parsed.path.end(), + output); + success &= CanonicalizePartialPath(relative_url, path, path_begin, + output); + out_parsed->path = url_parse::MakeRange(path_begin, output->length()); + + // Copy the rest of the stuff after the path from the relative path. + } + + // Finish with the query and reference part (these can't fail). 
+ CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + + // Fix the path beginning to add back the "C:" we may have written above. + out_parsed->path = url_parse::MakeRange(true_path_begin, + out_parsed->path.end()); + return success; + } + + // If we get here, the path is unchanged: copy to output. + CopyOneComponent(base_url, base_parsed.path, output, &out_parsed->path); + + if (query.is_valid()) { + // Just the query specified, replace the query and reference (ignore + // failures for refs) + CanonicalizeQuery(relative_url, query, query_converter, + output, &out_parsed->query); + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // If we get here, the query is unchanged: copy to output. Note that the + // range of the query parameter doesn't include the question mark, so we + // have to add it manually if there is a component. + if (base_parsed.query.is_valid()) + output->push_back('?'); + CopyOneComponent(base_url, base_parsed.query, output, &out_parsed->query); + + if (ref.is_valid()) { + // Just the reference specified: replace it (ignoring failures). + CanonicalizeRef(relative_url, ref, output, &out_parsed->ref); + return success; + } + + // We should always have something to do in this function, the caller checks + // that some component is being replaced. + DCHECK(false) << "Not reached"; + return success; +} + +// Resolves a relative URL that contains a host. Typically, these will +// be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which +// should be kept from the original URL is the scheme. 
+template<typename CHAR> +bool DoResolveRelativeHost(const char* base_url, + const url_parse::Parsed& base_parsed, + const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Parse the relative URL, just like we would for anything following a + // scheme. + url_parse::Parsed relative_parsed; // Everything but the scheme is valid. + url_parse::ParseAfterScheme(&relative_url[relative_component.begin], + relative_component.len, relative_component.begin, + &relative_parsed); + + // Now we can just use the replacement function to replace all the necessary + // parts of the old URL with the new one. + Replacements<CHAR> replacements; + replacements.SetUsername(relative_url, relative_parsed.username); + replacements.SetPassword(relative_url, relative_parsed.password); + replacements.SetHost(relative_url, relative_parsed.host); + replacements.SetPort(relative_url, relative_parsed.port); + replacements.SetPath(relative_url, relative_parsed.path); + replacements.SetQuery(relative_url, relative_parsed.query); + replacements.SetRef(relative_url, relative_parsed.ref); + + return ReplaceStandardURL(base_url, base_parsed, replacements, + query_converter, output, out_parsed); +} + +// Resolves a relative URL that happens to be an absolute file path. Examples +// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo". +template<typename CHAR> +bool DoResolveAbsoluteFile(const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Parse the file URL. The file URl parsing function uses the same logic + // as we do for determining if the file is absolute, in which case it will + // not bother to look for a scheme. 
+ url_parse::Parsed relative_parsed; + url_parse::ParseFileURL(&relative_url[relative_component.begin], + relative_component.len, &relative_parsed); + + return CanonicalizeFileURL(&relative_url[relative_component.begin], + relative_component.len, relative_parsed, + query_converter, output, out_parsed); +} + +// TODO(brettw) treat two slashes as root like Mozilla for FTP? +template<typename CHAR> +bool DoResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const CHAR* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Starting point for our output parsed. We'll fix what we change. + *out_parsed = base_parsed; + + // Sanity check: the input should have a host or we'll break badly below. + // We can only resolve relative URLs with base URLs that have hosts and + // paths (even the default path of "/" is OK). + // + // We allow hosts with no length so we can handle file URLs, for example. + if (base_parsed.path.len <= 0) { + // On error, return the input (resolving a relative URL on a non-relative + // base = the base). + int base_len = base_parsed.Length(); + for (int i = 0; i < base_len; i++) + output->push_back(base_url[i]); + return false; + } + + if (relative_component.len <= 0) { + // Empty relative URL, make no changes. + int base_len = base_parsed.Length(); + for (int i = 0; i < base_len; i++) + output->push_back(base_url[i]); + return true; + } + + int num_slashes = url_parse::CountConsecutiveSlashes( + relative_url, relative_component.begin, relative_component.end()); + +#ifdef WIN32 + // On Windows, two slashes for a file path (regardless of which direction + // they are) means that it's UNC. Two backslashes on any base scheme mean + // that it's an absolute UNC path (we use the base_is_file flag to control + // how strict the UNC finder is). 
+ // + // We also allow Windows absolute drive specs on any scheme (for example + // "c:\foo") like IE does. There must be no preceeding slashes in this + // case (we reject anything like "/c:/foo") because that should be treated + // as a path. For file URLs, we allow any number of slashes since that would + // be setting the path. + // + // This assumes the absolute path resolver handles absolute URLs like this + // properly. url_util::DoCanonicalize does this. + int after_slashes = relative_component.begin + num_slashes; + if (url_parse::DoesBeginUNCPath(relative_url, relative_component.begin, + relative_component.end(), !base_is_file) || + ((num_slashes == 0 || base_is_file) && + url_parse::DoesBeginWindowsDriveSpec(relative_url, after_slashes, + relative_component.end()))) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#else + // Other platforms need explicit handling for file: URLs with multiple + // slashes because the generic scheme parsing always extracts a host, but a + // file: URL only has a host if it has exactly 2 slashes. This also + // handles the special case where the URL is only slashes, since that + // doesn't have a host part either. + if (base_is_file && + (num_slashes > 2 || num_slashes == relative_component.len)) { + return DoResolveAbsoluteFile(relative_url, relative_component, + query_converter, output, out_parsed); + } +#endif + + // Any other double-slashes mean that this is relative to the scheme. + if (num_slashes >= 2) { + return DoResolveRelativeHost(base_url, base_parsed, + relative_url, relative_component, + query_converter, output, out_parsed); + } + + // When we get here, we know that the relative URL is on the same host. 
+ return DoResolveRelativePath(base_url, base_parsed, base_is_file, + relative_url, relative_component, + query_converter, output, out_parsed); +} + +} // namespace + +bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component) { + return DoIsRelativeURL<char>( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool IsRelativeURL(const char* base, + const url_parse::Parsed& base_parsed, + const char16* fragment, + int fragment_len, + bool is_base_hierarchical, + bool* is_relative, + url_parse::Component* relative_component) { + return DoIsRelativeURL<char16>( + base, base_parsed, fragment, fragment_len, is_base_hierarchical, + is_relative, relative_component); +} + +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + return DoResolveRelativeURL<char>( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +bool ResolveRelativeURL(const char* base_url, + const url_parse::Parsed& base_parsed, + bool base_is_file, + const char16* relative_url, + const url_parse::Component& relative_component, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* out_parsed) { + return DoResolveRelativeURL<char16>( + base_url, base_parsed, base_is_file, relative_url, + relative_component, query_converter, output, out_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_stdstring.h b/googleurl/src/url_canon_stdstring.h new file mode 100644 index 0000000..2241eb1 --- /dev/null +++ b/googleurl/src/url_canon_stdstring.h @@ -0,0 +1,133 @@ +// 
Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This header file defines a canonicalizer output method class for STL +// strings. Because the canonicalizer tries not to be dependent on the STL, +// we have segregated it here. + +#ifndef GOOGLEURL_SRC_URL_CANON_STRING_H__ +#define GOOGLEURL_SRC_URL_CANON_STRING_H__ + +#include <string> +#include "googleurl/src/url_canon.h" + +namespace url_canon { + +// Write into a std::string given in the constructor. 
This object odes not own +// the string itself, and the user must ensure that the string stays alive +// throughout the lifetime of this object. +// +// The given string will be appended to; any existing data in the string will +// be preserved. The caller should reserve() the amount of data in the string +// they expect to be written. We will resize if necessary, but that's slow. +// +// Note that when canonicalization is complete, the string will likely have +// unused space at the end because we make the string very big to start out +// with (by |initial_size|). This ends up being important because resize +// operations are slow, and because the base class needs to write directly +// into the buffer. +// +// Therefore, the user should call Complete() before using the string that +// this class wrote into. +class StdStringCanonOutput : public CanonOutput { + public: + StdStringCanonOutput(std::string* str) + : CanonOutput(), + str_(str) { + cur_len_ = static_cast<int>(str_->size()); // Append to existing data. + str_->resize(str_->capacity()); + buffer_ = &(*str_)[0]; + buffer_len_ = static_cast<int>(str_->size()); + } + virtual ~StdStringCanonOutput() { + // Nothing to do, we don't own the string. + } + + // Must be called after writing has completed but before the string is used. + void Complete() { + str_->resize(cur_len_); + buffer_len_ = cur_len_; + } + + virtual void Resize(int sz) { + str_->resize(sz); + buffer_ = &(*str_)[0]; + buffer_len_ = sz; + } + + protected: + std::string* str_; +}; + +// An extension of the Replacements class that allows the setters to use +// standard strings. +// +// The strings passed as arguments are not copied and must remain valid until +// this class goes out of scope. 
+template<typename STR> +class StdStringReplacements : + public url_canon::Replacements<typename STR::value_type> { + public: + void SetSchemeStr(const STR& s) { + this->SetScheme(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetUsernameStr(const STR& s) { + this->SetUsername(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPasswordStr(const STR& s) { + this->SetPassword(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetHostStr(const STR& s) { + this->SetHost(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPortStr(const STR& s) { + this->SetPort(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetPathStr(const STR& s) { + this->SetPath(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetQueryStr(const STR& s) { + this->SetQuery(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } + void SetRefStr(const STR& s) { + this->SetRef(s.data(), + url_parse::Component(0, static_cast<int>(s.length()))); + } +}; + +} // namespace url_canon + +#endif // GOOGLEURL_SRC_URL_CANON_STRING_H__ diff --git a/googleurl/src/url_canon_stdurl.cc b/googleurl/src/url_canon_stdurl.cc new file mode 100644 index 0000000..41a8fa9 --- /dev/null +++ b/googleurl/src/url_canon_stdurl.cc @@ -0,0 +1,202 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Functions to canonicalize "standard" URLs, which are ones that have an +// authority section including a host name. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_internal.h" + +namespace url_canon { + +namespace { + +template<typename CHAR, typename UCHAR> +bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + // Scheme: this will append the colon. + bool success = CanonicalizeScheme(source.scheme, parsed.scheme, + output, &new_parsed->scheme); + + // Authority (username, password, host, port) + bool have_authority; + if (parsed.username.is_valid() || parsed.password.is_valid() || + parsed.host.is_nonempty() || parsed.port.is_valid()) { + have_authority = true; + + // Only write the authority separators when we have a scheme. 
+ if (parsed.scheme.is_valid()) { + output->push_back('/'); + output->push_back('/'); + } + + // User info: the canonicalizer will handle the : and @. + success &= CanonicalizeUserInfo(source.username, parsed.username, + source.password, parsed.password, + output, + &new_parsed->username, + &new_parsed->password); + + success &= CanonicalizeHost(source.host, parsed.host, + output, &new_parsed->host); + + // Host must not be empty for standard URLs. + if (!parsed.host.is_nonempty()) + success = false; + + // Port: the port canonicalizer will handle the colon. + int default_port = DefaultPortForScheme( + &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len); + success &= CanonicalizePort(source.port, parsed.port, default_port, + output, &new_parsed->port); + } else { + // No authority, clear the components. + have_authority = false; + new_parsed->host.reset(); + new_parsed->username.reset(); + new_parsed->password.reset(); + new_parsed->port.reset(); + success = false; // Standard URLs must have an authority. + } + + // Path + if (parsed.path.is_valid()) { + success &= CanonicalizePath(source.path, parsed.path, + output, &new_parsed->path); + } else if (have_authority || + parsed.query.is_valid() || parsed.ref.is_valid()) { + // When we have an empty path, make up a path when we have an authority + // or something following the path. The only time we allow an empty + // output path is when there is nothing else. + new_parsed->path = url_parse::Component(output->length(), 1); + output->push_back('/'); + } else { + // No path at all + new_parsed->path.reset(); + } + + // Query + CanonicalizeQuery(source.query, parsed.query, query_converter, + output, &new_parsed->query); + + // Ref: ignore failure for this, since the page can probably still be loaded. 
+ CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref); + + return success; +} + +} // namespace + + +// Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED +// if the scheme is unknown. +int DefaultPortForScheme(const char* scheme, int scheme_len) { + int default_port = url_parse::PORT_UNSPECIFIED; + switch (scheme_len) { + case 4: + if (!strncmp(scheme, "http", scheme_len)) + default_port = 80; + break; + case 5: + if (!strncmp(scheme, "https", scheme_len)) + default_port = 443; + break; + case 3: + if (!strncmp(scheme, "ftp", scheme_len)) + default_port = 21; + else if (!strncmp(scheme, "wss", scheme_len)) + default_port = 443; + break; + case 6: + if (!strncmp(scheme, "gopher", scheme_len)) + default_port = 70; + break; + case 2: + if (!strncmp(scheme, "ws", scheme_len)) + default_port = 80; + break; + } + return default_port; +} + +bool CanonicalizeStandardURL(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeStandardURL<char, unsigned char>( + URLComponentSource<char>(spec), parsed, query_converter, + output, new_parsed); +} + +bool CanonicalizeStandardURL(const char16* spec, + int spec_len, + const url_parse::Parsed& parsed, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + return DoCanonicalizeStandardURL<char16, char16>( + URLComponentSource<char16>(spec), parsed, query_converter, + output, new_parsed); +} + +bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupOverrideComponents(base, replacements, &source, &parsed); + return DoCanonicalizeStandardURL<char, unsigned char>( + source, 
parsed, query_converter, output, new_parsed); +} + +// For 16-bit replacements, we turn all the replacements into UTF-8 so the +// regular codepath can be used. +bool ReplaceStandardURL(const char* base, + const url_parse::Parsed& base_parsed, + const Replacements<char16>& replacements, + CharsetConverter* query_converter, + CanonOutput* output, + url_parse::Parsed* new_parsed) { + RawCanonOutput<1024> utf8; + URLComponentSource<char> source(base); + url_parse::Parsed parsed(base_parsed); + SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed); + return DoCanonicalizeStandardURL<char, unsigned char>( + source, parsed, query_converter, output, new_parsed); +} + +} // namespace url_canon diff --git a/googleurl/src/url_canon_unittest.cc b/googleurl/src/url_canon_unittest.cc new file mode 100644 index 0000000..c5be423 --- /dev/null +++ b/googleurl/src/url_canon_unittest.cc @@ -0,0 +1,1936 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <errno.h> +#include <unicode/ucnv.h> + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_icu.h" +#include "googleurl/src/url_canon_internal.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_parse.h" +#include "googleurl/src/url_test_utils.h" +#include "testing/gtest/include/gtest/gtest.h" + +// Some implementations of base/basictypes.h may define ARRAYSIZE. +// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro +// which is in our version of basictypes.h. +#ifndef ARRAYSIZE +#define ARRAYSIZE ARRAYSIZE_UNSAFE +#endif + +using url_test_utils::WStringToUTF16; +using url_test_utils::ConvertUTF8ToUTF16; +using url_test_utils::ConvertUTF16ToUTF8; +using url_canon::CanonHostInfo; + +namespace { + +struct ComponentCase { + const char* input; + const char* expected; + url_parse::Component expected_component; + bool expected_success; +}; + +// ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests +// treat each input as optional, and will only try processing if non-NULL. +// The output is always 8-bit. +struct DualComponentCase { + const char* input8; + const wchar_t* input16; + const char* expected; + url_parse::Component expected_component; + bool expected_success; +}; + +// Test cases for CanonicalizeIPAddress(). The inputs are identical to +// DualComponentCase, but the output has extra CanonHostInfo fields. 
+struct IPAddressCase { + const char* input8; + const wchar_t* input16; + const char* expected; + url_parse::Component expected_component; + + // CanonHostInfo fields, for verbose output. + CanonHostInfo::Family expected_family; + int expected_num_ipv4_components; +}; + +struct ReplaceCase { + const char* base; + const char* scheme; + const char* username; + const char* password; + const char* host; + const char* port; + const char* path; + const char* query; + const char* ref; + const char* expected; +}; + +// Wrapper around a UConverter object that managers creation and destruction. +class UConvScoper { + public: + explicit UConvScoper(const char* charset_name) { + UErrorCode err = U_ZERO_ERROR; + converter_ = ucnv_open(charset_name, &err); + } + + ~UConvScoper() { + if (converter_) + ucnv_close(converter_); + } + + // Returns the converter object, may be NULL. + UConverter* converter() const { return converter_; } + + private: + UConverter* converter_; +}; + +// Magic string used in the replacements code that tells SetupReplComp to +// call the clear function. +const char kDeleteComp[] = "|"; + +// Sets up a replacement for a single component. This is given pointers to +// the set and clear function for the component being replaced, and will +// either set the component (if it exists) or clear it (if the replacement +// string matches kDeleteComp). +// +// This template is currently used only for the 8-bit case, and the strlen +// causes it to fail in other cases. It is left a template in case we have +// tests for wide replacements. 
+template<typename CHAR> +void SetupReplComp( + void (url_canon::Replacements<CHAR>::*set)(const CHAR*, + const url_parse::Component&), + void (url_canon::Replacements<CHAR>::*clear)(), + url_canon::Replacements<CHAR>* rep, + const CHAR* str) { + if (str && str[0] == kDeleteComp[0]) { + (rep->*clear)(); + } else if (str) { + (rep->*set)(str, url_parse::Component(0, static_cast<int>(strlen(str)))); + } +} + +} // namespace + +TEST(URLCanonTest, UTF) { + // Low-level test that we handle reading, canonicalization, and writing + // UTF-8/UTF-16 strings properly. + struct UTFCase { + const char* input8; + const wchar_t* input16; + bool expected_success; + const char* output; + } utf_cases[] = { + // Valid canonical input should get passed through & escaped. + {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"}, + // Test a characer that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"}, + // Non-shortest-form UTF-8 are invalid. The bad char should be replaced + // with the invalid character (EF BF DB in UTF-8). + {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"}, + // Invalid UTF-8 sequences should be marked as invalid (the first + // sequence is truncated). + {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"}, + // Character going off the end. + {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"}, + // ...same with low surrogates with no high surrogate. + {"\xed\xb0\x80", L"\xdc00", false, "%EF%BF%BD"}, + // Test a UTF-8 encoded surrogate value is marked as invalid. 
+ // ED A0 80 = U+D800 + {"\xed\xa0\x80", NULL, false, "%EF%BF%BD"}, + }; + + std::string out_str; + for (size_t i = 0; i < ARRAYSIZE(utf_cases); i++) { + if (utf_cases[i].input8) { + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + int input_len = static_cast<int>(strlen(utf_cases[i].input8)); + bool success = true; + for (int ch = 0; ch < input_len; ch++) { + success &= AppendUTF8EscapedChar(utf_cases[i].input8, &ch, input_len, + &output); + } + output.Complete(); + EXPECT_EQ(utf_cases[i].expected_success, success); + EXPECT_EQ(std::string(utf_cases[i].output), out_str); + } + if (utf_cases[i].input16) { + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + string16 input_str(WStringToUTF16(utf_cases[i].input16)); + int input_len = static_cast<int>(input_str.length()); + bool success = true; + for (int ch = 0; ch < input_len; ch++) { + success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len, + &output); + } + output.Complete(); + EXPECT_EQ(utf_cases[i].expected_success, success); + EXPECT_EQ(std::string(utf_cases[i].output), out_str); + } + + if (utf_cases[i].input8 && utf_cases[i].input16 && + utf_cases[i].expected_success) { + // Check that the UTF-8 and UTF-16 inputs are equivalent. + + // UTF-16 -> UTF-8 + std::string input8_str(utf_cases[i].input8); + string16 input16_str(WStringToUTF16(utf_cases[i].input16)); + EXPECT_EQ(input8_str, ConvertUTF16ToUTF8(input16_str)); + + // UTF-8 -> UTF-16 + EXPECT_EQ(input16_str, ConvertUTF8ToUTF16(input8_str)); + } + } +} + +TEST(URLCanonTest, ICUCharsetConverter) { + struct ICUCase { + const wchar_t* input; + const char* encoding; + const char* expected; + } icu_cases[] = { + // UTF-8. + {L"Hello, world", "utf-8", "Hello, world"}, + {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"}, + // Non-BMP UTF-8. 
+ {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"}, + // Big5 + {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"}, + // Unrepresentable character in the destination set. + {L"hello\x4f60\x06de\x597dworld", "big5", "hello\xa7\x41%26%231758%3B\xa6\x6eworld"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(icu_cases); i++) { + UConvScoper conv(icu_cases[i].encoding); + ASSERT_TRUE(conv.converter() != NULL); + url_canon::ICUCharsetConverter converter(conv.converter()); + + std::string str; + url_canon::StdStringCanonOutput output(&str); + + string16 input_str(WStringToUTF16(icu_cases[i].input)); + int input_len = static_cast<int>(input_str.length()); + converter.ConvertFromUTF16(input_str.c_str(), input_len, &output); + output.Complete(); + + EXPECT_STREQ(icu_cases[i].expected, str.c_str()); + } + + // Test string sizes around the resize boundary for the output to make sure + // the converter resizes as needed. + const int static_size = 16; + UConvScoper conv("utf-8"); + ASSERT_TRUE(conv.converter()); + url_canon::ICUCharsetConverter converter(conv.converter()); + for (int i = static_size - 2; i <= static_size + 2; i++) { + // Make a string with the appropriate length. + string16 input; + for (int ch = 0; ch < i; ch++) + input.push_back('a'); + + url_canon::RawCanonOutput<static_size> output; + converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()), + &output); + EXPECT_EQ(input.length(), static_cast<size_t>(output.length())); + } +} + +TEST(URLCanonTest, Scheme) { + // Here, we're mostly testing that unusual characters are handled properly. + // The canonicalizer doesn't do any parsing or whitespace detection. It will + // also do its best on error, and will escape funny sequences (these won't be + // valid schemes and it will return error). + // + // Note that the canonicalizer will append a colon to the output to separate + // out the rest of the URL, which is not present in the input. 
We check, + // however, that the output range includes everything but the colon. + ComponentCase scheme_cases[] = { + {"http", "http:", url_parse::Component(0, 4), true}, + {"HTTP", "http:", url_parse::Component(0, 4), true}, + {" HTTP ", "%20http%20:", url_parse::Component(0, 10), false}, + {"htt: ", "htt%3A%20:", url_parse::Component(0, 9), false}, + {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", url_parse::Component(0, 22), false}, + // Don't re-escape something already escaped. Note that it will + // "canonicalize" the 'A' to 'a', but that's OK. + {"ht%3Atp", "ht%3atp:", url_parse::Component(0, 7), false}, + }; + + std::string out_str; + + for (size_t i = 0; i < arraysize(scheme_cases); i++) { + int url_len = static_cast<int>(strlen(scheme_cases[i].input)); + url_parse::Component in_comp(0, url_len); + url_parse::Component out_comp; + + out_str.clear(); + url_canon::StdStringCanonOutput output1(&out_str); + bool success = url_canon::CanonicalizeScheme(scheme_cases[i].input, + in_comp, &output1, &out_comp); + output1.Complete(); + + EXPECT_EQ(scheme_cases[i].expected_success, success); + EXPECT_EQ(std::string(scheme_cases[i].expected), out_str); + EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version + out_str.clear(); + url_canon::StdStringCanonOutput output2(&out_str); + + string16 wide_input(ConvertUTF8ToUTF16(scheme_cases[i].input)); + in_comp.len = static_cast<int>(wide_input.length()); + success = url_canon::CanonicalizeScheme(wide_input.c_str(), in_comp, + &output2, &out_comp); + output2.Complete(); + + EXPECT_EQ(scheme_cases[i].expected_success, success); + EXPECT_EQ(std::string(scheme_cases[i].expected), out_str); + EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); + } + + // Test the case where the scheme is declared nonexistant, it should be 
+ // converted into an empty scheme. + url_parse::Component out_comp; + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + EXPECT_TRUE(url_canon::CanonicalizeScheme("", url_parse::Component(0, -1), + &output, &out_comp)); + output.Complete(); + + EXPECT_EQ(std::string(":"), out_str); + EXPECT_EQ(0, out_comp.begin); + EXPECT_EQ(0, out_comp.len); +} + +TEST(URLCanonTest, Host) { + IPAddressCase host_cases[] = { + // Basic canonicalization, uppercase should be converted to lowercase. + {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1}, + // Spaces and some other characters should be escaped. + {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com", url_parse::Component(0, 22), CanonHostInfo::NEUTRAL, -1}, + // Exciting different types of spaces! + {NULL, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", url_parse::Component(0, 16), CanonHostInfo::NEUTRAL, -1}, + // Other types of space (no-break, zero-width, zero-width-no-break) are + // name-prepped away to nothing. + {NULL, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", url_parse::Component(0, 10), CanonHostInfo::NEUTRAL, -1}, + // Ideographic full stop (full-width period for Chinese, etc.) should be + // treated as a dot. + {NULL, L"www.foo\x3002"L"bar.com", "www.foo.bar.com", url_parse::Component(0, 15), CanonHostInfo::NEUTRAL, -1}, + // Invalid unicode characters should fail... + // ...In wide input, ICU will barf and we'll end up with the input as + // escaped UTF-8 (the invalid character should be replaced with the + // replacement character). + {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1}, + // ...This is the same as previous but with with escaped. 
+ {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com", url_parse::Component(0, 16), CanonHostInfo::BROKEN, -1}, + // Test name prepping, fullwidth input should be converted to ASCII and NOT + // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16. + {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com", url_parse::Component(0, 6), CanonHostInfo::NEUTRAL, -1}, + // Test that fullwidth escaped values are properly name-prepped, + // then converted or rejected. + // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1}, + {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com", "a.com", url_parse::Component(0, 5), CanonHostInfo::NEUTRAL, -1}, + // ...%00 in fullwidth should fail (also as escaped UTF-8 input) + {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1}, + {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", url_parse::Component(0, 7), CanonHostInfo::BROKEN, -1}, + // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN + {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1}, + // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped + // UTF-8 (wide case). The output should be equivalent to the true wide + // character input above). + {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd", L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", url_parse::Component(0, 14), CanonHostInfo::NEUTRAL, -1}, + // Invalid escaped characters should fail and the percents should be + // escaped. + {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", url_parse::Component(0, 10), CanonHostInfo::BROKEN, -1}, + // If we get an invalid character that has been escaped. 
+ {"%25", L"%25", "%25", url_parse::Component(0, 3), CanonHostInfo::BROKEN, -1}, + {"hello%00", L"hello%00", "hello%00", url_parse::Component(0, 8), CanonHostInfo::BROKEN, -1}, + // Escaped numbers should be treated like IP addresses if they are. + {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + {"%30%78%63%30%2e%30%32%35%30.01%2e", L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + // Invalid escaping should trigger the regular host error handling. + {"%3g%78%63%30%2e%30%32%35%30%2E.01", L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01", url_parse::Component(0, 17), CanonHostInfo::BROKEN, -1}, + // Something that isn't exactly an IP should get treated as a host and + // spaces escaped. + {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello", url_parse::Component(0, 19), CanonHostInfo::NEUTRAL, -1}, + // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP. + // These are "0Xc0.0250.01" in fullwidth. + {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%8E\xef\xbc\x90\xef\xbc\x91", L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10\xff11", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + // Broken IP addresses get marked as such. + {"192.168.0.257", L"192.168.0.257", "192.168.0.257", url_parse::Component(0, 13), CanonHostInfo::BROKEN, -1}, + {"[google.com]", L"[google.com]", "[google.com]", url_parse::Component(0, 12), CanonHostInfo::BROKEN, -1}, + // Cyrillic letter followed buy ( should return punicode for ( escaped before punicode string was created. I.e. + // if ( is escaped after punicode is created we would get xn--%28-8tb (incorrect). + {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", url_parse::Component(0, 11), CanonHostInfo::NEUTRAL, -1}, + }; + + // CanonicalizeHost() non-verbose. 
+ std::string out_str; + for (size_t i = 0; i < arraysize(host_cases); i++) { + // Narrow version. + if (host_cases[i].input8) { + int host_len = static_cast<int>(strlen(host_cases[i].input8)); + url_parse::Component in_comp(0, host_len); + url_parse::Component out_comp; + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + bool success = url_canon::CanonicalizeHost(host_cases[i].input8, in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN, + success); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len); + } + + // Wide version. + if (host_cases[i].input16) { + string16 input16(WStringToUTF16(host_cases[i].input16)); + int host_len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, host_len); + url_parse::Component out_comp; + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + + bool success = url_canon::CanonicalizeHost(input16.c_str(), in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family != CanonHostInfo::BROKEN, + success); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(host_cases[i].expected_component.len, out_comp.len); + } + } + + // CanonicalizeHostVerbose() + for (size_t i = 0; i < arraysize(host_cases); i++) { + // Narrow version. 
+ if (host_cases[i].input8) { + int host_len = static_cast<int>(strlen(host_cases[i].input8)); + url_parse::Component in_comp(0, host_len); + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + CanonHostInfo host_info; + + url_canon::CanonicalizeHostVerbose(host_cases[i].input8, in_comp, + &output, &host_info); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len); + if (host_cases[i].expected_family == CanonHostInfo::IPV4) { + EXPECT_EQ(host_cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } + + // Wide version. + if (host_cases[i].input16) { + string16 input16(WStringToUTF16(host_cases[i].input16)); + int host_len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, host_len); + + out_str.clear(); + url_canon::StdStringCanonOutput output(&out_str); + CanonHostInfo host_info; + + url_canon::CanonicalizeHostVerbose(input16.c_str(), in_comp, + &output, &host_info); + output.Complete(); + + EXPECT_EQ(host_cases[i].expected_family, host_info.family); + EXPECT_EQ(std::string(host_cases[i].expected), out_str); + EXPECT_EQ(host_cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(host_cases[i].expected_component.len, host_info.out_host.len); + if (host_cases[i].expected_family == CanonHostInfo::IPV4) { + EXPECT_EQ(host_cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } + } +} + +TEST(URLCanonTest, IPv4) { + IPAddressCase cases[] = { + // Empty is not an IP address. + {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {".", L".", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Regular IP addresses in different bases. 
+ {"192.168.0.1", L"192.168.0.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + // Non-IP addresses due to invalid characters. + {"192.168.9.com", L"192.168.9.com", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Invalid characters for the base should be rejected. + {"19a.168.0.1", L"19a.168.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"0308.0250.00.01", L"0308.0250.00.01", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // If there are not enough components, the last one should fill them out. + {"192", L"192", "0.0.0.192", url_parse::Component(0, 9), CanonHostInfo::IPV4, 1}, + {"0xC0a80001", L"0xC0a80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1}, + {"030052000001", L"030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1}, + {"000030052000001", L"000030052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 1}, + {"192.168", L"192.168", "192.0.0.168", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2}, + {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2}, + {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 2}, + {"192.168.1", L"192.168.1", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + // Too many components means not an IP address. + {"192.168.0.0.1", L"192.168.0.0.1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // We allow a single trailing dot. 
+ {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 4}, + {"192.168.0.1. hello", L"192.168.0.1. hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"192.168.0.1..", L"192.168.0.1..", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Two dots in a row means not an IP address. + {"192.168..1", L"192.168..1", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Any numerical overflow should be marked as BROKEN. + {"0x100.0", L"0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0x100.0.0", L"0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0x100.0.0.0", L"0x100.0.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0x100.0.0", L"0.0x100.0.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0.0x100.0", L"0.0.0x100.0", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0.0.0x100", L"0.0.0.0x100", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0.0x10000", L"0.0.0x10000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0.0x1000000", L"0.0x1000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0x100000000", L"0x100000000", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Repeat the previous tests, minus 1, to verify boundaries. 
+ {"0xFF.0", L"0xFF.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 2}, + {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 3}, + {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", url_parse::Component(0, 9), CanonHostInfo::IPV4, 4}, + {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", url_parse::Component(0, 13), CanonHostInfo::IPV4, 2}, + {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", url_parse::Component(0, 15), CanonHostInfo::IPV4, 1}, + // Old trunctations tests. They're all "BROKEN" now. + {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"192.168.0.257", L"192.168.0.257", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"192.168.0xa20001", L"192.168.0xa20001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"192.015052000001", L"192.015052000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"0X12C0a80001", L"0X12C0a80001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"276.1.2", L"276.1.2", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Spaces should be rejected. + {"192.168.0.1 hello", L"192.168.0.1 hello", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Very large numbers. 
+ {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", url_parse::Component(0, 11), CanonHostInfo::IPV4, 3}, + {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", url_parse::Component(0, 11), CanonHostInfo::BROKEN, -1}, + // A number has no length limit, but long numbers can still overflow. + {"00000000000000000001", L"00000000000000000001", "0.0.0.1", url_parse::Component(0, 7), CanonHostInfo::IPV4, 1}, + {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // If a long component is non-numeric, it's a hostname, *not* a broken IP. + {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Truncation of all zeros should still result in 0. + {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", url_parse::Component(0, 7), CanonHostInfo::IPV4, 4}, + }; + + for (size_t i = 0; i < arraysize(cases); i++) { + // 8-bit version. + url_parse::Component component(0, + static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_canon::CanonHostInfo host_info; + url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1, + &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + + // 16-bit version. 
+ string16 input16(WStringToUTF16(cases[i].input16)); + component = url_parse::Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + url_canon::StdStringCanonOutput output2(&out_str2); + url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2, + &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV4) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + EXPECT_EQ(cases[i].expected_num_ipv4_components, + host_info.num_ipv4_components); + } + } +} + +TEST(URLCanonTest, IPv6) { + IPAddressCase cases[] = { + // Empty is not an IP address. + {"", L"", "", url_parse::Component(), CanonHostInfo::NEUTRAL, -1}, + // Non-IPs with [:] characters are marked BROKEN. + {":", L":", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[", L"[", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:", L"[:", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"]", L"]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {":]", L":]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[]", L"[]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:]", L"[:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Regular IP address is invalid without bounding '[' and ']'. + {"2001:db8::1", L"2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[2001:db8::1", L"[2001:db8::1", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"2001:db8::1]", L"2001:db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Regular IP addresses. 
+ {"[::]", L"[::]", "[::]", url_parse::Component(0,4), CanonHostInfo::IPV6, -1}, + {"[::1]", L"[::1]", "[::1]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1}, + {"[1::]", L"[1::]", "[1::]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1}, + {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", url_parse::Component(0,10), CanonHostInfo::IPV6, -1}, + {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1}, + + // Leading zeros should be stripped. + {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1}, + + // Upper case letters should be lowercased. + {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", url_parse::Component(0,20), CanonHostInfo::IPV6, -1}, + + // The same address can be written with different contractions, but should + // get canonicalized to the same thing. + {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1}, + {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", url_parse::Component(0,14), CanonHostInfo::IPV6, -1}, + + // IPv4 addresses + // Only mapped and compat addresses can have IPv4 syntax embedded. + {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // IPv4 with last component missing. + {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1}, + + // IPv4 using hex. + // TODO(eroman): Should this format be disallowed? 
+ {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", url_parse::Component(0,15), CanonHostInfo::IPV6, -1}, + + // There may be zeros surrounding the "::" contraction. + {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", url_parse::Component(0,5), CanonHostInfo::IPV6, -1}, + + {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", url_parse::Component(0,13), CanonHostInfo::IPV6, -1}, + + // Can only have one "::" contraction in an IPv6 string literal. + {"[2001::db8::1]", L"[2001::db8::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // No more than 2 consecutive ':'s. + {"[2001:db8:::1]", L"[2001:db8:::1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:::]", L"[:::]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Non-IP addresses due to invalid characters. + {"[2001::.com]", L"[2001::.com]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // If there are not enough components, the last one should fill them out. + // ... omitted at this time ... + // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses. + {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Too many bits (even though 8 comonents, the last one holds 32 bits). + {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Too many bits specified -- the contraction would have to be zero-length + // to not exceed 128 bits. + {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // The contraction is for 16 bits of zero. 
+ {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", url_parse::Component(0,17), CanonHostInfo::IPV6, -1}, + + // Cannot have a trailing colon. + {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Cannot have negative numbers. + {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Scope ID -- the URL may contain an optional ["%" <scope_id>] section. + // The scope_id should be included in the canonicalized URL, and is an + // unsigned decimal number. + + // Invalid because no ID was given after the percent. + + // Don't allow scope-id + {"[1::%1]", L"[1::%1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1::%eth0]", L"[1::%eth0]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[1::%]", L"[1::%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[%]", L"[%]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[::%:]", L"[::%:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // Don't allow leading or trailing colons. + {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + + // We allow a single trailing dot. + // ... omitted at this time ... + // Two dots in a row means not an IP address. + {"[::192.168..1]", L"[::192.168..1]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + // Any non-first components get truncated to one byte. + // ... omitted at this time ... + // Spaces should be rejected. 
+ {"[::1 hello]", L"[::1 hello]", "", url_parse::Component(), CanonHostInfo::BROKEN, -1}, + }; + + for (size_t i = 0; i < arraysize(cases); i++) { + // 8-bit version. + url_parse::Component component(0, + static_cast<int>(strlen(cases[i].input8))); + + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_canon::CanonHostInfo host_info; + url_canon::CanonicalizeIPAddress(cases[i].input8, component, &output1, + &host_info); + output1.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str1.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, + host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + + // 16-bit version. + string16 input16(WStringToUTF16(cases[i].input16)); + component = url_parse::Component(0, static_cast<int>(input16.length())); + + std::string out_str2; + url_canon::StdStringCanonOutput output2(&out_str2); + url_canon::CanonicalizeIPAddress(input16.c_str(), component, &output2, + &host_info); + output2.Complete(); + + EXPECT_EQ(cases[i].expected_family, host_info.family); + if (host_info.family == CanonHostInfo::IPV6) { + EXPECT_STREQ(cases[i].expected, out_str2.c_str()); + EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin); + EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len); + } + } +} + +TEST(URLCanonTest, UserInfo) { + // Note that the canonicalizer should escape and treat empty components as + // not being there. + + // We actually parse a full input URL so we can get the initial components. 
+ struct UserComponentCase { + const char* input; + const char* expected; + url_parse::Component expected_username; + url_parse::Component expected_password; + bool expected_success; + } user_info_cases[] = { + {"http://user:pass@host.com/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true}, + {"http://@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true}, + {"http://:@host.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true}, + {"http://foo:@host.com/", "foo@", url_parse::Component(0, 3), url_parse::Component(0, -1), true}, + {"http://:foo@host.com/", ":foo@", url_parse::Component(0, 0), url_parse::Component(1, 3), true}, + {"http://^ :$\t@host.com/", "%5E%20:$%09@", url_parse::Component(0, 6), url_parse::Component(7, 4), true}, + {"http://user:pass@/", "user:pass@", url_parse::Component(0, 4), url_parse::Component(5, 4), true}, + {"http://%2540:bar@domain.com/", "%2540:bar@", url_parse::Component(0, 5), url_parse::Component(6, 3), true }, + + // IE7 compatability: old versions allowed backslashes in usernames, but + // IE7 does not. We disallow it as well. 
+ {"ftp://me\\mydomain:pass@foo.com/", "", url_parse::Component(0, -1), url_parse::Component(0, -1), true}, + }; + + for (size_t i = 0; i < ARRAYSIZE(user_info_cases); i++) { + int url_len = static_cast<int>(strlen(user_info_cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(user_info_cases[i].input, url_len, &parsed); + url_parse::Component out_user, out_pass; + std::string out_str; + url_canon::StdStringCanonOutput output1(&out_str); + + bool success = url_canon::CanonicalizeUserInfo(user_info_cases[i].input, + parsed.username, + user_info_cases[i].input, + parsed.password, + &output1, &out_user, + &out_pass); + output1.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + + // Now try the wide version + out_str.clear(); + url_canon::StdStringCanonOutput output2(&out_str); + string16 wide_input(ConvertUTF8ToUTF16(user_info_cases[i].input)); + success = url_canon::CanonicalizeUserInfo(wide_input.c_str(), + parsed.username, + wide_input.c_str(), + parsed.password, + &output2, &out_user, &out_pass); + output2.Complete(); + + EXPECT_EQ(user_info_cases[i].expected_success, success); + EXPECT_EQ(std::string(user_info_cases[i].expected), out_str); + EXPECT_EQ(user_info_cases[i].expected_username.begin, out_user.begin); + EXPECT_EQ(user_info_cases[i].expected_username.len, out_user.len); + EXPECT_EQ(user_info_cases[i].expected_password.begin, out_pass.begin); + EXPECT_EQ(user_info_cases[i].expected_password.len, out_pass.len); + } +} + +TEST(URLCanonTest, Port) { + // We only need to test that the number gets properly put into the output + // buffer. 
The parser unit tests will test scanning the number correctly. + // + // Note that the CanonicalizePort will always prepend a colon to the output + // to separate it from the colon that it assumes precedes it. + struct PortCase { + const char* input; + int default_port; + const char* expected; + url_parse::Component expected_component; + bool expected_success; + } port_cases[] = { + // Invalid input should be copied w/ failure. + {"as df", 80, ":as%20df", url_parse::Component(1, 7), false}, + {"-2", 80, ":-2", url_parse::Component(1, 2), false}, + // Default port should be omitted. + {"80", 80, "", url_parse::Component(0, -1), true}, + {"8080", 80, ":8080", url_parse::Component(1, 4), true}, + // PORT_UNSPECIFIED should mean always keep the port. + {"80", url_parse::PORT_UNSPECIFIED, ":80", url_parse::Component(1, 2), true}, + }; + + for (size_t i = 0; i < ARRAYSIZE(port_cases); i++) { + int url_len = static_cast<int>(strlen(port_cases[i].input)); + url_parse::Component in_comp(0, url_len); + url_parse::Component out_comp; + std::string out_str; + url_canon::StdStringCanonOutput output1(&out_str); + bool success = url_canon::CanonicalizePort(port_cases[i].input, in_comp, + port_cases[i].default_port, + &output1, &out_comp); + output1.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + + // Now try the wide version + out_str.clear(); + url_canon::StdStringCanonOutput output2(&out_str); + string16 wide_input(ConvertUTF8ToUTF16(port_cases[i].input)); + success = url_canon::CanonicalizePort(wide_input.c_str(), in_comp, + port_cases[i].default_port, + &output2, &out_comp); + output2.Complete(); + + EXPECT_EQ(port_cases[i].expected_success, success); + EXPECT_EQ(std::string(port_cases[i].expected), out_str); + EXPECT_EQ(port_cases[i].expected_component.begin, 
out_comp.begin); + EXPECT_EQ(port_cases[i].expected_component.len, out_comp.len); + } +} + +TEST(URLCanonTest, Path) { + DualComponentCase path_cases[] = { + // ----- path collapsing tests ----- + {"/././foo", L"/././foo", "/foo", url_parse::Component(0, 4), true}, + {"/./.foo", L"/./.foo", "/.foo", url_parse::Component(0, 5), true}, + {"/foo/.", L"/foo/.", "/foo/", url_parse::Component(0, 5), true}, + {"/foo/./", L"/foo/./", "/foo/", url_parse::Component(0, 5), true}, + // double dots followed by a slash or the end of the string count + {"/foo/bar/..", L"/foo/bar/..", "/foo/", url_parse::Component(0, 5), true}, + {"/foo/bar/../", L"/foo/bar/../", "/foo/", url_parse::Component(0, 5), true}, + // don't count double dots when they aren't followed by a slash + {"/foo/..bar", L"/foo/..bar", "/foo/..bar", url_parse::Component(0, 10), true}, + // some in the middle + {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", url_parse::Component(0, 8), true}, + {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a", url_parse::Component(0, 2), true}, + // we should not be able to go above the root + {"/foo/../../..", L"/foo/../../..", "/", url_parse::Component(0, 1), true}, + {"/foo/../../../ton", L"/foo/../../../ton", "/ton", url_parse::Component(0, 4), true}, + // escaped dots should be unescaped and treated the same as dots + {"/foo/%2e", L"/foo/%2e", "/foo/", url_parse::Component(0, 5), true}, + {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", url_parse::Component(0, 8), true}, + {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar", "/..bar", url_parse::Component(0, 6), true}, + // Multiple slashes in a row should be preserved and treated like empty + // directory names. 
+ {"////../..", L"////../..", "//", url_parse::Component(0, 2), true}, + + // ----- escaping tests ----- + {"/foo", L"/foo", "/foo", url_parse::Component(0, 4), true}, + // Valid escape sequence + {"/%20foo", L"/%20foo", "/%20foo", url_parse::Component(0, 7), true}, + // Invalid escape sequence we should pass through unchanged. + {"/foo%", L"/foo%", "/foo%", url_parse::Component(0, 5), true}, + {"/foo%2", L"/foo%2", "/foo%2", url_parse::Component(0, 6), true}, + // Invalid escape sequence: bad characters should be treated the same as + // the surrounding text, not as escaped (in this case, UTF-8). + {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", url_parse::Component(0, 10), true}, + {"/foo%2\xc2\xa9zbar", NULL, "/foo%2%C2%A9zbar", url_parse::Component(0, 16), true}, + {NULL, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", url_parse::Component(0, 22), true}, + // Regular characters that are escaped should be unescaped + {"/foo%41%7a", L"/foo%41%7a", "/fooAz", url_parse::Component(0, 6), true}, + // Funny characters that are unescaped should be escaped + {"/foo\x09\x91%91", NULL, "/foo%09%91%91", url_parse::Component(0, 13), true}, + {NULL, L"/foo\x09\x91%91", "/foo%09%C2%91%91", url_parse::Component(0, 16), true}, + // Invalid characters that are escaped should cause a failure. + {"/foo%00%51", L"/foo%00%51", "/foo%00Q", url_parse::Component(0, 8), false}, + // Some characters should be passed through unchanged regardless of esc. + {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", url_parse::Component(0, 13), true}, + // Characters that are properly escaped should not have the case changed + // of hex letters. 
+ {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", url_parse::Component(0, 13), true}, + // Funny characters that are unescaped should be escaped + {"/foo\tbar", L"/foo\tbar", "/foo%09bar", url_parse::Component(0, 10), true}, + // Backslashes should get converted to forward slashes + {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", url_parse::Component(0, 8), true}, + // Hashes found in paths (possibly only when the caller explicitly sets + // the path on an already-parsed URL) should be escaped. + {"/foo#bar", L"/foo#bar", "/foo%23bar", url_parse::Component(0, 10), true}, + // %7f should be allowed and %3D should not be unescaped (these were wrong + // in a previous version). + {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", url_parse::Component(0, 24), true}, + // @ should be unescaped. + {"/@asdf%40", L"/@asdf%40", "/@asdf@", url_parse::Component(0, 7), true}, + + // ----- encoding tests ----- + // Basic conversions + {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", url_parse::Component(0, 37), true}, + // Invalid unicode characters should fail. We only do validation on + // UTF-16 input, so this doesn't happen on 8-bit. 
+ {"/\xef\xb7\x90zyx", NULL, "/%EF%B7%90zyx", url_parse::Component(0, 13), true}, + {NULL, L"/\xfdd0zyx", "/%EF%BF%BDzyx", url_parse::Component(0, 13), false}, + }; + + for (size_t i = 0; i < arraysize(path_cases); i++) { + if (path_cases[i].input8) { + int len = static_cast<int>(strlen(path_cases[i].input8)); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizePath(path_cases[i].input8, in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + + if (path_cases[i].input16) { + string16 input16(WStringToUTF16(path_cases[i].input16)); + int len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + + bool success = url_canon::CanonicalizePath(input16.c_str(), in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(path_cases[i].expected_success, success); + EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(path_cases[i].expected, out_str); + } + } + + // Manual test: embedded NULLs should be escaped and the URL should be marked + // as invalid. 
+ const char path_with_null[] = "/ab\0c"; + url_parse::Component in_comp(0, 5); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizePath(path_with_null, in_comp, + &output, &out_comp); + output.Complete(); + EXPECT_FALSE(success); + EXPECT_EQ("/ab%00c", out_str); +} + +TEST(URLCanonTest, Query) { + struct QueryCase { + const char* input8; + const wchar_t* input16; + const char* encoding; + const char* expected; + } query_cases[] = { + // Regular ASCII case in some different encodings. + {"foo=bar", L"foo=bar", NULL, "?foo=bar"}, + {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"}, + {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"}, + {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"}, + // Allow question marks in the query without escaping + {"as?df", L"as?df", NULL, "?as?df"}, + // Always escape '#' since it would mark the ref. + {"as#df", L"as#df", NULL, "?as%23df"}, + // Escape some questionable 8-bit characters, but never unescape. + {"\x02hello\x7f bye", L"\x02hello\x7f bye", NULL, "?%02hello%7F%20bye"}, + {"%40%41123", L"%40%41123", NULL, "?%40%41123"}, + // Chinese input/output + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", NULL, "?q=%E4%BD%A0%E5%A5%BD"}, + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", "?q=%C4%E3%BA%C3"}, + {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"}, + // Unencodable character in the destination character set should be + // escaped. The escape sequence unescapes to be the entity name: + // "?q=Chinese&#65319;" + {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", "?q=Chinese%26%2365319%3B"}, + // Invalid UTF-8/16 input should be replaced with invalid characters. + {"q=\xed\xed", L"q=\xd800\xd800", NULL, "?q=%EF%BF%BD%EF%BF%BD"}, + // Don't allow < or > because sometimes they are used for XSS if the + // URL is echoed in content. Firefox does this, IE doesn't. 
+ {"q=<asdf>", L"q=<asdf>", NULL, "?q=%3Casdf%3E"}, + // Escape double quotemarks in the query. + {"q=\"asdf\"", L"q=\"asdf\"", NULL, "?q=%22asdf%22"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(query_cases); i++) { + url_parse::Component out_comp; + + UConvScoper conv(query_cases[i].encoding); + ASSERT_TRUE(!query_cases[i].encoding || conv.converter()); + url_canon::ICUCharsetConverter converter(conv.converter()); + + // Map NULL to a NULL converter pointer. + url_canon::ICUCharsetConverter* conv_pointer = &converter; + if (!query_cases[i].encoding) + conv_pointer = NULL; + + if (query_cases[i].input8) { + int len = static_cast<int>(strlen(query_cases[i].input8)); + url_parse::Component in_comp(0, len); + std::string out_str; + + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeQuery(query_cases[i].input8, in_comp, + conv_pointer, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + + if (query_cases[i].input16) { + string16 input16(WStringToUTF16(query_cases[i].input16)); + int len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, len); + std::string out_str; + + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeQuery(input16.c_str(), in_comp, + conv_pointer, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(query_cases[i].expected, out_str); + } + } + + // Extra test for input with embedded NULL; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Component out_comp; + url_canon::CanonicalizeQuery("a \x00z\x01", url_parse::Component(0, 5), NULL, + &output, &out_comp); + output.Complete(); + EXPECT_EQ("?a%20%00z%01", out_str); +} + +TEST(URLCanonTest, Ref) { + // Refs are trivial, it just checks the encoding. + DualComponentCase ref_cases[] = { + // Regular one, we shouldn't escape spaces, et al. 
+ {"hello, world", L"hello, world", "#hello, world", url_parse::Component(1, 12), true}, + // UTF-8/wide input should be preserved + {"\xc2\xa9", L"\xa9", "#\xc2\xa9", url_parse::Component(1, 2), true}, + // Test a character that takes > 16 bits (U+10300 = old italic letter A) + {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#\xF0\x90\x8C\x80ss", url_parse::Component(1, 6), true}, + // Escaping should be preserved unchanged, even invalid ones + {"%41%a", L"%41%a", "#%41%a", url_parse::Component(1, 5), true}, + // Invalid UTF-8/16 input should be flagged and the input made valid + {"\xc2", NULL, "#\xef\xbf\xbd", url_parse::Component(1, 3), true}, + {NULL, L"\xd800\x597d", "#\xef\xbf\xbd\xe5\xa5\xbd", url_parse::Component(1, 6), true}, + // Test a Unicode invalid character. + {"a\xef\xb7\x90", L"a\xfdd0", "#a\xef\xbf\xbd", url_parse::Component(1, 4), true}, + // Refs can have # signs and we should preserve them. + {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", url_parse::Component(1, 9), true}, + {"#asdf", L"#asdf", "##asdf", url_parse::Component(1, 5), true}, + }; + + for (size_t i = 0; i < arraysize(ref_cases); i++) { + // 8-bit input + if (ref_cases[i].input8) { + int len = static_cast<int>(strlen(ref_cases[i].input8)); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeRef(ref_cases[i].input8, in_comp, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + + // 16-bit input + if (ref_cases[i].input16) { + string16 input16(WStringToUTF16(ref_cases[i].input16)); + int len = static_cast<int>(input16.length()); + url_parse::Component in_comp(0, len); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + 
url_canon::CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp); + output.Complete(); + + EXPECT_EQ(ref_cases[i].expected_component.begin, out_comp.begin); + EXPECT_EQ(ref_cases[i].expected_component.len, out_comp.len); + EXPECT_EQ(ref_cases[i].expected, out_str); + } + } + + // Try one with an embedded NULL. It should be stripped. + const char null_input[5] = "ab\x00z"; + url_parse::Component null_input_component(0, 4); + url_parse::Component out_comp; + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_canon::CanonicalizeRef(null_input, null_input_component, + &output, &out_comp); + output.Complete(); + + EXPECT_EQ(1, out_comp.begin); + EXPECT_EQ(3, out_comp.len); + EXPECT_EQ("#abz", out_str); +} + +TEST(URLCanonTest, CanonicalizeStandardURL) { + // The individual component canonicalize tests should have caught the cases + // for each of those components. Here, we just need to test that the various + // parts are included or excluded properly, and have the correct separators. + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + } cases[] = { + {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#", true}, + {"http://[www.google.com]/", "http://[www.google.com]/", false}, + {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#", false}, + {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo", true}, + {"www.google.com", ":www.google.com/", true}, + {"http://192.0x00A80001", "http://192.168.0.1/", true}, + {"http://www/foo%2Ehtml", "http://www/foo.html", true}, + {"http://user:pass@/", "http://user:pass@/", false}, + {"http://%25DOMAIN:foobar@foodomain.com/", "http://%25DOMAIN:foobar@foodomain.com/", true}, + + // Backslashes should get converted to forward slashes. + {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true}, + + // Busted refs shouldn't make the whole thing fail. 
+ {"http://www.google.com/asdf#\xc2", "http://www.google.com/asdf#\xef\xbf\xbd", true}, + + // Basic port tests. + {"http://foo:80/", "http://foo/", true}, + {"http://foo:81/", "http://foo:81/", true}, + {"httpa://foo:80/", "httpa://foo:80/", true}, + {"http://foo:-80/", "http://foo:-80/", false}, + + {"https://foo:443/", "https://foo/", true}, + {"https://foo:80/", "https://foo:80/", true}, + {"ftp://foo:21/", "ftp://foo/", true}, + {"ftp://foo:80/", "ftp://foo:80/", true}, + {"gopher://foo:70/", "gopher://foo/", true}, + {"gopher://foo:443/", "gopher://foo:443/", true}, + {"ws://foo:80/", "ws://foo/", true}, + {"ws://foo:81/", "ws://foo:81/", true}, + {"ws://foo:443/", "ws://foo:443/", true}, + {"ws://foo:815/", "ws://foo:815/", true}, + {"wss://foo:80/", "wss://foo:80/", true}, + {"wss://foo:81/", "wss://foo:81/", true}, + {"wss://foo:443/", "wss://foo/", true}, + {"wss://foo:815/", "wss://foo:815/", true}, + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(cases[i].input, url_len, &parsed); + + url_parse::Parsed out_parsed; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizeStandardURL( + cases[i].input, url_len, parsed, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + } +} + +// The codepath here is the same as for regular canonicalization, so we just +// need to test that things are replaced or not correctly. +TEST(URLCanonTest, ReplaceStandardURL) { + ReplaceCase replace_cases[] = { + // Common case of truncating the path. 
+ {"http://www.google.com/foo?bar=baz#ref", NULL, NULL, NULL, NULL, NULL, "/", kDeleteComp, kDeleteComp, "http://www.google.com/"}, + // Replace everything + {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"}, + // Replace nothing + {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParseStandardURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; // Clean up syntax. + + // Note that for the scheme we pass in a different clear function since + // there is no function to clear the scheme. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplaceStandardURL(replace_cases[i].base, parsed, + r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } + + // The path pointer should be ignored if the address is invalid. 
+ { + const char src[] = "http://www.google.com/here_is_the_path"; + int src_len = static_cast<int>(strlen(src)); + + url_parse::Parsed parsed; + url_parse::ParseStandardURL(src, src_len, &parsed); + + // Replace the path with a 0-length string. By using 1 as the string address, + // the test should get an access violation if it tries to dereference it. + url_canon::Replacements<char> r; + r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component(0, 0)); + std::string out_str1; + url_canon::StdStringCanonOutput output1(&out_str1); + url_parse::Parsed new_parsed; + url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output1, &new_parsed); + output1.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str1.c_str()); + + // Same with an "invalid" path. + r.SetPath(reinterpret_cast<char*>(0x00000001), url_parse::Component()); + std::string out_str2; + url_canon::StdStringCanonOutput output2(&out_str2); + url_canon::ReplaceStandardURL(src, parsed, r, NULL, &output2, &new_parsed); + output2.Complete(); + EXPECT_STREQ("http://www.google.com/", out_str2.c_str()); + } +} + +TEST(URLCanonTest, ReplaceFileURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, + // Replace nothing + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///C:/gaba?query#ref"}, + // Clear non-path components (common) + {"file:///C:/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///C:/gaba"}, + // Replace path with something that doesn't begin with a slash and make + // sure it gets added properly. 
+ {"file:///C:/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, "filer", NULL, "/foo", "b", "c", "file://filer/foo?b#c"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "file:///home/gaba?query#ref"}, + {"file:///home/gaba?query#ref", NULL, NULL, NULL, NULL, NULL, NULL, kDeleteComp, kDeleteComp, "file:///home/gaba"}, + {"file:///home/gaba", NULL, NULL, NULL, NULL, NULL, "interesting/", NULL, NULL, "file:///interesting/"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParseFileURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; // Clean up syntax. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplaceFileURL(cur.base, parsed, + r, NULL, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplacePathURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"data:foo", "javascript", NULL, NULL, NULL, NULL, "alert('foo?');", NULL, NULL, "javascript:alert('foo?');"}, + // Replace nothing + {"data:foo", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "data:foo"}, + // Replace one or the 
other + {"data:foo", "javascript", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "javascript:foo"}, + {"data:foo", NULL, NULL, NULL, NULL, NULL, "bar", NULL, NULL, "data:bar"}, + {"data:foo", NULL, NULL, NULL, NULL, NULL, kDeleteComp, NULL, NULL, "data:"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParsePathURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; // Clean up syntax. + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplacePathURL(cur.base, parsed, + r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, ReplaceMailtoURL) { + ReplaceCase replace_cases[] = { + // Replace everything + {"mailto:jon@foo.com?body=sup", "mailto", NULL, NULL, NULL, NULL, "addr1", "to=tony", NULL, "mailto:addr1?to=tony"}, + // Replace nothing + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mailto:jon@foo.com?body=sup"}, + // Replace the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", NULL, NULL, "mailto:jason?body=sup"}, + // Replace the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "custom=1", NULL, "mailto:jon@foo.com?custom=1"}, + // 
Replace the path and query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "jason", "custom=1", NULL, "mailto:jason?custom=1"}, + // Set the query to empty (should leave trailing question mark) + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "", NULL, "mailto:jon@foo.com?"}, + // Clear the query + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, NULL, "|", NULL, "mailto:jon@foo.com"}, + // Clear the path + {"mailto:jon@foo.com?body=sup", NULL, NULL, NULL, NULL, NULL, "|", NULL, NULL, "mailto:?body=sup"}, + // Clear the path + query + {"mailto:", NULL, NULL, NULL, NULL, NULL, "|", "|", NULL, "mailto:"}, + // Setting the ref should have no effect + {"mailto:addr1", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "BLAH", "mailto:addr1"}, + }; + + for (size_t i = 0; i < arraysize(replace_cases); i++) { + const ReplaceCase& cur = replace_cases[i]; + int base_len = static_cast<int>(strlen(cur.base)); + url_parse::Parsed parsed; + url_parse::ParseMailtoURL(cur.base, base_len, &parsed); + + url_canon::Replacements<char> r; + typedef url_canon::Replacements<char> R; + SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme); + SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username); + SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password); + SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host); + SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port); + SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path); + SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query); + SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + url_parse::Parsed out_parsed; + url_canon::ReplaceMailtoURL(cur.base, parsed, + r, &output, &out_parsed); + output.Complete(); + + EXPECT_EQ(replace_cases[i].expected, out_str); + } +} + +TEST(URLCanonTest, CanonicalizeFileURL) { + struct URLCase { + const char* input; + const char* expected; + bool 
expected_success; + url_parse::Component expected_host; + url_parse::Component expected_path; + } cases[] = { +#ifdef _WIN32 + // Windows-style paths + {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)}, + {" File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)}, + {"file:", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)}, + {"file:UNChost/path", "file://unchost/path", true, url_parse::Component(7, 7), url_parse::Component(14, 5)}, + // CanonicalizeFileURL supports absolute Windows style paths for IE + // compatibility. Note that the caller must decide that this is a file + // URL itself so it can call the file canonicalizer. This is usually + // done automatically as part of relative URL resolving. + {"c:\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"/C|\\foo\\bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"//C|/foo/bar", "file:///C:/foo/bar", true, url_parse::Component(), url_parse::Component(7, 11)}, + {"//server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)}, + {"\\\\server\\file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)}, + {"/\\server/file", "file://server/file", true, url_parse::Component(7, 6), url_parse::Component(13, 5)}, + // We should preserve the number of slashes after the colon for IE + // compatibility, except when there is none, in which case we should + // add one. 
+ {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)}, + {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)}, + // Three slashes should be non-UNC, even if there is no drive spec (IE + // does this, which makes the resulting request invalid). + {"file:///foo/bar.txt", "file:///foo/bar.txt", true, url_parse::Component(), url_parse::Component(7, 12)}, + // TODO(brettw) we should probably fail for invalid host names, which + // would change the expected result on this test. We also currently allow + // colon even though it's probably invalid, because it's currently the + // "natural" result of the way the canonicalizer is written. There doesn't + // seem to be a strong argument for why allowing it here would be bad, so + // we just tolerate it and the load will fail later. + {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false, url_parse::Component(7, 2), url_parse::Component(9, 16)}, + {"file:filer/home\\me", "file://filer/home/me", true, url_parse::Component(7, 5), url_parse::Component(12, 8)}, + // Make sure relative paths can't go above the "C:" + {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true, url_parse::Component(), url_parse::Component(7, 12)}, + // Busted refs shouldn't make the whole thing fail. + {"file:///C:/asdf#\xc2", "file:///C:/asdf#\xef\xbf\xbd", true, url_parse::Component(), url_parse::Component(7, 8)}, +#else + // Unix-style paths + {"file:///home/me", "file:///home/me", true, url_parse::Component(), url_parse::Component(7, 8)}, + // Windowsy ones should still get treated as Unix-style. 
+ {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 16)}, + {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true, url_parse::Component(), url_parse::Component(7, 19)}, + // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html) + {"//", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)}, + {"///", "file:///", true, url_parse::Component(), url_parse::Component(7, 1)}, + {"///test", "file:///test", true, url_parse::Component(), url_parse::Component(7, 5)}, + {"file://test", "file://test/", true, url_parse::Component(7, 4), url_parse::Component(11, 1)}, + {"file://localhost", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)}, + {"file://localhost/", "file://localhost/", true, url_parse::Component(7, 9), url_parse::Component(16, 1)}, + {"file://localhost/test", "file://localhost/test", true, url_parse::Component(7, 9), url_parse::Component(16, 5)}, +#endif // _WIN32 + }; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParseFileURL(cases[i].input, url_len, &parsed); + + url_parse::Parsed out_parsed; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizeFileURL(cases[i].input, url_len, + parsed, NULL, &output, + &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified, the file canonicalizer has + // different code for writing the spec. 
+ EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(4, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_host.begin, out_parsed.host.begin); + EXPECT_EQ(cases[i].expected_host.len, out_parsed.host.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + } +} + +TEST(URLCanonTest, CanonicalizePathURL) { + // Path URLs should get canonicalized schemes but nothing else. + struct PathCase { + const char* input; + const char* expected; + } path_cases[] = { + {"javascript:", "javascript:"}, + {"JavaScript:Foo", "javascript:Foo"}, + {":\":This /is interesting;?#", ":\":This /is interesting;?#"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(path_cases); i++) { + int url_len = static_cast<int>(strlen(path_cases[i].input)); + url_parse::Parsed parsed; + url_parse::ParsePathURL(path_cases[i].input, url_len, &parsed); + + url_parse::Parsed out_parsed; + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizePathURL(path_cases[i].input, url_len, + parsed, &output, + &out_parsed); + output.Complete(); + + EXPECT_TRUE(success); + EXPECT_EQ(path_cases[i].expected, out_str); + + EXPECT_EQ(0, out_parsed.host.begin); + EXPECT_EQ(-1, out_parsed.host.len); + + // When we end with a colon at the end, there should be no path. + if (path_cases[i].input[url_len - 1] == ':') { + EXPECT_EQ(0, out_parsed.path.begin); + EXPECT_EQ(-1, out_parsed.path.len); + } + } +} + +TEST(URLCanonTest, CanonicalizeMailtoURL) { + struct URLCase { + const char* input; + const char* expected; + bool expected_success; + url_parse::Component expected_path; + url_parse::Component expected_query; + } cases[] = { + {"mailto:addr1", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()}, + {"mailto:addr1@foo.com", "mailto:addr1@foo.com", true, url_parse::Component(7, 13), url_parse::Component()}, + // Trailing whitespace is stripped. 
+ {"MaIlTo:addr1 \t ", "mailto:addr1", true, url_parse::Component(7, 5), url_parse::Component()}, + {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, url_parse::Component(7, 5), url_parse::Component(13,6)}, + {"mailto:addr1,addr2", "mailto:addr1,addr2", true, url_parse::Component(7, 11), url_parse::Component()}, + {"mailto:addr1, addr2", "mailto:addr1, addr2", true, url_parse::Component(7, 12), url_parse::Component()}, + {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, url_parse::Component(7, 13), url_parse::Component()}, + {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true, url_parse::Component(7, 12), url_parse::Component()}, + // Null character should be escaped to %00 + {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true, url_parse::Component(7, 13), url_parse::Component(21, 3)}, + // Invalid -- UTF-8 encoded surrogate value. + {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD", false, url_parse::Component(7, 9), url_parse::Component()}, + {"mailto:addr1?", "mailto:addr1?", true, url_parse::Component(7, 5), url_parse::Component(13, 0)}, + }; + + // Define outside of loop to catch bugs where components aren't reset + url_parse::Parsed parsed; + url_parse::Parsed out_parsed; + + for (size_t i = 0; i < ARRAYSIZE(cases); i++) { + int url_len = static_cast<int>(strlen(cases[i].input)); + if (i == 8) { + // The 9th test case purposely has a '\0' in it -- don't count it + // as the string terminator. 
+ url_len = 22; + } + url_parse::ParseMailtoURL(cases[i].input, url_len, &parsed); + + std::string out_str; + url_canon::StdStringCanonOutput output(&out_str); + bool success = url_canon::CanonicalizeMailtoURL(cases[i].input, url_len, + parsed, &output, + &out_parsed); + output.Complete(); + + EXPECT_EQ(cases[i].expected_success, success); + EXPECT_EQ(cases[i].expected, out_str); + + // Make sure the spec was properly identified + EXPECT_EQ(0, out_parsed.scheme.begin); + EXPECT_EQ(6, out_parsed.scheme.len); + + EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin); + EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len); + + EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin); + EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len); + } +} + +#ifndef WIN32 + +TEST(URLCanonTest, _itoa_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. 
+ char buf[6]; + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(12, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + // Test the edge cases - exactly the buffer size and one over + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 10)); + EXPECT_STREQ("1234", buf); + EXPECT_EQ('\xFF', buf[5]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(EINVAL, url_canon::_itoa_s(12345, buf, sizeof(buf) - 1, 10)); + EXPECT_EQ('\xFF', buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(12, buf, 10)); + EXPECT_STREQ("12", buf); + EXPECT_EQ('\xFF', buf[3]); + + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(12345, buf, 10)); + EXPECT_STREQ("12345", buf); + + EXPECT_EQ(EINVAL, url_canon::_itoa_s(123456, buf, 10)); + + // Test that radix 16 is supported. + memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itoa_s(1234, buf, sizeof(buf) - 1, 16)); + EXPECT_STREQ("4d2", buf); + EXPECT_EQ('\xFF', buf[5]); +} + +TEST(URLCanonTest, _itow_s) { + // We fill the buffer with 0xff to ensure that it's getting properly + // null-terminated. We also allocate one byte more than what we tell + // _itoa_s about, and ensure that the extra byte is untouched. 
+ char16 buf[6]; + const char fill_mem = 0xff; + const char16 fill_char = 0xffff; + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itow_s(12, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(WStringToUTF16(L"12"), string16(buf)); + EXPECT_EQ(fill_char, buf[3]); + + // Test the edge cases - exactly the buffer size and one over + EXPECT_EQ(0, url_canon::_itow_s(1234, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(WStringToUTF16(L"1234"), string16(buf)); + EXPECT_EQ(fill_char, buf[5]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(EINVAL, url_canon::_itow_s(12345, buf, sizeof(buf) / 2 - 1, 10)); + EXPECT_EQ(fill_char, buf[5]); // should never write to this location + + // Test the template overload (note that this will see the full buffer) + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itow_s(12, buf, 10)); + EXPECT_EQ(WStringToUTF16(L"12"), string16(buf)); + EXPECT_EQ(fill_char, buf[3]); + + memset(buf, fill_mem, sizeof(buf)); + EXPECT_EQ(0, url_canon::_itow_s(12345, buf, 10)); + EXPECT_EQ(WStringToUTF16(L"12345"), string16(buf)); + + EXPECT_EQ(EINVAL, url_canon::_itow_s(123456, buf, 10)); +} + +#endif // !WIN32 + +// Returns true if the given two structures are the same. 
+static bool ParsedIsEqual(const url_parse::Parsed& a, + const url_parse::Parsed& b) { + return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len && + a.username.begin == b.username.begin && a.username.len == b.username.len && + a.password.begin == b.password.begin && a.password.len == b.password.len && + a.host.begin == b.host.begin && a.host.len == b.host.len && + a.port.begin == b.port.begin && a.port.len == b.port.len && + a.path.begin == b.path.begin && a.path.len == b.path.len && + a.query.begin == b.query.begin && a.query.len == b.query.len && + a.ref.begin == b.ref.begin && a.ref.len == b.ref.len; +} + +TEST(URLCanonTest, ResolveRelativeURL) { + struct RelativeCase { + const char* base; // Input base URL: MUST BE CANONICAL + bool is_base_hier; // Is the base URL hierarchical + bool is_base_file; // Tells us if the base is a file URL. + const char* test; // Input URL to test against. + bool succeed_relative; // Whether we expect IsRelativeURL to succeed + bool is_rel; // Whether we expect |test| to be relative or not. + bool succeed_resolve; // Whether we expect ResolveRelativeURL to succeed. + const char* resolved; // What we expect in the result when resolving. + } rel_cases[] = { + // Basic absolute input. + {"http://host/a", true, false, "http://another/", true, false, false, NULL}, + {"http://host/a", true, false, "http:////another/", true, false, false, NULL}, + // Empty relative URLs shouldn't change the input. + {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"}, + // Spaces at the ends of the relative path should be ignored. + {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"}, + {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"}, + {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"}, + // Matching schemes without two slashes are treated as relative. 
+ {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"}, + {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"}, + {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"}, + {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"}, + // Nonmatching schemes are absolute. + {"http://host/a", true, false, "https:host2", true, false, false, NULL}, + {"http://host/a", true, false, "htto:/host2", true, false, false, NULL}, + // Absolute path input + {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"}, + {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"}, + {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"}, + {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"}, + // Relative path input + {"http://host/a", true, false, "b", true, true, true, "http://host/b"}, + {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"}, + {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"}, + {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"}, + {"http://host/a/", true, false, "..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "./..", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "../.", true, true, true, "http://host/"}, + {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"}, + {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"}, + // Query input + {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"}, + {"http://host/a?x=y#z", true, false, "?", true, 
true, true, "http://host/a?"}, + {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"}, + // Ref input + {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"}, + {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"}, + {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"}, + // Non-hierarchical base: no relative handling. Relative input should + // error, and if a scheme is present, it should be treated as absolute. + {"data:foobar", false, false, "baz.html", false, false, false, NULL}, + {"data:foobar", false, false, "data:baz", true, false, false, NULL}, + {"data:foobar", false, false, "data:/base", true, false, false, NULL}, + // Non-hierarchical base: absolute input should succeed. + {"data:foobar", false, false, "http://host/", true, false, false, NULL}, + {"data:foobar", false, false, "http:host", true, false, false, NULL}, + // Invalid schemes should be treated as relative. + {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"}, + {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"}, + {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"}, + {"data:asdf", false, false, ":foo", false, false, false, NULL}, + // We should treat semicolons like any other character in URL resolving + {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"}, + {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"}, + // Relative URLs can also be written as "//foo/bar" which is relative to + // the scheme. In this case, it would take the old scheme, so for http + // the example would resolve to "http://foo/bar". 
+ {"http://host/a", true, false, "//another", true, true, true, "http://another/"}, + {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"}, + {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "//", true, true, false, "http:"}, + // IE will also allow one or the other to be a backslash to get the same + // behavior. + {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"}, + {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"}, +#ifdef WIN32 + // Resolving against Windows file base URLs. + {"file:///C:/foo", true, true, "http://host/", true, false, false, NULL}, + {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"}, + {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"}, + {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"}, + // But two backslashes on Windows should be UNC so should be treated + // as absolute. + {"http://host/a", true, false, "\\\\another\\path", true, false, false, NULL}, + // IE doesn't support drive specs starting with two slashes. It fails + // immediately and doesn't even try to load. We fix it up to either + // an absolute path or UNC depending on what it looks like. + {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"}, + {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"}, + // Windows drive specs should be allowed and treated as absolute. 
+ {"file:///C:/foo", true, true, "c:", true, false, false, NULL}, + {"file:///C:/foo", true, true, "c:/foo", true, false, false, NULL}, + {"http://host/a", true, false, "c:\\foo", true, false, false, NULL}, + // Relative paths with drive letters should be allowed when the base is + // also a file. + {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"}, + // Treat absolute paths as being off of the drive. + {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"}, + {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"}, + {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"}, + // On Windows, two slashes without a drive letter when the base is a file + // means that the path is UNC. + {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"}, + {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"}, +#else + // On Unix we fall back to relative behavior since there's nothing else + // reasonable to do. + {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"}, +#endif + // Even on Windows, we don't allow relative drive specs when the base + // is not file. + {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"}, + {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(rel_cases); i++) { + const RelativeCase& cur_case = rel_cases[i]; + + url_parse::Parsed parsed; + int base_len = static_cast<int>(strlen(cur_case.base)); + if (cur_case.is_base_file) + url_parse::ParseFileURL(cur_case.base, base_len, &parsed); + else if (cur_case.is_base_hier) + url_parse::ParseStandardURL(cur_case.base, base_len, &parsed); + else + url_parse::ParsePathURL(cur_case.base, base_len, &parsed); + + // First see if it is relative. 
+ int test_len = static_cast<int>(strlen(cur_case.test)); + bool is_relative; + url_parse::Component relative_component; + bool succeed_is_rel = url_canon::IsRelativeURL( + cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier, + &is_relative, &relative_component); + + EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) << + "succeed is rel failure on " << cur_case.test; + EXPECT_EQ(cur_case.is_rel, is_relative) << + "is rel failure on " << cur_case.test; + // Now resolve it. + if (succeed_is_rel && is_relative && cur_case.is_rel) { + std::string resolved; + url_canon::StdStringCanonOutput output(&resolved); + url_parse::Parsed resolved_parsed; + + bool succeed_resolve = url_canon::ResolveRelativeURL( + cur_case.base, parsed, cur_case.is_base_file, + cur_case.test, relative_component, NULL, &output, &resolved_parsed); + output.Complete(); + + EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve); + EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test; + + // Verify that the output parsed structure is the same as parsing a + // the URL freshly. + url_parse::Parsed ref_parsed; + int resolved_len = static_cast<int>(resolved.size()); + if (cur_case.is_base_file) + url_parse::ParseFileURL(resolved.c_str(), resolved_len, &ref_parsed); + else if (cur_case.is_base_hier) + url_parse::ParseStandardURL(resolved.c_str(), resolved_len, &ref_parsed); + else + url_parse::ParsePathURL(resolved.c_str(), resolved_len, &ref_parsed); + EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed)); + } + } +} + +// It used to be when we did a replacement with a long buffer of UTF-16 +// characters, we would get invalid data in the URL. This is because the buffer +// it used to hold the UTF-8 data was resized, while some pointers were still +// kept to the old buffer that was removed. 
+TEST(URLCanonTest, ReplacementOverflow) { + const char src[] = "file:///C:/foo/bar"; + int src_len = static_cast<int>(strlen(src)); + url_parse::Parsed parsed; + url_parse::ParseFileURL(src, src_len, &parsed); + + // Override two components, the path with something short, and the query with + // sonething long enough to trigger the bug. + url_canon::Replacements<char16> repl; + string16 new_query; + for (int i = 0; i < 4800; i++) + new_query.push_back('a'); + + string16 new_path(WStringToUTF16(L"/foo")); + repl.SetPath(new_path.c_str(), url_parse::Component(0, 4)); + repl.SetQuery(new_query.c_str(), + url_parse::Component(0, static_cast<int>(new_query.length()))); + + // Call ReplaceComponents on the string. It doesn't matter if we call it for + // standard URLs, file URLs, etc, since they will go to the same replacement + // function that was buggy. + url_parse::Parsed repl_parsed; + std::string repl_str; + url_canon::StdStringCanonOutput repl_output(&repl_str); + url_canon::ReplaceFileURL(src, parsed, repl, NULL, &repl_output, &repl_parsed); + repl_output.Complete(); + + // Generate the expected string and check. + std::string expected("file:///foo?"); + for (size_t i = 0; i < new_query.length(); i++) + expected.push_back('a'); + EXPECT_TRUE(expected == repl_str); +} diff --git a/googleurl/src/url_file.h b/googleurl/src/url_file.h new file mode 100644 index 0000000..c1b8ac9 --- /dev/null +++ b/googleurl/src/url_file.h @@ -0,0 +1,108 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Provides shared functions used by the internals of the parser and +// canonicalizer for file URLs. Do not use outside of these modules. + +#ifndef GOOGLEURL_SRC_URL_FILE_H__ +#define GOOGLEURL_SRC_URL_FILE_H__ + +#include "googleurl/src/url_parse_internal.h" + +namespace url_parse { + +#ifdef WIN32 + +// We allow both "c:" and "c|" as drive identifiers. +inline bool IsWindowsDriveSeparator(char16 ch) { + return ch == ':' || ch == '|'; +} +inline bool IsWindowsDriveLetter(char16 ch) { + return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +#endif // WIN32 + +// Returns the index of the next slash in the input after the given index, or +// spec_len if the end of the input is reached. 
+template<typename CHAR> +inline int FindNextSlash(const CHAR* spec, int begin_index, int spec_len) { + int idx = begin_index; + while (idx < spec_len && !IsURLSlash(spec[idx])) + idx++; + return idx; +} + +#ifdef WIN32 + +// Returns true if the start_offset in the given spec looks like it begins a +// drive spec, for example "c:". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// If this returns true, the spec is guaranteed to have a valid drive letter +// plus a colon starting at |start_offset|. +template<typename CHAR> +inline bool DoesBeginWindowsDriveSpec(const CHAR* spec, int start_offset, + int spec_len) { + int remaining_len = spec_len - start_offset; + if (remaining_len < 2) + return false; // Not enough room. + if (!IsWindowsDriveLetter(spec[start_offset])) + return false; // Doesn't start with a valid drive letter. + if (!IsWindowsDriveSeparator(spec[start_offset + 1])) + return false; // Isn't followed with a drive separator. + return true; +} + +// Returns true if the start_offset in the given text looks like it begins a +// UNC path, for example "\\". This function explicitly handles start_offset +// values that are equal to or larger than the spec_len to simplify callers. +// +// When strict_slashes is set, this function will only accept backslashes as is +// standard for Windows. Otherwise, it will accept forward slashes as well +// which we use for a lot of URL handling. 
+template<typename CHAR> +inline bool DoesBeginUNCPath(const CHAR* text, + int start_offset, + int len, + bool strict_slashes) { + int remaining_len = len - start_offset; + if (remaining_len < 2) + return false; + + if (strict_slashes) + return text[start_offset] == '\\' && text[start_offset + 1] == '\\'; + return IsURLSlash(text[start_offset]) && IsURLSlash(text[start_offset + 1]); +} + +#endif // WIN32 + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_FILE_H__ diff --git a/googleurl/src/url_parse.cc b/googleurl/src/url_parse.cc new file mode 100644 index 0000000..7c37f13 --- /dev/null +++ b/googleurl/src/url_parse.cc @@ -0,0 +1,757 @@ +/* Based on nsURLParsers.cc from Mozilla + * ------------------------------------- + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Darin Fisher (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "googleurl/src/url_parse.h" + +#include <stdlib.h> + +#include "base/logging.h" +#include "googleurl/src/url_parse_internal.h" + +namespace url_parse { + +namespace { + +// Returns true if the given character is a valid digit to use in a port. +inline bool IsPortDigit(char16 ch) { + return ch >= '0' && ch <= '9'; +} + +// Returns the offset of the next authority terminator in the input starting +// from start_offset. If no terminator is found, the return value will be equal +// to spec_len. +template<typename CHAR> +int FindNextAuthorityTerminator(const CHAR* spec, + int start_offset, + int spec_len) { + for (int i = start_offset; i < spec_len; i++) { + if (IsAuthorityTerminator(spec[i])) + return i; + } + return spec_len; // Not found. +} + +// Fills in all members of the Parsed structure except for the scheme. +// +// |spec| is the full spec being parsed, of length |spec_len|. +// |after_scheme| is the character immediately following the scheme (after the +// colon) where we'll begin parsing. +// +// Compatability data points. 
I list "host", "path" extracted: +// Input IE6 Firefox Us +// ----- -------------- -------------- -------------- +// http://foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// http:/foo.com/ fail(*) "foo.com", "/" "foo.com", "/" +// http:\foo.com/ fail(*) "\foo.com", "/"(fail) "foo.com", "/" +// http:////foo.com/ "foo.com", "/" "foo.com", "/" "foo.com", "/" +// +// (*) Interestingly, although IE fails to load these URLs, its history +// canonicalizer handles them, meaning if you've been to the corresponding +// "http://foo.com/" link, it will be colored. +template <typename CHAR> +void DoParseAfterScheme(const CHAR* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + int after_slashes = after_scheme + num_slashes; + + // First split into two main parts, the authority (username, password, host, + // and port) and the full path (path, query, and reference). + Component authority; + Component full_path; + + // Found "//<some data>", looks like an authority section. Treat everything + // from there to the next slash (or end of spec) to be the authority. Note + // that we ignore the number of slashes and treat it as the authority. + int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); + authority = Component(after_slashes, end_auth - after_slashes); + + if (end_auth == spec_len) // No beginning of path found. + full_path = Component(); + else // Everything starting from the slash to the end is the path. + full_path = Component(end_auth, spec_len - end_auth); + + // Now parse those two sub-parts. 
+ DoParseAuthority(spec, authority, &parsed->username, &parsed->password, + &parsed->host, &parsed->port); + ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); +} + +template<typename CHAR> +void ParseUserInfo(const CHAR* spec, + const Component& user, + Component* username, + Component* password) { + // Find the first colon in the user section, which separates the username and + // password. + int colon_offset = 0; + while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') + colon_offset++; + + if (colon_offset < user.len) { + // Found separator: <username>:<password> + *username = Component(user.begin, colon_offset); + *password = MakeRange(user.begin + colon_offset + 1, + user.begin + user.len); + } else { + // No separator, treat everything as the username + *username = user; + *password = Component(); + } +} + +template<typename CHAR> +void ParseServerInfo(const CHAR* spec, + const Component& serverinfo, + Component* hostname, + Component* port_num) { + if (serverinfo.len == 0) { + // No server info, host name is empty. + hostname->reset(); + port_num->reset(); + return; + } + + // If the host starts with a left-bracket, assume the entire host is an + // IPv6 literal. Otherwise, assume none of the host is an IPv6 literal. + // This assumption will be overridden if we find a right-bracket. + // + // Our IPv6 address canonicalization code requires both brackets to exist, + // but the ability to locate an incomplete address can still be useful. + int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; + int colon = -1; + + // Find the last right-bracket, and the last colon. 
+ for (int i = serverinfo.begin; i < serverinfo.end(); i++) { + switch (spec[i]) { + case ']': + ipv6_terminator = i; + break; + case ':': + colon = i; + break; + } + } + + if (colon > ipv6_terminator) { + // Found a port number: <hostname>:<port> + *hostname = MakeRange(serverinfo.begin, colon); + if (hostname->len == 0) + hostname->reset(); + *port_num = MakeRange(colon + 1, serverinfo.end()); + } else { + // No port: <hostname> + *hostname = serverinfo; + port_num->reset(); + } +} + +// Given an already-identified auth section, breaks it into its consituent +// parts. The port number will be parsed and the resulting integer will be +// filled into the given *port variable, or -1 if there is no port number or it +// is invalid. +template<typename CHAR> +void DoParseAuthority(const CHAR* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DCHECK(auth.is_valid()) << "We should always get an authority"; + if (auth.len == 0) { + username->reset(); + password->reset(); + hostname->reset(); + port_num->reset(); + return; + } + + // Search backwards for @, which is the separator between the user info and + // the server info. + int i = auth.begin + auth.len - 1; + while (i > auth.begin && spec[i] != '@') + i--; + + if (spec[i] == '@') { + // Found user info: <user-info>@<server-info> + ParseUserInfo(spec, Component(auth.begin, i - auth.begin), + username, password); + ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), + hostname, port_num); + } else { + // No user info, everything is server info. + username->reset(); + password->reset(); + ParseServerInfo(spec, auth, hostname, port_num); + } +} + +template<typename CHAR> +void ParsePath(const CHAR* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> + + // Special case when there is no path. 
+ if (path.len == -1) { + filepath->reset(); + query->reset(); + ref->reset(); + return; + } + DCHECK(path.len > 0) << "We should never have 0 length paths"; + + // Search for first occurrence of either ? or #. + int path_end = path.begin + path.len; + + int query_separator = -1; // Index of the '?' + int ref_separator = -1; // Index of the '#' + for (int i = path.begin; i < path_end; i++) { + switch (spec[i]) { + case '?': + // Only match the query string if it precedes the reference fragment + // and when we haven't found one already. + if (ref_separator < 0 && query_separator < 0) + query_separator = i; + break; + case '#': + // Record the first # sign only. + if (ref_separator < 0) + ref_separator = i; + break; + } + } + + // Markers pointing to the character after each of these corresponding + // components. The code below words from the end back to the beginning, + // and will update these indices as it finds components that exist. + int file_end, query_end; + + // Ref fragment: from the # to the end of the path. + if (ref_separator >= 0) { + file_end = query_end = ref_separator; + *ref = MakeRange(ref_separator + 1, path_end); + } else { + file_end = query_end = path_end; + ref->reset(); + } + + // Query fragment: everything from the ? to the next boundary (either the end + // of the path or the ref fragment). + if (query_separator >= 0) { + file_end = query_separator; + *query = MakeRange(query_separator + 1, query_end); + } else { + query->reset(); + } + + // File path: treat an empty file path as no file path. + if (file_end != path.begin) + *filepath = MakeRange(path.begin, file_end); + else + filepath->reset(); +} + +template<typename CHAR> +bool DoExtractScheme(const CHAR* url, + int url_len, + Component* scheme) { + // Skip leading whitespace and control characters. + int begin = 0; + while (begin < url_len && ShouldTrimFromURL(url[begin])) + begin++; + if (begin == url_len) + return false; // Input is empty or all whitespace. 
+ + // Find the first colon character. + for (int i = begin; i < url_len; i++) { + if (url[i] == ':') { + *scheme = MakeRange(begin, i); + return true; + } + } + return false; // No colon found: no scheme +} + +// The main parsing function for standard URLs. Standard URLs have a scheme, +// host, path, etc. +template<typename CHAR> +void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + int after_scheme; + if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { + after_scheme = parsed->scheme.end() + 1; // Skip past the colon. + } else { + // Say there's no scheme when there is a colon. We could also say that + // everything is the scheme. Both would produce an invalid URL, but this way + // seems less wrong in more cases. + parsed->scheme.reset(); + after_scheme = begin; + } + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +// Initializes a path URL which is merely a scheme followed by a path. Examples +// include "about:foo" and "javascript:alert('bar');" +template<typename CHAR> +void DoParsePathURL(const CHAR* spec, int spec_len, Parsed* parsed) { + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. 
+ if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + // For compatability with the standard URL parser, we treat no path as + // -1, rather than having a length of 0 (we normally wouldn't care so + // much for these non-standard URLs). + if (parsed->scheme.end() == spec_len - 1) + parsed->path.reset(); + else + parsed->path = MakeRange(parsed->scheme.end() + 1, spec_len); + } else { + // No scheme found, just path. + parsed->scheme.reset(); + parsed->path = MakeRange(begin, spec_len); + } +} + +template<typename CHAR> +void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the non-path and non-scheme parts of the URL out of the way, we never + // use them. + parsed->username.reset(); + parsed->password.reset(); + parsed->host.reset(); + parsed->port.reset(); + parsed->ref.reset(); + parsed->query.reset(); // May use this; reset for convenience. + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Handle empty specs or ones that contain only whitespace or control chars. + if (begin == spec_len) { + parsed->scheme.reset(); + parsed->path.reset(); + return; + } + + int path_begin = -1; + int path_end = -1; + + // Extract the scheme, with the path being everything following. We also + // handle the case where there is no scheme. + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + + if (parsed->scheme.end() != spec_len - 1) { + path_begin = parsed->scheme.end() + 1; + path_end = spec_len; + } + } else { + // No scheme found, just path. + parsed->scheme.reset(); + path_begin = begin; + path_end = spec_len; + } + + // Split [path_begin, path_end) into a path + query. 
+ for (int i = path_begin; i < path_end; ++i) { + if (spec[i] == '?') { + parsed->query = MakeRange(i + 1, path_end); + path_end = i; + break; + } + } + + // For compatability with the standard URL parser, treat no path as + // -1, rather than having a length of 0 + if (path_begin == path_end) { + parsed->path.reset(); + } else { + parsed->path = MakeRange(path_begin, path_end); + } +} + +// Converts a port number in a string to an integer. We'd like to just call +// sscanf but our input is not NULL-terminated, which sscanf requires. Instead, +// we copy the digits to a small stack buffer (since we know the maximum number +// of digits in a valid port number) that we can NULL terminate. +template<typename CHAR> +int DoParsePort(const CHAR* spec, const Component& component) { + // Easy success case when there is no port. + const int kMaxDigits = 5; + if (!component.is_nonempty()) + return PORT_UNSPECIFIED; + + // Skip over any leading 0s. + Component digits_comp(component.end(), 0); + for (int i = 0; i < component.len; i++) { + if (spec[component.begin + i] != '0') { + digits_comp = MakeRange(component.begin + i, component.end()); + break; + } + } + if (digits_comp.len == 0) + return 0; // All digits were 0. + + // Verify we don't have too many digits (we'll be copying to our buffer so + // we need to double-check). + if (digits_comp.len > kMaxDigits) + return PORT_INVALID; + + // Copy valid digits to the buffer. + char digits[kMaxDigits + 1]; // +1 for null terminator + for (int i = 0; i < digits_comp.len; i++) { + CHAR ch = spec[digits_comp.begin + i]; + if (!IsPortDigit(ch)) { + // Invalid port digit, fail. + return PORT_INVALID; + } + digits[i] = static_cast<char>(ch); + } + + // Null-terminate the string and convert to integer. Since we guarantee + // only digits, atoi's lack of error handling is OK. + digits[digits_comp.len] = 0; + int port = atoi(digits); + if (port > 65535) + return PORT_INVALID; // Out of range. 
+ return port; +} + +template<typename CHAR> +void DoExtractFileName(const CHAR* spec, + const Component& path, + Component* file_name) { + // Handle empty paths: they have no file names. + if (!path.is_nonempty()) { + file_name->reset(); + return; + } + + // Search backwards for a parameter, which is a normally unused field in a + // URL delimited by a semicolon. We parse the parameter as part of the + // path, but here, we don't want to count it. The last semicolon is the + // parameter. The path should start with a slash, so we don't need to check + // the first one. + int file_end = path.end(); + for (int i = path.end() - 1; i > path.begin; i--) { + if (spec[i] == ';') { + file_end = i; + break; + } + } + + // Now search backwards from the filename end to the previous slash + // to find the beginning of the filename. + for (int i = file_end - 1; i >= path.begin; i--) { + if (IsURLSlash(spec[i])) { + // File name is everything following this character to the end + *file_name = MakeRange(i + 1, file_end); + return; + } + } + + // No slash found, this means the input was degenerate (generally paths + // will start with a slash). Let's call everything the file name. + *file_name = MakeRange(path.begin, file_end); + return; +} + +template<typename CHAR> +bool DoExtractQueryKeyValue(const CHAR* spec, + Component* query, + Component* key, + Component* value) { + if (!query->is_nonempty()) + return false; + + int start = query->begin; + int cur = start; + int end = query->end(); + + // We assume the beginning of the input is the beginning of the "key" and we + // skip to the end of it. + key->begin = cur; + while (cur < end && spec[cur] != '&' && spec[cur] != '=') + cur++; + key->len = cur - key->begin; + + // Skip the separator after the key (if any). + if (cur < end && spec[cur] == '=') + cur++; + + // Find the value part. 
+ value->begin = cur; + while (cur < end && spec[cur] != '&') + cur++; + value->len = cur - value->begin; + + // Finally skip the next separator if any + if (cur < end && spec[cur] == '&') + cur++; + + // Save the new query + *query = url_parse::MakeRange(cur, end); + return true; +} + +} // namespace + +int Parsed::Length() const { + if (ref.is_valid()) + return ref.end(); + return CountCharactersBefore(REF, false); +} + +int Parsed::CountCharactersBefore(ComponentType type, + bool include_delimiter) const { + if (type == SCHEME) + return scheme.begin; + + // There will be some characters after the scheme like "://" and we don't + // know how many. Search forwards for the next thing until we find one. + int cur = 0; + if (scheme.is_valid()) + cur = scheme.end() + 1; // Advance over the ':' at the end of the scheme. + + if (username.is_valid()) { + if (type <= USERNAME) + return username.begin; + cur = username.end() + 1; // Advance over the '@' or ':' at the end. + } + + if (password.is_valid()) { + if (type <= PASSWORD) + return password.begin; + cur = password.end() + 1; // Advance over the '@' at the end. + } + + if (host.is_valid()) { + if (type <= HOST) + return host.begin; + cur = host.end(); + } + + if (port.is_valid()) { + if (type < PORT || (type == PORT && include_delimiter)) + return port.begin - 1; // Back over delimiter. + if (type == PORT) + return port.begin; // Don't want delimiter counted. + cur = port.end(); + } + + if (path.is_valid()) { + if (type <= PATH) + return path.begin; + cur = path.end(); + } + + if (query.is_valid()) { + if (type < QUERY || (type == QUERY && include_delimiter)) + return query.begin - 1; // Back over delimiter. + if (type == QUERY) + return query.begin; // Don't want delimiter counted. + cur = query.end(); + } + + if (ref.is_valid()) { + if (type == REF && !include_delimiter) + return ref.begin; // Back over delimiter. 
+ + // When there is a ref and we get here, the component we wanted was before + // this and not found, so we always know the beginning of the ref is right. + return ref.begin - 1; // Don't want delimiter counted. + } + + return cur; +} + +bool ExtractScheme(const char* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +bool ExtractScheme(const char16* url, int url_len, Component* scheme) { + return DoExtractScheme(url, url_len, scheme); +} + +// This handles everything that may be an authority terminator, including +// backslash. For special backslash handling see DoParseAfterScheme. +bool IsAuthorityTerminator(char16 ch) { + return IsURLSlash(ch) || ch == '?' || ch == '#' || ch == ';'; +} + +void ExtractFileName(const char* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +void ExtractFileName(const char16* url, + const Component& path, + Component* file_name) { + DoExtractFileName(url, path, file_name); +} + +bool ExtractQueryKeyValue(const char* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +bool ExtractQueryKeyValue(const char16* url, + Component* query, + Component* key, + Component* value) { + return DoExtractQueryKeyValue(url, query, key, value); +} + +void ParseAuthority(const char* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +void ParseAuthority(char16* spec, + const Component& auth, + Component* username, + Component* password, + Component* hostname, + Component* port_num) { + DoParseAuthority(spec, auth, username, password, hostname, port_num); +} + +int ParsePort(const char* url, const Component& port) { + return DoParsePort(url, port); +} + +int ParsePort(const char16* url, const Component& port) { + return 
DoParsePort(url, port); +} + +void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParseStandardURL(const char16* url, int url_len, Parsed* parsed) { + DoParseStandardURL(url, url_len, parsed); +} + +void ParsePathURL(const char* url, int url_len, Parsed* parsed) { + DoParsePathURL(url, url_len, parsed); +} + +void ParsePathURL(const char16* url, int url_len, Parsed* parsed) { + DoParsePathURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed) { + DoParseMailtoURL(url, url_len, parsed); +} + +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParsePathInternal(const char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref) { + ParsePath(spec, path, filepath, query, ref); +} + +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +void ParseAfterScheme(const char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed) { + DoParseAfterScheme(spec, spec_len, after_scheme, parsed); +} + +} // namespace url_parse diff --git a/googleurl/src/url_parse.h b/googleurl/src/url_parse.h new file mode 100644 index 0000000..bea2766 --- /dev/null +++ b/googleurl/src/url_parse.h @@ -0,0 +1,334 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_PARSE_H__ +#define GOOGLEURL_SRC_URL_PARSE_H__ + +#include <string> + +#include "base/basictypes.h" +#include "base/string16.h" + +namespace url_parse { + +// Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and +// KURLGoogle.cpp still rely on this type. +typedef char16 UTF16Char; + +// Component ------------------------------------------------------------------ + +// Represents a substring for URL parsing. +struct Component { + Component() : begin(0), len(-1) {} + + // Normal constructor: takes an offset and a length. + Component(int b, int l) : begin(b), len(l) {} + + int end() const { + return begin + len; + } + + // Returns true if this component is valid, meaning the length is given. 
Even + // valid components may be empty to record the fact that they exist. + bool is_valid() const { + return (len != -1); + } + + // Returns true if the given component is specified on false, the component + // is either empty or invalid. + bool is_nonempty() const { + return (len > 0); + } + + void reset() { + begin = 0; + len = -1; + } + + bool operator==(const Component& other) const { + return begin == other.begin && len == other.len; + } + + int begin; // Byte offset in the string of this component. + int len; // Will be -1 if the component is unspecified. +}; + +// Helper that returns a component created with the given begin and ending +// points. The ending point is non-inclusive. +inline Component MakeRange(int begin, int end) { + return Component(begin, end - begin); +} + +// Parsed --------------------------------------------------------------------- + +// A structure that holds the identified parts of an input URL. This structure +// does NOT store the URL itself. The caller will have to store the URL text +// and its corresponding Parsed structure separately. +// +// Typical usage would be: +// +// url_parse::Parsed parsed; +// url_parse::Component scheme; +// if (!url_parse::ExtractScheme(url, url_len, &scheme)) +// return I_CAN_NOT_FIND_THE_SCHEME_DUDE; +// +// if (IsStandardScheme(url, scheme)) // Not provided by this component +// url_parseParseStandardURL(url, url_len, &parsed); +// else if (IsFileURL(url, scheme)) // Not provided by this component +// url_parse::ParseFileURL(url, url_len, &parsed); +// else +// url_parse::ParsePathURL(url, url_len, &parsed); +// +struct Parsed { + // Identifies different components. + enum ComponentType { + SCHEME, + USERNAME, + PASSWORD, + HOST, + PORT, + PATH, + QUERY, + REF, + }; + + // The default constructor is sufficient for the components. + Parsed() {} + + // Returns the length of the URL (the end of the last component). 
+ // + // Note that for some invalid, non-canonical URLs, this may not be the length + // of the string. For example "http://": the parsed structure will only + // contain an entry for the four-character scheme, and it doesn't know about + // the "://". For all other last-components, it will return the real length. + int Length() const; + + // Returns the number of characters before the given component if it exists, + // or where the component would be if it did exist. This will return the + // string length if the component would be appended to the end. + // + // Note that this can get a little funny for the port, query, and ref + // components which have a delimiter that is not counted as part of the + // component. The |include_delimiter| flag controls if you want this counted + // as part of the component or not when the component exists. + // + // This example shows the difference between the two flags for two of these + // delimited components that is present (the port and query) and one that + // isn't (the reference). The components that this flag affects are marked + // with a *. + // 0 1 2 + // 012345678901234567890 + // Example input: http://foo:80/?query + // include_delim=true, ...=false ("<-" indicates different) + // SCHEME: 0 0 + // USERNAME: 5 5 + // PASSWORD: 5 5 + // HOST: 7 7 + // *PORT: 10 11 <- + // PATH: 13 13 + // *QUERY: 14 15 <- + // *REF: 20 20 + // + int CountCharactersBefore(ComponentType type, bool include_delimiter) const; + + // Scheme without the colon: "http://foo"/ would have a scheme of "http". + // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there + // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed + // to start at the beginning of the string if there are preceeding whitespace + // or control characters. + Component scheme; + + // Username. Specified in URLs with an @ sign before the host. See |password| + Component username; + + // Password. 
The length will be -1 if unspecified, 0 if specified but empty. + // Not all URLs with a username have a password, as in "http://me@host/". + // The password is separated form the username with a colon, as in + // "http://me:secret@host/" + Component password; + + // Host name. + Component host; + + // Port number. + Component port; + + // Path, this is everything following the host name. Length will be -1 if + // unspecified. This includes the preceeding slash, so the path on + // http://www.google.com/asdf" is "/asdf". As a result, it is impossible to + // have a 0 length path, it will be -1 in cases like "http://host?foo". + // Note that we treat backslashes the same as slashes. + Component path; + + // Stuff between the ? and the # after the path. This does not include the + // preceeding ? character. Length will be -1 if unspecified, 0 if there is + // a question mark but no query string. + Component query; + + // Indicated by a #, this is everything following the hash sign (not + // including it). If there are multiple hash signs, we'll use the last one. + // Length will be -1 if there is no hash sign, or 0 if there is one but + // nothing follows it. + Component ref; +}; + +// Initialization functions --------------------------------------------------- +// +// These functions parse the given URL, filling in all of the structure's +// components. These functions can not fail, they will always do their best +// at interpreting the input given. +// +// The string length of the URL MUST be specified, we do not check for NULLs +// at any point in the process, and will actually handle embedded NULLs. +// +// IMPORTANT: These functions do NOT hang on to the given pointer or copy it +// in any way. See the comment above the struct. +// +// The 8-bit versions require UTF-8 encoding. + +// StandardURL is for when the scheme is known to be one that has an +// authority (host) like "http". 
// This function will not handle weird ones
// like "about:" and "javascript:", or do the right thing for "file:" URLs.
void ParseStandardURL(const char* url, int url_len, Parsed* parsed);
void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);

// PathURL is for schemes that are known not to have an authority (host)
// section and that aren't file URLs either. The scheme is parsed, and
// everything after the scheme is considered as the path. This is used for
// things like "about:" and "javascript:"
void ParsePathURL(const char* url, int url_len, Parsed* parsed);
void ParsePathURL(const char16* url, int url_len, Parsed* parsed);

// FileURL is for file URLs. There are some special rules for interpreting
// these.
void ParseFileURL(const char* url, int url_len, Parsed* parsed);
void ParseFileURL(const char16* url, int url_len, Parsed* parsed);

// MailtoURL is for mailto: URLs. They are made up of a scheme, a path and an
// optional query.
void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);
void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);

// Helper functions -----------------------------------------------------------

// Locates the scheme according to the URL parser's rules. This function is
// designed so the caller can find the scheme and call the correct Parse*
// function according to their known scheme types.
//
// It also does not perform any validation on the scheme.
//
// This function will return true if the scheme is found and will put the
// scheme's range into *scheme. False means no scheme could be found. Note
// that a URL beginning with a colon has a scheme, but it is empty, so this
// function will return true but *scheme will = (0,0).
//
// The scheme is found by skipping spaces and control characters at the
// beginning, and taking everything from there to the first colon to be the
// scheme.
// The character at scheme.end() will be the colon (we may enhance
// this to handle full width colons or something, so don't count on the
// actual character value). The character at scheme.end()+1 will be the
// beginning of the rest of the URL, be it the authority or the path (or the
// end of the string).
//
// The 8-bit version requires UTF-8 encoding.
bool ExtractScheme(const char* url, int url_len, Component* scheme);
bool ExtractScheme(const char16* url, int url_len, Component* scheme);

// Returns true if ch is a character that terminates the authority segment
// of a URL.
bool IsAuthorityTerminator(char16 ch);

// Does a best effort parse of input |spec|, in range |auth|. If a particular
// component is not found, it will be set to invalid.
//
// NOTE(review): the 16-bit overload takes a non-const char16*, unlike every
// other 16-bit function declared in this header. Making it const requires
// changing the definition in url_parse.cc at the same time.
void ParseAuthority(const char* spec,
                    const Component& auth,
                    Component* username,
                    Component* password,
                    Component* hostname,
                    Component* port_num);
void ParseAuthority(char16* spec,
                    const Component& auth,
                    Component* username,
                    Component* password,
                    Component* hostname,
                    Component* port_num);

// Computes the integer port value from the given port component. The port
// component should have been identified by one of the parse functions that
// fill in |Parsed| for the given input url.
//
// The return value will be an integer in the range [0, 65535], or one of
// the two special values below: PORT_UNSPECIFIED when the component is empty
// or invalid, PORT_INVALID when it contains a non-digit or is out of range.
enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };
int ParsePort(const char* url, const Component& port);
int ParsePort(const char16* url, const Component& port);

// Extracts the range of the file name in the given url. The path must
// already have been computed by the parse function, and the matching URL
// and extracted path are provided to this function. The filename is
// defined as being everything from the last slash/backslash of the path
// to the end of the path, not counting a trailing ';'-delimited parameter.
//
// The file name will be invalid if the path is empty, and zero-length if
// there is nothing following the last slash.
//
// The 8-bit version requires UTF-8 encoding.
void ExtractFileName(const char* url,
                     const Component& path,
                     Component* file_name);
void ExtractFileName(const char16* url,
                     const Component& path,
                     Component* file_name);

// Extract the first key/value from the range defined by |*query|. Updates
// |*query| to start at the end of the extracted key/value pair. This is
// designed for use in a loop: you can keep calling it with the same query
// object and it will iterate over all items in the query.
//
// Some key/value pairs may have the key, the value, or both be empty (for
// example, the query string "?&"). These will be returned. Note that an empty
// last parameter ("foo.com?" or "foo.com?a&") will not be returned, this case
// is the same as "done."
//
// The initial query component should not include the '?' (this is the default
// for parsed URLs).
//
// If no key/value are found |*key| and |*value| will be unchanged and it will
// return false.
bool ExtractQueryKeyValue(const char* url,
                          Component* query,
                          Component* key,
                          Component* value);
bool ExtractQueryKeyValue(const char16* url,
                          Component* query,
                          Component* key,
                          Component* value);

}  // namespace url_parse

#endif  // GOOGLEURL_SRC_URL_PARSE_H__
diff --git a/googleurl/src/url_parse_file.cc b/googleurl/src/url_parse_file.cc new file mode 100644 index 0000000..2e8429f --- /dev/null +++ b/googleurl/src/url_parse_file.cc @@ -0,0 +1,243 @@
// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/logging.h" +#include "googleurl/src/url_file.h" +#include "googleurl/src/url_parse.h" +#include "googleurl/src/url_parse_internal.h" + +// Interesting IE file:isms... +// +// INPUT OUTPUT +// ========================= ============================== +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// When on a file: URL source page, this link will work. 
When over HTTP, +// the file: URL will appear in the status bar but the link will not work +// (security restriction for all file URLs). +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) +// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid, seems to be a file) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" doesn't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace url_parse { + +namespace { + +// A subcomponent of DoParseFileURL, the input of this function should be a UNC +// path name, with the index of the first character after the slashes following +// the scheme given in |after_slashes|. This will initialize the host, path, +// query, and ref, and leave the other output components untouched +// (DoParseFileURL handles these for us). +template<typename CHAR> +void DoParseUNC(const CHAR* spec, + int after_slashes, + int spec_len, + Parsed* parsed) { + int next_slash = FindNextSlash(spec, after_slashes, spec_len); + if (next_slash == spec_len) { + // No additional slash found, as in "file://foo", treat the text as the + // host with no path (this will end up being UNC to server "foo"). + int host_len = spec_len - after_slashes; + if (host_len) + parsed->host = Component(after_slashes, host_len); + else + parsed->host.reset(); + parsed->path.reset(); + return; + } + +#ifdef WIN32 + // See if we have something that looks like a path following the first + // component. As in "file://localhost/c:/", we get "c:/" out. We want to + // treat this as having no host but the path given. Works on Windows only. 
+ if (DoesBeginWindowsDriveSpec(spec, next_slash + 1, spec_len)) { + parsed->host.reset(); + ParsePathInternal(spec, MakeRange(next_slash, spec_len), + &parsed->path, &parsed->query, &parsed->ref); + return; + } +#endif + + // Otherwise, everything up until that first slash we found is the host name, + // which will end up being the UNC host. For example "file://foo/bar.txt" + // will get a server name of "foo" and a path of "/bar.txt". Later, on Windows, + // this should be treated as the filename "\\foo\bar.txt" in proper UNC + // notation. + int host_len = next_slash - after_slashes; + if (host_len) + parsed->host = MakeRange(after_slashes, next_slash); + else + parsed->host.reset(); + if (next_slash < spec_len) { + ParsePathInternal(spec, MakeRange(next_slash, spec_len), + &parsed->path, &parsed->query, &parsed->ref); + } else { + parsed->path.reset(); + } +} + +// A subcomponent of DoParseFileURL, the input should be a local file, with the +// beginning of the path indicated by the index in |path_begin|. This will +// initialize the host, path, query, and ref, and leave the other output +// components untouched (DoParseFileURL handles these for us). +template<typename CHAR> +void DoParseLocalFile(const CHAR* spec, + int path_begin, + int spec_len, + Parsed* parsed) { + parsed->host.reset(); + ParsePathInternal(spec, MakeRange(path_begin, spec_len), + &parsed->path, &parsed->query, &parsed->ref); +} + +// Backend for the external functions that operates on either char type. +// We are handed the character after the "file:" at the beginning of the spec. +// Usually this is a slash, but needn't be; we allow paths like "file:c:\foo". +template<typename CHAR> +void DoParseFileURL(const CHAR* spec, int spec_len, Parsed* parsed) { + DCHECK(spec_len >= 0); + + // Get the parts we never use for file URLs out of the way. 
+ parsed->username.reset(); + parsed->password.reset(); + parsed->port.reset(); + + // Many of the code paths don't set these, so it's convenient to just clear + // them. We'll write them in those cases we need them. + parsed->query.reset(); + parsed->ref.reset(); + + // Strip leading & trailing spaces and control characters. + int begin = 0; + TrimURL(spec, &begin, &spec_len); + + // Find the scheme. + int num_slashes; + int after_scheme; + int after_slashes; +#ifdef WIN32 + // See how many slashes there are. We want to handle cases like UNC but also + // "/c:/foo". This is when there is no scheme, so we can allow pages to do + // links like "c:/foo/bar" or "//foo/bar". This is also called by the + // relative URL resolver when it determines there is an absolute URL, which + // may give us input like "/c:/foo". + num_slashes = CountConsecutiveSlashes(spec, begin, spec_len); + after_slashes = begin + num_slashes; + if (DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len)) { + // Windows path, don't try to extract the scheme (for example, "c:\foo"). + parsed->scheme.reset(); + after_scheme = after_slashes; + } else if (DoesBeginUNCPath(spec, begin, spec_len, false)) { + // Windows UNC path: don't try to extract the scheme, but keep the slashes. + parsed->scheme.reset(); + after_scheme = begin; + } else +#endif + { + if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { + // Offset the results since we gave ExtractScheme a substring. + parsed->scheme.begin += begin; + after_scheme = parsed->scheme.end() + 1; + } else { + // No scheme found, remember that. + parsed->scheme.reset(); + after_scheme = begin; + } + } + + // Handle empty specs, ones that contain only whitespace or control chars, + // or that are just the scheme (for example "file:"). 
+ if (after_scheme == spec_len) { + parsed->host.reset(); + parsed->path.reset(); + return; + } + + num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); + + after_slashes = after_scheme + num_slashes; +#ifdef WIN32 + // Check whether the input is a drive again. We checked above for windows + // drive specs, but that's only at the very beginning to see if we have a + // scheme at all. This test will be duplicated in that case, but will + // additionally handle all cases with a real scheme such as "file:///C:/". + if (!DoesBeginWindowsDriveSpec(spec, after_slashes, spec_len) && + num_slashes != 3) { + // Anything not beginning with a drive spec ("c:\") on Windows is treated + // as UNC, with the exception of three slashes which always means a file. + // Even IE7 treats file:///foo/bar as "/foo/bar", which then fails. + DoParseUNC(spec, after_slashes, spec_len, parsed); + return; + } +#else + // file: URL with exactly 2 slashes is considered to have a host component. + if (num_slashes == 2) { + DoParseUNC(spec, after_slashes, spec_len, parsed); + return; + } +#endif // WIN32 + + // Easy and common case, the full path immediately follows the scheme + // (modulo slashes), as in "file://c:/foo". Just treat everything from + // there to the end as the path. Empty hosts have 0 length instead of -1. + // We include the last slash as part of the path if there is one. + DoParseLocalFile(spec, + num_slashes > 0 ? 
after_scheme + num_slashes - 1 : after_scheme, + spec_len, parsed); +} + +} // namespace + +void ParseFileURL(const char* url, int url_len, Parsed* parsed) { + DoParseFileURL(url, url_len, parsed); +} + +void ParseFileURL(const char16* url, int url_len, Parsed* parsed) { + DoParseFileURL(url, url_len, parsed); +} + +} // namespace url_parse diff --git a/googleurl/src/url_parse_internal.h b/googleurl/src/url_parse_internal.h new file mode 100644 index 0000000..61bd068 --- /dev/null +++ b/googleurl/src/url_parse_internal.h @@ -0,0 +1,112 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Contains common inline helper functions used by the URL parsing routines. + +#ifndef GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ +#define GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ + +#include "googleurl/src/url_parse.h" + +namespace url_parse { + +// We treat slashes and backslashes the same for IE compatibility. +inline bool IsURLSlash(char16 ch) { + return ch == '/' || ch == '\\'; +} + +// Returns true if we should trim this character from the URL because it is a +// space or a control character. +inline bool ShouldTrimFromURL(char16 ch) { + return ch <= ' '; +} + +// Given an already-initialized begin index and length, this shrinks the range +// to eliminate "should-be-trimmed" characters. Note that the length does *not* +// indicate the length of untrimmed data from |*begin|, but rather the position +// in the input string (so the string starts at character |*begin| in the spec, +// and goes until |*len|). +template<typename CHAR> +inline void TrimURL(const CHAR* spec, int* begin, int* len) { + // Strip leading whitespace and control characters. + while (*begin < *len && ShouldTrimFromURL(spec[*begin])) + (*begin)++; + + // Strip trailing whitespace and control characters. The *len > *begin test + // guards the all-blanks case; we don't want to back up past the beginning. 
+ while (*len > *begin && ShouldTrimFromURL(spec[*len - 1])) + (*len)--; +} + +// Counts the number of consecutive slashes starting at the given offset +// in the given string of the given length. +template<typename CHAR> +inline int CountConsecutiveSlashes(const CHAR *str, + int begin_offset, int str_len) { + int count = 0; + while (begin_offset + count < str_len && + IsURLSlash(str[begin_offset + count])) + ++count; + return count; +} + +// Internal functions in url_parse.cc that parse the path, that is, everything +// following the authority section. The input is the range of everything +// following the authority section, and the output is the identified ranges. +// +// This is designed for the file URL parser or other consumers who may do +// special stuff at the beginning, but want regular path parsing, it just +// maps to the internal parsing function for paths. +void ParsePathInternal(const char* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); +void ParsePathInternal(const char16* spec, + const Component& path, + Component* filepath, + Component* query, + Component* ref); + + +// Given a spec and a pointer to the character after the colon following the +// scheme, this parses it and fills in the structure, Every item in the parsed +// structure is filled EXCEPT for the scheme, which is untouched. +void ParseAfterScheme(const char* spec, + int spec_len, + int after_scheme, + Parsed* parsed); +void ParseAfterScheme(const char16* spec, + int spec_len, + int after_scheme, + Parsed* parsed); + +} // namespace url_parse + +#endif // GOOGLEURL_SRC_URL_PARSE_INTERNAL_H__ diff --git a/googleurl/src/url_parse_unittest.cc b/googleurl/src/url_parse_unittest.cc new file mode 100644 index 0000000..219d5a0 --- /dev/null +++ b/googleurl/src/url_parse_unittest.cc @@ -0,0 +1,583 @@ +// Copyright 2007, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "base/basictypes.h" +#include "googleurl/src/url_parse.h" +#include "testing/gtest/include/gtest/gtest.h" + +// Some implementations of base/basictypes.h may define ARRAYSIZE. +// If it's not defined, we define it to the ARRAYSIZE_UNSAFE macro +// which is in our version of basictypes.h. +#ifndef ARRAYSIZE +#define ARRAYSIZE ARRAYSIZE_UNSAFE +#endif + +// Interesting IE file:isms... +// +// file:/foo/bar file:///foo/bar +// The result here seems totally invalid!?!? 
This isn't UNC. +// +// file:/ +// file:// or any other number of slashes +// IE6 doesn't do anything at all if you click on this link. No error: +// nothing. IE6's history system seems to always color this link, so I'm +// guessing that it maps internally to the empty URL. +// +// C:\ file:///C:/ +// / file:///C:/ +// /foo file:///C:/foo +// Interestingly, IE treats "/" as an alias for "c:\", which makes sense, +// but is weird to think about on Windows. +// +// file:foo/ file:foo/ (invalid?!?!?) +// file:/foo/ file:///foo/ (invalid?!?!?) +// file://foo/ file://foo/ (UNC to server "foo") +// file:///foo/ file:///foo/ (invalid) +// file:////foo/ file://foo/ (UNC to server "foo") +// Any more than four slashes is also treated as UNC. +// +// file:C:/ file://C:/ +// file:/C:/ file://C:/ +// The number of slashes after "file:" doesn't matter if the thing following +// it looks like an absolute drive path. Also, slashes and backslashes are +// equally valid here. + +namespace { + +// Used for regular URL parse cases. +struct URLParseCase { + const char* input; + + const char* scheme; + const char* username; + const char* password; + const char* host; + int port; + const char* path; + const char* query; + const char* ref; +}; + +// Simpler version of URLParseCase for testing path URLs. +struct PathURLParseCase { + const char* input; + + const char* scheme; + const char* path; +}; + +// Simpler version of URLParseCase for testing mailto URLs. +struct MailtoURLParseCase { + const char* input; + + const char* scheme; + const char* path; + const char* query; +}; + + +bool ComponentMatches(const char* input, + const char* reference, + const url_parse::Component& component) { + // The length should be nonnegative, or -1 if the component is nonexistent. + EXPECT_TRUE(component.len >= 0 || component.len == -1); + + // Begin should be valid. + EXPECT_LE(0, component.begin); + + // A NULL reference means the component should be nonexistent. 
+ if (!reference) + return component.len == -1; + if (component.len < 0) + return false; // Reference is not NULL but we don't have anything + + if (strlen(reference) != static_cast<size_t>(component.len)) + return false; // Lengths don't match + + // Now check the actual characters. + return strncmp(reference, &input[component.begin], component.len) == 0; +} + +void ExpectInvalidComponent(const url_parse::Component& component) { + EXPECT_EQ(0, component.begin); + EXPECT_EQ(-1, component.len); +} + +} // namespace + +// Parsed ---------------------------------------------------------------------- + +TEST(URLParser, Length) { + const char* length_cases[] = { + // One with everything in it. + "http://user:pass@host:99/foo?bar#baz", + // One with nothing in it. + "", + // Working backwards, let's start taking off stuff from the full one. + "http://user:pass@host:99/foo?bar#", + "http://user:pass@host:99/foo?bar", + "http://user:pass@host:99/foo?", + "http://user:pass@host:99/foo", + "http://user:pass@host:99/", + "http://user:pass@host:99", + "http://user:pass@host:", + "http://user:pass@host", + "http://host", + "http://user@", + "http:", + }; + for (size_t i = 0; i < arraysize(length_cases); i++) { + int true_length = static_cast<int>(strlen(length_cases[i])); + + url_parse::Parsed parsed; + url_parse::ParseStandardURL(length_cases[i], true_length, &parsed); + + EXPECT_EQ(true_length, parsed.Length()); + } +} + +TEST(URLParser, CountCharactersBefore) { + using namespace url_parse; + struct CountCase { + const char* url; + Parsed::ComponentType component; + bool include_delimiter; + int expected_count; + } count_cases[] = { + // Test each possibility in the case where all components are present. 
+// 0 1 2 +// 0123456789012345678901 + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, true, 0}, + {"http://u:p@h:8/p?q#r", Parsed::SCHEME, false, 0}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, true, 7}, + {"http://u:p@h:8/p?q#r", Parsed::USERNAME, false, 7}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, true, 9}, + {"http://u:p@h:8/p?q#r", Parsed::PASSWORD, false, 9}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, true, 11}, + {"http://u:p@h:8/p?q#r", Parsed::HOST, false, 11}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, true, 12}, + {"http://u:p@h:8/p?q#r", Parsed::PORT, false, 13}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, false, 14}, + {"http://u:p@h:8/p?q#r", Parsed::PATH, true, 14}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8/p?q#r", Parsed::QUERY, false, 17}, + {"http://u:p@h:8/p?q#r", Parsed::REF, true, 18}, + {"http://u:p@h:8/p?q#r", Parsed::REF, false, 19}, + // Now test when the requested component is missing. + {"http://u:p@h:8/p?", Parsed::REF, true, 17}, + {"http://u:p@h:8/p?q", Parsed::REF, true, 18}, + {"http://u:p@h:8/p#r", Parsed::QUERY, true, 16}, + {"http://u:p@h:8#r", Parsed::PATH, true, 14}, + {"http://u:p@h/", Parsed::PORT, true, 12}, + {"http://u:p@/", Parsed::HOST, true, 11}, + // This case is a little weird. It will report that the password would + // start where the host begins. This is arguably correct, although you + // could also argue that it should start at the '@' sign. Doing it + // starting with the '@' sign is actually harder, so we don't bother. + {"http://u@h/", Parsed::PASSWORD, true, 9}, + {"http://h/", Parsed::USERNAME, true, 7}, + {"http:", Parsed::USERNAME, true, 5}, + {"", Parsed::SCHEME, true, 0}, + // Make sure a random component still works when there's nothing there. + {"", Parsed::REF, true, 0}, + // File URLs are special with no host, so we test those. 
+ {"file:///c:/foo", Parsed::USERNAME, true, 7}, + {"file:///c:/foo", Parsed::PASSWORD, true, 7}, + {"file:///c:/foo", Parsed::HOST, true, 7}, + {"file:///c:/foo", Parsed::PATH, true, 7}, + }; + for (size_t i = 0; i < ARRAYSIZE(count_cases); i++) { + int length = static_cast<int>(strlen(count_cases[i].url)); + + // Simple test to distinguish file and standard URLs. + url_parse::Parsed parsed; + if (length > 0 && count_cases[i].url[0] == 'f') + url_parse::ParseFileURL(count_cases[i].url, length, &parsed); + else + url_parse::ParseStandardURL(count_cases[i].url, length, &parsed); + + int chars_before = parsed.CountCharactersBefore( + count_cases[i].component, count_cases[i].include_delimiter); + EXPECT_EQ(count_cases[i].expected_count, chars_before); + } +} + +// Standard -------------------------------------------------------------------- + +// Input Scheme Usrname Passwd Host Port Path Query Ref +// ------------------------------------ ------- ------- ---------- ------------ --- ---------- ------------ ----- +static URLParseCase cases[] = { + // Regular URL with all the parts +{"http://user:pass@foo:21/bar;par?b#c", "http", "user", "pass", "foo", 21, "/bar;par","b", "c"}, + + // Known schemes should lean towards authority identification +{"http:foo.com", "http", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, + + // Spaces! +{"\t :foo.com \n", "", NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{" foo.com ", NULL, NULL, NULL, "foo.com", -1, NULL, NULL, NULL}, +{"a:\t foo.com", "a", NULL, NULL, "\t foo.com", -1, NULL, NULL, NULL}, +{"http://f:21/ b ? d # e ", "http", NULL, NULL, "f", 21, "/ b ", " d ", " e"}, + + // Invalid port numbers should be identified and turned into -2, empty port + // numbers should be -1. 
Spaces aren't allowed in port numbers +{"http://f:/c", "http", NULL, NULL, "f", -1, "/c", NULL, NULL}, +{"http://f:0/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000/c", "http", NULL, NULL, "f", 0, "/c", NULL, NULL}, +{"http://f:00000000000000000000080/c", "http", NULL, NULL, "f", 80, "/c", NULL, NULL}, +{"http://f:b/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: /c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:\n/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:fifty-two/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f:999999/c", "http", NULL, NULL, "f", -2, "/c", NULL, NULL}, +{"http://f: 21 / b ? d # e ", "http", NULL, NULL, "f", -2, "/ b ", " d ", " e"}, + + // Creative URLs missing key elements +{"", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{" \t", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":foo.com/", "", NULL, NULL, "foo.com", -1, "/", NULL, NULL}, +{":foo.com\\", "", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{":", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":a", "", NULL, NULL, "a", -1, NULL, NULL, NULL}, +{":/", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":\\", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":#", "", NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#", NULL, NULL, NULL, NULL, -1, NULL, NULL, ""}, +{"#/", NULL, NULL, NULL, NULL, -1, NULL, NULL, "/"}, +{"#\\", NULL, NULL, NULL, NULL, -1, NULL, NULL, "\\"}, +{"#;?", NULL, NULL, NULL, NULL, -1, NULL, NULL, ";?"}, +{"?", NULL, NULL, NULL, NULL, -1, NULL, "", NULL}, +{"/", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{":23", "", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"/:23", "/", NULL, NULL, "23", -1, NULL, NULL, NULL}, +{"//", NULL, NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::", "", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"::23", "", NULL, NULL, NULL, 23, NULL, NULL, NULL}, +{"foo://", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + + // Username/passwords and things 
that look like them +{"http://a:b@c:29/d", "http", "a", "b", "c", 29, "/d", NULL, NULL}, +{"http::@c:29", "http", "", "", "c", 29, NULL, NULL, NULL}, + // ... "]" in the password field isn't allowed, but we tolerate it here... +{"http://&a:foo(b]c@d:2/", "http", "&a", "foo(b]c", "d", 2, "/", NULL, NULL}, +{"http://::@c@d:2", "http", "", ":@c", "d", 2, NULL, NULL, NULL}, +{"http://foo.com:b@d/", "http", "foo.com", "b", "d", -1, "/", NULL, NULL}, + +{"http://foo.com/\\@", "http", NULL, NULL, "foo.com", -1, "/\\@", NULL, NULL}, +{"http:\\\\foo.com\\", "http", NULL, NULL, "foo.com", -1, "\\", NULL, NULL}, +{"http:\\\\a\\b:c\\d@foo.com\\", "http", NULL, NULL, "a", -1, "\\b:c\\d@foo.com\\", NULL, NULL}, + + // Tolerate different numbers of slashes. +{"foo:/", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo:/bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo://///////", "foo", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"foo://///////bar.com/", "foo", NULL, NULL, "bar.com", -1, "/", NULL, NULL}, +{"foo:////://///", "foo", NULL, NULL, NULL, -1, "/////", NULL, NULL}, + + // Raw file paths on Windows aren't handled by the parser. +{"c:/foo", "c", NULL, NULL, "foo", -1, NULL, NULL, NULL}, +{"//foo/bar", NULL, NULL, NULL, "foo", -1, "/bar", NULL, NULL}, + + // Use the first question mark for the query and the ref. +{"http://foo/path;a??e#f#g", "http", NULL, NULL, "foo", -1, "/path;a", "?e", "f#g"}, +{"http://foo/abcd?efgh?ijkl", "http", NULL, NULL, "foo", -1, "/abcd", "efgh?ijkl", NULL}, +{"http://foo/abcd#foo?bar", "http", NULL, NULL, "foo", -1, "/abcd", NULL, "foo?bar"}, + + // IPv6, check also interesting uses of colons. 
+{"[61:24:74]:98", "[61", NULL, NULL, "24:74]", 98, NULL, NULL, NULL}, +{"http://[61:27]:98", "http", NULL, NULL, "[61:27]", 98, NULL, NULL, NULL}, +{"http:[61:27]/:foo", "http", NULL, NULL, "[61:27]", -1, "/:foo", NULL, NULL}, +{"http://[1::2]:3:4", "http", NULL, NULL, "[1::2]:3", 4, NULL, NULL, NULL}, + + // Partially-complete IPv6 literals, and related cases. +{"http://2001::1", "http", NULL, NULL, "2001:", 1, NULL, NULL, NULL}, +{"http://[2001::1", "http", NULL, NULL, "[2001::1", -1, NULL, NULL, NULL}, +{"http://2001::1]", "http", NULL, NULL, "2001::1]", -1, NULL, NULL, NULL}, +{"http://2001::1]:80", "http", NULL, NULL, "2001::1]", 80, NULL, NULL, NULL}, +{"http://[2001::1]", "http", NULL, NULL, "[2001::1]", -1, NULL, NULL, NULL}, +{"http://[2001::1]:80", "http", NULL, NULL, "[2001::1]", 80, NULL, NULL, NULL}, +{"http://[[::]]", "http", NULL, NULL, "[[::]]", -1, NULL, NULL, NULL}, + +}; + +TEST(URLParser, Standard) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. + url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(cases); i++) { + const char* url = cases[i].input; + url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + int port = url_parse::ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, cases[i].username, parsed.username)); + EXPECT_TRUE(ComponentMatches(url, cases[i].password, parsed.password)); + EXPECT_TRUE(ComponentMatches(url, cases[i].host, parsed.host)); + EXPECT_EQ(cases[i].port, port); + EXPECT_TRUE(ComponentMatches(url, cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, cases[i].query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, cases[i].ref, parsed.ref)); + } +} + +// PathURL -------------------------------------------------------------------- + +// Various incarnations of path URLs. 
+static PathURLParseCase path_cases[] = { +{"", NULL, NULL}, +{":", "", NULL}, +{":/", "", "/"}, +{"/", NULL, "/"}, +{" This is \\interesting// \t", NULL, "This is \\interesting//"}, +{"about:", "about", NULL}, +{"about:blank", "about", "blank"}, +{" about: blank ", "about", " blank"}, +{"javascript :alert(\"He:/l\\l#o?foo\"); ", "javascript ", "alert(\"He:/l\\l#o?foo\");"}, +}; + +TEST(URLParser, PathURL) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the construtor. + url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(path_cases); i++) { + const char* url = path_cases[i].input; + url_parse::ParsePathURL(url, static_cast<int>(strlen(url)), &parsed); + + EXPECT_TRUE(ComponentMatches(url, path_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.path)); + + // The remaining components are never used for path urls. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.host); + ExpectInvalidComponent(parsed.port); + ExpectInvalidComponent(parsed.query); + ExpectInvalidComponent(parsed.ref); + } +} + +#ifdef WIN32 + +// WindowsFile ---------------------------------------------------------------- + +// Various incarnations of file URLs. These are for Windows only. 
+static URLParseCase file_cases[] = { +{"file:server", "file", NULL, NULL, "server", -1, NULL, NULL, NULL}, +{" file: server \t", "file", NULL, NULL, " server",-1, NULL, NULL, NULL}, +{"FiLe:c|", "FiLe", NULL, NULL, NULL, -1, "c|", NULL, NULL}, +{"FILE:/\\\\/server/file", "FILE", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/", "file", NULL, NULL, "server", -1, "/", NULL, NULL}, +{"file://localhost/c:/", "file", NULL, NULL, NULL, -1, "/c:/", NULL, NULL}, +{"file://127.0.0.1/c|\\", "file", NULL, NULL, NULL, -1, "/c|\\", NULL, NULL}, +{"file:/", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, +{"file:", "file", NULL, NULL, NULL, -1, NULL, NULL, NULL}, + // If there is a Windows drive letter, treat any number of slashes as the + // path part. +{"file:c:\\fo\\b", "file", NULL, NULL, NULL, -1, "c:\\fo\\b", NULL, NULL}, +{"file:/c:\\foo/bar", "file", NULL, NULL, NULL, -1, "/c:\\foo/bar",NULL, NULL}, +{"file://c:/f\\b", "file", NULL, NULL, NULL, -1, "/c:/f\\b", NULL, NULL}, +{"file:///C:/foo", "file", NULL, NULL, NULL, -1, "/C:/foo", NULL, NULL}, +{"file://///\\/\\/c:\\f\\b", "file", NULL, NULL, NULL, -1, "/c:\\f\\b", NULL, NULL}, + // If there is not a drive letter, we should treat is as UNC EXCEPT for + // three slashes, which we treat as a Unix style path. +{"file:server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:/server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file://server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, +{"file:///server/file", "file", NULL, NULL, NULL, -1, "/server/file",NULL, NULL}, +{"file://\\server/file", "file", NULL, NULL, NULL, -1, "\\server/file",NULL, NULL}, +{"file:////server/file", "file", NULL, NULL, "server", -1, "/file", NULL, NULL}, + // Queries and refs are valid for file URLs as well. 
+{"file:///C:/foo.html?#", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "", ""}, +{"file:///C:/foo.html?query=yes#ref", "file", NULL, NULL, NULL, -1, "/C:/foo.html", "query=yes", "ref"}, +}; + +TEST(URLParser, WindowsFile) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor. + url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(file_cases); i++) { + const char* url = file_cases[i].input; + url_parse::ParseFileURL(url, static_cast<int>(strlen(url)), &parsed); + int port = url_parse::ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].username, parsed.username)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].password, parsed.password)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].host, parsed.host)); + EXPECT_EQ(file_cases[i].port, port); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].query, parsed.query)); + EXPECT_TRUE(ComponentMatches(url, file_cases[i].ref, parsed.ref)); + } +} + +#endif // WIN32 + +TEST(URLParser, ExtractFileName) { + struct FileCase { + const char* input; + const char* expected; + } file_cases[] = { + {"http://www.google.com", NULL}, + {"http://www.google.com/", ""}, + {"http://www.google.com/search", "search"}, + {"http://www.google.com/search/", ""}, + {"http://www.google.com/foo/bar.html?baz=22", "bar.html"}, + {"http://www.google.com/foo/bar.html#ref", "bar.html"}, + {"http://www.google.com/search/;param", ""}, + {"http://www.google.com/foo/bar.html;param#ref", "bar.html"}, + {"http://www.google.com/foo/bar.html;foo;param#ref", "bar.html;foo"}, + {"http://www.google.com/foo/bar.html?query#ref", "bar.html"}, + }; + + for (size_t i = 0; i < ARRAYSIZE(file_cases); i++) { + const char* url = file_cases[i].input; + int len = static_cast<int>(strlen(url)); + + 
url_parse::Parsed parsed; + url_parse::ParseStandardURL(url, len, &parsed); + + url_parse::Component file_name; + url_parse::ExtractFileName(url, parsed.path, &file_name); + + EXPECT_TRUE(ComponentMatches(url, file_cases[i].expected, file_name)); + } +} + +// Returns true if the parameter with index |parameter| in the given URL's +// query string. The expected key can be NULL to indicate no such key index +// should exist. The parameter number is 1-based. +static bool NthParameterIs(const char* url, + int parameter, + const char* expected_key, + const char* expected_value) { + url_parse::Parsed parsed; + url_parse::ParseStandardURL(url, static_cast<int>(strlen(url)), &parsed); + + url_parse::Component query = parsed.query; + + for (int i = 1; i <= parameter; i++) { + url_parse::Component key, value; + if (!url_parse::ExtractQueryKeyValue(url, &query, &key, &value)) { + if (parameter >= i && !expected_key) + return true; // Expected nonexistant key, got one. + return false; // Not enough keys. + } + + if (i == parameter) { + if (!expected_key) + return false; + + if (strncmp(&url[key.begin], expected_key, key.len) != 0) + return false; + if (strncmp(&url[value.begin], expected_value, value.len) != 0) + return false; + return true; + } + } + return expected_key == NULL; // We didn't find that many parameters. +} + +TEST(URLParser, ExtractQueryKeyValue) { + EXPECT_TRUE(NthParameterIs("http://www.google.com", 1, NULL, NULL)); + + // Basic case. + char a[] = "http://www.google.com?arg1=1&arg2=2&bar"; + EXPECT_TRUE(NthParameterIs(a, 1, "arg1", "1")); + EXPECT_TRUE(NthParameterIs(a, 2, "arg2", "2")); + EXPECT_TRUE(NthParameterIs(a, 3, "bar", "")); + EXPECT_TRUE(NthParameterIs(a, 4, NULL, NULL)); + + // Empty param at the end. + char b[] = "http://www.google.com?foo=bar&"; + EXPECT_TRUE(NthParameterIs(b, 1, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(b, 2, NULL, NULL)); + + // Empty param at the beginning. 
+ char c[] = "http://www.google.com?&foo=bar"; + EXPECT_TRUE(NthParameterIs(c, 1, "", "")); + EXPECT_TRUE(NthParameterIs(c, 2, "foo", "bar")); + EXPECT_TRUE(NthParameterIs(c, 3, NULL, NULL)); + + // Empty key with value. + char d[] = "http://www.google.com?=foo"; + EXPECT_TRUE(NthParameterIs(d, 1, "", "foo")); + EXPECT_TRUE(NthParameterIs(d, 2, NULL, NULL)); + + // Empty value with key. + char e[] = "http://www.google.com?foo="; + EXPECT_TRUE(NthParameterIs(e, 1, "foo", "")); + EXPECT_TRUE(NthParameterIs(e, 2, NULL, NULL)); + + // Empty key and values. + char f[] = "http://www.google.com?&&==&="; + EXPECT_TRUE(NthParameterIs(f, 1, "", "")); + EXPECT_TRUE(NthParameterIs(f, 2, "", "")); + EXPECT_TRUE(NthParameterIs(f, 3, "", "=")); + EXPECT_TRUE(NthParameterIs(f, 4, "", "")); + EXPECT_TRUE(NthParameterIs(f, 5, NULL, NULL)); +} + +// MailtoURL -------------------------------------------------------------------- + +static MailtoURLParseCase mailto_cases[] = { +//|input |scheme |path |query +{"mailto:foo@gmail.com", "mailto", "foo@gmail.com", NULL}, +{" mailto: to \t", "mailto", " to", NULL}, +{"mailto:addr1%2C%20addr2 ", "mailto", "addr1%2C%20addr2", NULL}, +{"Mailto:addr1, addr2 ", "Mailto", "addr1, addr2", NULL}, +{"mailto:addr1:addr2 ", "mailto", "addr1:addr2", NULL}, +{"mailto:?to=addr1,addr2", "mailto", NULL, "to=addr1,addr2"}, +{"mailto:?to=addr1%2C%20addr2", "mailto", NULL, "to=addr1%2C%20addr2"}, +{"mailto:addr1?to=addr2", "mailto", "addr1", "to=addr2"}, +{"mailto:?body=#foobar#", "mailto", NULL, "body=#foobar#",}, +{"mailto:#?body=#foobar#", "mailto", "#", "body=#foobar#"}, +}; + +TEST(URLParser, MailtoUrl) { + // Declared outside for loop to try to catch cases in init() where we forget + // to reset something that is reset by the constructor.
+ url_parse::Parsed parsed; + for (size_t i = 0; i < arraysize(mailto_cases); ++i) { + const char* url = mailto_cases[i].input; + url_parse::ParseMailtoURL(url, static_cast<int>(strlen(url)), &parsed); + int port = url_parse::ParsePort(url, parsed.port); + + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].scheme, parsed.scheme)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].path, parsed.path)); + EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query)); + EXPECT_EQ(url_parse::PORT_UNSPECIFIED, port); + + // The remaining components are never used for mailto urls. + ExpectInvalidComponent(parsed.username); + ExpectInvalidComponent(parsed.password); + ExpectInvalidComponent(parsed.port); + ExpectInvalidComponent(parsed.ref); + } +} diff --git a/googleurl/src/url_test_utils.h b/googleurl/src/url_test_utils.h new file mode 100644 index 0000000..5294202 --- /dev/null +++ b/googleurl/src/url_test_utils.h @@ -0,0 +1,85 @@ +// Copyright 2007 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Convenience functions for string conversions. +// These are mostly intended for use in unit tests. + +#ifndef GOOGLEURL_SRC_URL_TEST_UTILS_H__ +#define GOOGLEURL_SRC_URL_TEST_UTILS_H__ + +#include <string> + +#include "base/string16.h" +#include "googleurl/src/url_canon_internal.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace url_test_utils { + +// Converts a UTF-16 string from native wchar_t format to char16, by +// truncating each character to the width of char16. This is not meant to +// handle true UTF-32 encoded strings.
+inline string16 WStringToUTF16(const wchar_t* src) { + string16 str; + int length = static_cast<int>(wcslen(src)); + for (int i = 0; i < length; ++i) { + str.push_back(static_cast<char16>(src[i])); + } + return str; +} + +// Converts a string from UTF-8 to UTF-16. Records a gtest expectation failure +// if the input is 1024 bytes or longer, or if the conversion reports an error; +// whatever was written to the output buffer is returned either way. +inline string16 ConvertUTF8ToUTF16(const std::string& src) { + int length = static_cast<int>(src.length()); + EXPECT_LT(length, 1024); + url_canon::RawCanonOutputW<1024> output; + EXPECT_TRUE(url_canon::ConvertUTF8ToUTF16(src.data(), length, &output)); + return string16(output.data(), output.length()); +} + +// Converts a string from UTF-16 to UTF-8. Records a gtest expectation failure +// if the conversion reports an error; the bytes produced are returned either +// way. +inline std::string ConvertUTF16ToUTF8(const string16& src) { + std::string str; + url_canon::StdStringCanonOutput output(&str); + EXPECT_TRUE(url_canon::ConvertUTF16ToUTF8(src.data(), + static_cast<int>(src.length()), + &output)); + output.Complete(); + return str; +} + +} // namespace url_test_utils + +// This operator allows EXPECT_EQ(astring16, anotherstring16); to work. +inline std::ostream& operator<<(std::ostream& os, + const string16& str) { + // Convert to UTF-8 and print the string + return os << url_test_utils::ConvertUTF16ToUTF8(str); +} + +#endif // GOOGLEURL_SRC_URL_TEST_UTILS_H__ diff --git a/googleurl/src/url_util.cc b/googleurl/src/url_util.cc new file mode 100644 index 0000000..d623b45 --- /dev/null +++ b/googleurl/src/url_util.cc @@ -0,0 +1,453 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution.
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <string.h> +#include <vector> + +#include "googleurl/src/url_util.h" + +#include "base/logging.h" +#include "googleurl/src/url_file.h" + +namespace url_util { + +namespace { + +// ASCII-specific tolower. The standard library's tolower is locale sensitive, +// so we don't want to use it here. +template <class Char> inline Char ToLowerASCII(Char c) { + return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; +} + +// Backend for LowerCaseEqualsASCII. Returns true when the range +// [a_begin, a_end), lower-cased as ASCII, equals the NUL-terminated string |b|. +// Only the |a| side is lower-cased; |b| is compared as given. +template<typename Iter> +inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) { + for (Iter it = a_begin; it != a_end; ++it, ++b) { + if (!*b || ToLowerASCII(*it) != *b) + return false; + } + return *b == 0; +} + +const char kFileScheme[] = "file"; // Used in a number of places. +const char kMailtoScheme[] = "mailto"; + +const int kNumStandardURLSchemes = 5; +const char* kStandardURLSchemes[kNumStandardURLSchemes] = { + "http", + "https", + kFileScheme, // Yes, file urls can have a hostname!
+ "ftp", + "gopher", +}; + +// List of the currently installed standard schemes. This list is lazily +// initialized by InitStandardSchemes and is leaked on shutdown to prevent +// any destructors from being called that will slow us down or cause problems. +std::vector<const char*>* standard_schemes = NULL; + +// Ensures that the standard_schemes list is initialized, does nothing if it +// already has values. +// NOTE(review): the NULL check and allocation below are not synchronized; +// presumably the first call happens before any other thread uses this list - +// confirm. +void InitStandardSchemes() { + if (standard_schemes) + return; + standard_schemes = new std::vector<const char*>; + for (int i = 0; i < kNumStandardURLSchemes; i++) + standard_schemes->push_back(kStandardURLSchemes[i]); +} + +// Given a string and a range inside the string, compares it to the given +// lower-case |compare_to| buffer. +template<typename CHAR> +inline bool CompareSchemeComponent(const CHAR* spec, + const url_parse::Component& component, + const char* compare_to) { + if (!component.is_nonempty()) + return compare_to[0] == 0; // When component is empty, match empty scheme. + return LowerCaseEqualsASCII(&spec[component.begin], + &spec[component.end()], + compare_to); +} + +// Returns true if the given scheme identified by |scheme| within |spec| is one +// of the registered "standard" schemes. Note that this does not check for +// "://", use IsStandard for that. +template<typename CHAR> +bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) { + if (!scheme.is_nonempty()) + return false; // Empty or invalid schemes are non-standard. + + InitStandardSchemes(); + for (size_t i = 0; i < standard_schemes->size(); i++) { + if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()], + standard_schemes->at(i))) + return true; + } + return false; +} + +// Returns true if the stuff following the scheme in the given spec indicates +// a "standard" URL. The presence of "://" after the scheme indicates that +// there is a hostname, etc. which we call a standard URL.
+template<typename CHAR> +bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len, + const url_parse::Component& scheme) { + int after_scheme = scheme.end(); + // A negative offset cannot point inside |spec|; bail out rather than read + // before the start of the buffer. + if (after_scheme < 0 || spec_len < after_scheme + 3) + return false; + return spec[after_scheme] == ':' && + spec[after_scheme + 1] == '/' && + spec[after_scheme + 2] == '/'; +} + +// Backend for IsStandard: a URL is "standard" when "://" follows the scheme or +// when the scheme is one of the registered standard schemes. +template<typename CHAR> +bool DoIsStandard(const CHAR* spec, int spec_len, + const url_parse::Component& scheme) { + return HasStandardSchemeSeparator(spec, spec_len, scheme) || + IsStandardScheme(spec, scheme); +} + +// Backend for FindAndCompareScheme. |found_scheme| may be NULL; when no scheme +// can be extracted it is reset to a default-constructed Component. +template<typename CHAR> +bool DoFindAndCompareScheme(const CHAR* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme) { + url_parse::Component our_scheme; + if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) { + // No scheme. + if (found_scheme) + *found_scheme = url_parse::Component(); + return false; + } + if (found_scheme) + *found_scheme = our_scheme; + return CompareSchemeComponent(str, our_scheme, compare); +} + +template<typename CHAR> +bool DoCanonicalize(const CHAR* in_spec, int in_spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { + // Remove any whitespace from the middle of the URL, possibly + // copying to the new buffer. + url_canon::RawCanonOutputT<CHAR> whitespace_buffer; + int spec_len; + const CHAR* spec = RemoveURLWhitespace(in_spec, in_spec_len, + &whitespace_buffer, &spec_len); + + url_parse::Parsed parsed_input; +#ifdef WIN32 + // For Windows, we allow things that look like absolute Windows paths to be + // fixed up magically to file URLs. This is done for IE compatibility. For + // example, this will change "c:/foo" into a file URL rather than treating + // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
+ // There is similar logic in url_canon_relative.cc. + // + // For Mac & Unix, we don't do this (the equivalent would be "/foo/bar" which + // has no meaning as an absolute path name). This is because browsers on Mac + // & Unix don't generally do this, so there is no compatibility reason for + // doing so. + if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) || + url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) { + url_parse::ParseFileURL(spec, spec_len, &parsed_input); + return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input, + charset_converter, + output, output_parsed); + } +#endif + + url_parse::Component scheme; + if(!url_parse::ExtractScheme(spec, spec_len, &scheme)) + return false; + + // This is the parsed version of the input URL, we have to canonicalize it + // before storing it in our object. + bool success; + if (CompareSchemeComponent(spec, scheme, kFileScheme)) { + // File URLs are special. + url_parse::ParseFileURL(spec, spec_len, &parsed_input); + success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input, + charset_converter, + output, output_parsed); + + } else if (IsStandard(spec, spec_len, scheme)) { + // All "normal" URLs.
+ url_parse::ParseStandardURL(spec, spec_len, &parsed_input); + success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input, + charset_converter, + output, output_parsed); + + } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) { + // Mailto are treated like a standard url with only a scheme, path, query + url_parse::ParseMailtoURL(spec, spec_len, &parsed_input); + success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input, + output, output_parsed); + + } else { + // "Weird" URLs like data: and javascript: + url_parse::ParsePathURL(spec, spec_len, &parsed_input); + success = url_canon::CanonicalizePathURL(spec, spec_len, parsed_input, + output, output_parsed); + } + return success; +} + +template<typename CHAR> +bool DoResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const CHAR* in_relative, + int in_relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { + // Remove any whitespace from the middle of the relative URL, possibly + // copying to the new buffer. + url_canon::RawCanonOutputT<CHAR> whitespace_buffer; + int relative_length; + const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length, + &whitespace_buffer, + &relative_length); + + // See if our base URL should be treated as "standard". + bool standard_base_scheme = + base_parsed.scheme.is_nonempty() && + IsStandard(base_spec, base_spec_len, base_parsed.scheme); + + bool is_relative; + url_parse::Component relative_component; + if (!url_canon::IsRelativeURL(base_spec, base_parsed, + relative, relative_length, + standard_base_scheme, + &is_relative, + &relative_component)) { + // Error resolving. + return false; + } + + if (is_relative) { + // Relative, resolve and canonicalize.
+ bool file_base_scheme = base_parsed.scheme.is_nonempty() && + CompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme); + return url_canon::ResolveRelativeURL(base_spec, base_parsed, + file_base_scheme, relative, + relative_component, charset_converter, + output, output_parsed); + } + + // Not relative, canonicalize the input. + return DoCanonicalize(relative, relative_length, charset_converter, + output, output_parsed); +} + +template<typename CHAR> +bool DoReplaceComponents(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<CHAR>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed) { + // Note that we dispatch to the parser according to the scheme type of + // the OUTPUT URL. Normally, this is the same as our scheme, but if the + // scheme is being overridden, we need to test that. + + if (// Either the scheme is not replaced and the old one is a file, + (!replacements.IsSchemeOverridden() && + CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) || + // ...or it is being replaced and the new one is a file. + (replacements.IsSchemeOverridden() && + CompareSchemeComponent(replacements.sources().scheme, + replacements.components().scheme, + kFileScheme))) { + return url_canon::ReplaceFileURL(spec, parsed, replacements, + charset_converter, output, out_parsed); + } + + if (// Either the scheme is not replaced and the old one is standard, + (!replacements.IsSchemeOverridden() && + IsStandard(spec, spec_len, parsed.scheme)) || + // ...or it is being replaced and the new one is standard. + (replacements.IsSchemeOverridden() && + IsStandardScheme(replacements.sources().scheme, + replacements.components().scheme))) { + // Standard URL with all parts.
return url_canon::ReplaceStandardURL(spec, parsed, replacements, + charset_converter, output, out_parsed); + } + + if (// Either the scheme is not replaced and the old one is mailto, + (!replacements.IsSchemeOverridden() && + CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) || + // ...or it is being replaced and the new one is a mailto. + (replacements.IsSchemeOverridden() && + CompareSchemeComponent(replacements.sources().scheme, + replacements.components().scheme, + kMailtoScheme))) { + return url_canon::ReplaceMailtoURL(spec, parsed, replacements, + output, out_parsed); + } + + return url_canon::ReplacePathURL(spec, parsed, replacements, + output, out_parsed); +} + +} // namespace + +void AddStandardScheme(const char* new_scheme) { + size_t scheme_len = strlen(new_scheme); + if (scheme_len == 0) + return; + + // Duplicate the scheme into a new buffer and add it to the list of standard + // schemes. This pointer will be leaked on shutdown. + // Lookups lower-case the URL's scheme before comparing it with the stored + // string, so |new_scheme| should be given in lower case. + char* dup_scheme = new char[scheme_len + 1]; + memcpy(dup_scheme, new_scheme, scheme_len + 1); + + InitStandardSchemes(); + standard_schemes->push_back(dup_scheme); +} + +bool IsStandard(const char* spec, int spec_len, + const url_parse::Component& scheme) { + return DoIsStandard(spec, spec_len, scheme); +} + +bool IsStandard(const char16* spec, int spec_len, + const url_parse::Component& scheme) { + return DoIsStandard(spec, spec_len, scheme); +} + +bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme) { + return DoFindAndCompareScheme(str, str_len, compare, found_scheme); +} + +bool FindAndCompareScheme(const char16* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme) { + return DoFindAndCompareScheme(str, str_len, compare, found_scheme); +} + +bool Canonicalize(const char* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { +
return DoCanonicalize(spec, spec_len, charset_converter, + output, output_parsed); +} + +bool Canonicalize(const char16* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { + return DoCanonicalize(spec, spec_len, charset_converter, + output, output_parsed); +} + +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { + return DoResolveRelative(base_spec, base_spec_len, base_parsed, + relative, relative_length, + charset_converter, output, output_parsed); +} + +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char16* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed) { + return DoResolveRelative(base_spec, base_spec_len, base_parsed, + relative, relative_length, + charset_converter, output, output_parsed); +} + +bool ReplaceComponents(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed) { + return DoReplaceComponents(spec, spec_len, parsed, replacements, + charset_converter, output, out_parsed); +} + +bool ReplaceComponents(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char16>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed) { + return DoReplaceComponents(spec, spec_len, parsed, replacements, + charset_converter, output, out_parsed); +} + +// Front-ends for LowerCaseEqualsASCII.
+bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b_begin, + const char* b_end) { + // Walk both ranges in step; the first mismatching character ends the + // comparison. + for (; a_begin != a_end && b_begin != b_end; ++a_begin, ++b_begin) { + if (ToLowerASCII(*a_begin) != *b_begin) + return false; + } + // Equal only when both ranges were consumed completely. + return a_begin == a_end && b_begin == b_end; +} + +bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +} // namespace url_util diff --git a/googleurl/src/url_util.h b/googleurl/src/url_util.h new file mode 100644 index 0000000..62813a6 --- /dev/null +++ b/googleurl/src/url_util.h @@ -0,0 +1,170 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef GOOGLEURL_SRC_URL_UTIL_H__ +#define GOOGLEURL_SRC_URL_UTIL_H__ + +#include <string> + +#include "base/string16.h" +#include "googleurl/src/url_parse.h" +#include "googleurl/src/url_canon.h" + +namespace url_util { + +// Schemes -------------------------------------------------------------------- + +// Adds an application-defined scheme to the internal list of "standard" URL +// schemes. +void AddStandardScheme(const char* new_scheme); + +// Locates the scheme in the given string and places it into |found_scheme|, +// which may be NULL to indicate the caller does not care about the range. +// Returns whether the given |compare| scheme matches the scheme found in the +// input (if any). +// +// The scheme found in the input is lower-cased (ASCII only) before being +// compared, so |compare| should be given in lower case. When the input has no +// scheme at all, |found_scheme| is reset to a default-constructed Component +// and false is returned. +bool FindAndCompareScheme(const char* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +bool FindAndCompareScheme(const char16* str, + int str_len, + const char* compare, + url_parse::Component* found_scheme); +inline bool FindAndCompareScheme(const std::string& str, + const char* compare, + url_parse::Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast<int>(str.size()), + compare, found_scheme); +} +inline bool FindAndCompareScheme(const string16& str, + const char* compare, + url_parse::Component* found_scheme) { + return FindAndCompareScheme(str.data(), static_cast<int>(str.size()), + compare, found_scheme); +} + +// Returns true if the given string represents a standard URL.
This means that +// either the scheme is in the list of known standard schemes, or there is a +// "://" following the scheme. +bool IsStandard(const char* spec, int spec_len, + const url_parse::Component& scheme); +bool IsStandard(const char16* spec, int spec_len, + const url_parse::Component& scheme); + +// URL library wrappers ------------------------------------------------------- + +// Parses the given spec according to the extracted scheme type. Normal users +// should use the URL object, although this may be useful if performance is +// critical and you don't want to do the heap allocation for the std::string. +// +// As with the url_canon::Canonicalize* functions, the charset converter can +// be NULL to use UTF-8 (it will be faster in this case). +// +// Returns true if a valid URL was produced, false if not. On failure, the +// output and parsed structures will still be filled and will be consistent, +// but they will not represent a loadable URL. +bool Canonicalize(const char* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +bool Canonicalize(const char16* spec, + int spec_len, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); + +// Resolves a potentially relative URL relative to the given parsed base URL. +// The base MUST be valid. The resulting canonical URL and parsed information +// will be placed in to the given out variables. +// +// The relative need not be relative. If we discover that it's absolute, this +// will produce a canonical version of that URL. See Canonicalize() for more +// about the charset_converter. +// +// Returns true if the output is valid, false if the input could not produce +// a valid URL. 
+bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); +bool ResolveRelative(const char* base_spec, + int base_spec_len, + const url_parse::Parsed& base_parsed, + const char16* relative, + int relative_length, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* output_parsed); + +// Replaces components in the given VALID input url. The new canonical URL info +// is written to output and out_parsed. +// +// Returns true if the resulting URL is valid. +bool ReplaceComponents(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); +bool ReplaceComponents(const char* spec, + int spec_len, + const url_parse::Parsed& parsed, + const url_canon::Replacements<char16>& replacements, + url_canon::CharsetConverter* charset_converter, + url_canon::CanonOutput* output, + url_parse::Parsed* out_parsed); + +// String helper functions ---------------------------------------------------- + +// Compare the lower-case form of the given string against the given ASCII +// string. This is useful for doing checking if an input string matches some +// token, and it is optimized to avoid intermediate string copies. +// +// The versions of this function that don't take a b_end assume that the b +// string is NULL terminated. 
+bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b); +bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b_begin, + const char* b_end); +bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b); + +} // namespace url_util + +#endif // GOOGLEURL_SRC_URL_UTIL_H__ diff --git a/googleurl/src/url_util_unittest.cc b/googleurl/src/url_util_unittest.cc new file mode 100644 index 0000000..12e5254 --- /dev/null +++ b/googleurl/src/url_util_unittest.cc @@ -0,0 +1,98 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_canon_stdstring.h" +#include "googleurl/src/url_parse.h" +#include "googleurl/src/url_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +TEST(URLUtilTest, FindAndCompareScheme) { + url_parse::Component found_scheme; + + // Simple case where the scheme is found and matches. + const char kStr1[] = "http://www.com/"; + EXPECT_TRUE(url_util::FindAndCompareScheme( + kStr1, static_cast<int>(strlen(kStr1)), "http", NULL)); + EXPECT_TRUE(url_util::FindAndCompareScheme( + kStr1, static_cast<int>(strlen(kStr1)), "http", &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component(0, 4)); + + // A case where the scheme is found and doesn't match. + EXPECT_FALSE(url_util::FindAndCompareScheme( + kStr1, static_cast<int>(strlen(kStr1)), "https", &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component(0, 4)); + + // A case where there is no scheme. + const char kStr2[] = "httpfoobar"; + EXPECT_FALSE(url_util::FindAndCompareScheme( + kStr2, static_cast<int>(strlen(kStr2)), "http", &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component()); + + // When there is an empty scheme, it should match the empty scheme. 
+ const char kStr3[] = ":foo.com/"; + EXPECT_TRUE(url_util::FindAndCompareScheme( + kStr3, static_cast<int>(strlen(kStr3)), "", &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component(0, 0)); + + // But when there is no scheme, it should fail. + EXPECT_FALSE(url_util::FindAndCompareScheme("", 0, "", &found_scheme)); + EXPECT_TRUE(found_scheme == url_parse::Component()); +} + +TEST(URLUtilTest, ReplaceComponents) { + url_parse::Parsed parsed; + url_canon::RawCanonOutputT<char> output; + url_parse::Parsed new_parsed; + + // Check that the following calls do not crash on a NULL or empty spec. + url_canon::Replacements<char> replacements; + replacements.SetRef("test", url_parse::Component(0, 4)); + url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, + &new_parsed); + url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output, + &new_parsed); + replacements.ClearRef(); + replacements.SetHost("test", url_parse::Component(0, 4)); + url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, + &new_parsed); + url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output, + &new_parsed); + + replacements.ClearHost(); + url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, + &new_parsed); + url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output, + &new_parsed); + url_util::ReplaceComponents(NULL, 0, parsed, replacements, NULL, &output, + &new_parsed); + url_util::ReplaceComponents("", 0, parsed, replacements, NULL, &output, + &new_parsed); +} + diff --git a/googleurl/third_party/icu/build/using_icu.vsprops b/googleurl/third_party/icu/build/using_icu.vsprops new file mode 100644 index 0000000..a3989ef --- /dev/null +++ b/googleurl/third_party/icu/build/using_icu.vsprops @@ -0,0 +1,11 @@ +<?xml version="1.0" encoding="Windows-1252"?> +<VisualStudioPropertySheet + ProjectType="Visual C++" + Version="8.00" + Name="using_icu" + > + <Tool + Name="VCCLCompilerTool" + 
AdditionalIncludeDirectories="&quot;$(SolutionDir)..\..\third_party\icu\public\common&quot;;&quot;$(SolutionDir)..\..\third_party\icu\public\i18n&quot;" + /> +</VisualStudioPropertySheet> |