diff options
author | mbelshe@chromium.org <mbelshe@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-07 14:57:17 +0000 |
---|---|---|
committer | mbelshe@chromium.org <mbelshe@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-07 14:57:17 +0000 |
commit | 196f286c045d4bf22364f00801c5a9e1d59e8a40 (patch) | |
tree | 31e165e8898eac61b2426c86b2aa4bc25358f44e | |
parent | 01540765be8ae546c7f2e2105621bd431f8462b1 (diff) | |
download | chromium_src-196f286c045d4bf22364f00801c5a9e1d59e8a40.zip chromium_src-196f286c045d4bf22364f00801c5a9e1d59e8a40.tar.gz chromium_src-196f286c045d4bf22364f00801c5a9e1d59e8a40.tar.bz2 |
Import the server-side code for URL encoding & unittest.
BUG=none
TEST=UrlToFilenameEncoderTest
Review URL: http://codereview.chromium.org/2511001
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@49056 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | net/net.gyp | 5 | ||||
-rw-r--r-- | net/spdy/spdy_session.cc | 1 | ||||
-rw-r--r-- | net/tools/dump_cache/cache_dumper.cc | 5 | ||||
-rw-r--r-- | net/tools/dump_cache/url_to_filename_encoder.cc | 302 | ||||
-rw-r--r-- | net/tools/dump_cache/url_to_filename_encoder.h | 199 | ||||
-rw-r--r-- | net/tools/dump_cache/url_to_filename_encoder_unittest.cc | 268 | ||||
-rw-r--r-- | net/tools/dump_cache/url_utilities.h | 64 |
7 files changed, 786 insertions, 58 deletions
diff --git a/net/net.gyp b/net/net.gyp index 5645177..eb6686a 100644 --- a/net/net.gyp +++ b/net/net.gyp @@ -732,6 +732,9 @@ 'spdy/spdy_session_unittest.cc', 'spdy/spdy_stream_unittest.cc', 'spdy/spdy_test_util.h', + 'tools/dump_cache/url_to_filename_encoder.cc', + 'tools/dump_cache/url_to_filename_encoder.h', + 'tools/dump_cache/url_to_filename_encoder_unittest.cc', 'url_request/url_request_unittest.cc', 'url_request/url_request_unittest.h', 'url_request/view_cache_helper_unittest.cc', @@ -1041,7 +1044,9 @@ 'tools/dump_cache/dump_cache.cc', 'tools/dump_cache/dump_files.cc', 'tools/dump_cache/upgrade.cc', + 'tools/dump_cache/url_to_filename_encoder.cc', 'tools/dump_cache/url_to_filename_encoder.h', + 'tools/dump_cache/url_utilties.h', ], }, ], diff --git a/net/spdy/spdy_session.cc b/net/spdy/spdy_session.cc index 25cafec..21b74bf 100644 --- a/net/spdy/spdy_session.cc +++ b/net/spdy/spdy_session.cc @@ -29,7 +29,6 @@ #include "net/spdy/spdy_protocol.h" #include "net/spdy/spdy_settings_storage.h" #include "net/spdy/spdy_stream.h" -#include "net/tools/dump_cache/url_to_filename_encoder.h" namespace { diff --git a/net/tools/dump_cache/cache_dumper.cc b/net/tools/dump_cache/cache_dumper.cc index 47602e6..74f1482 100644 --- a/net/tools/dump_cache/cache_dumper.cc +++ b/net/tools/dump_cache/cache_dumper.cc @@ -67,7 +67,10 @@ bool DiskDumper::CreateEntry(const std::string& key, // The URL may not start with a valid protocol; search for it. int urlpos = key.find("http"); std::string url = urlpos > 0 ? 
key.substr(urlpos) : key; - entry_path_ = net::UrlToFilenameEncoder::Encode(url, path); + std::string base_path = WideToASCII(path_); + std::string new_path = + net::UrlToFilenameEncoder::Encode(url, base_path, false); + entry_path_ = FilePath(ASCIIToWide(new_path)); #ifdef WIN32_LARGE_FILENAME_SUPPORT // In order for long filenames to work, we'll need to prepend diff --git a/net/tools/dump_cache/url_to_filename_encoder.cc b/net/tools/dump_cache/url_to_filename_encoder.cc new file mode 100644 index 0000000..89a1ca4 --- /dev/null +++ b/net/tools/dump_cache/url_to_filename_encoder.cc @@ -0,0 +1,302 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/logging.h" +#include "base/string_util.h" +#include "net/base/net_util.h" +#include "net/tools/dump_cache/url_to_filename_encoder.h" + +using std::string; + +namespace { + +inline bool IsHexDigit(unsigned char c) { + return (('0' <= c && c <= '9') || ('A' <= c && c <= 'F') || + ('a' <= c && c <= 'f')); +} + +// Returns 1 if buf is prefixed by "num_digits" of hex digits +// Teturns 0 otherwise. +// The function checks for '\0' for string termination. +int HexDigitsPrefix(const char* buf, int num_digits) { + for (int i = 0; i < num_digits; i++) + if (!IsHexDigit(buf[i])) + return 0; // This also detects end of string as '\0' is not xdigit. + return 1; +} + +#ifdef WIN32 +#define strtoull _strtoui64 +#endif + +// A simple parser for long long values. Returns the parsed value if a +// valid integer is found; else returns deflt +// UInt64 and Int64 cannot handle decimal numbers with leading 0s. +uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) { + char *error = NULL; + const uint64 value = strtoull(str, &error, 16); + return (error == str) ? 
deflt : value; +} + +} + +namespace net { + +// The escape character choice is made here -- all code and tests in this +// directory are based off of this constant. However, our test data +// has tons of dependencies on this, so it cannot be changed without +// re-running those tests and fixing them. +const char kTruncationChar = '-'; +const char kEscapeChar = ','; +const size_t kMaximumSubdirectoryLength = 128; + +void UrlToFilenameEncoder::AppendSegment( + char dir_separator, string* segment, string* dest) { + if (segment->empty() || (*segment == ".") || (*segment == "..")) { + dest->append(1, kEscapeChar); + dest->append(*segment); + segment->clear(); + } else { + size_t segment_size = segment->size(); + if (segment_size > kMaximumSubdirectoryLength) { + // We need to inject ",-" at the end of the segment to signify that + // we are inserting an artificial '/'. This means we have to chop + // off at least two characters to make room. + segment_size = kMaximumSubdirectoryLength - 2; + + // But we don't want to break up an escape sequence that happens to lie at + // the end. Escape sequences are at most 2 characters. 
+ if ((*segment)[segment_size - 1] == kEscapeChar) { + segment_size -= 1; + } else if ((*segment)[segment_size - 2] == kEscapeChar) { + segment_size -= 2; + } + dest->append(segment->data(), segment_size); + dest->append(1, kEscapeChar); + dest->append(1, kTruncationChar); + segment->erase(0, segment_size); + + // At this point, if we had segment_size=3, and segment="abcd", + // then after this erase, we will have written "abc,-" and set segment="d" + } else { + dest->append(*segment); + segment->clear(); + } + } +} + +void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix, + const string& filename_ending, + char dir_separator, + string* encoded_filename) { + char encoded[3]; + int encoded_len; + string segment; + + // TODO(jmarantz): This code would be a bit simpler if we disallowed + // Instaweb allowing filename_prefix to not end in "/". We could + // then change the is routine to just take one input string. + size_t start_of_segment = filename_prefix.find_last_of(dir_separator); + if (start_of_segment == string::npos) { + segment = filename_prefix; + } else { + segment = filename_prefix.substr(start_of_segment + 1); + *encoded_filename = filename_prefix.substr(0, start_of_segment + 1); + } + + size_t index = 0; + // Special case the first / to avoid adding a leading kEscapeChar. + if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) { + encoded_filename->append(segment); + segment.clear(); + encoded_filename->append(1, dir_separator); + ++index; + } + + for (; index < filename_ending.length(); ++index) { + unsigned char ch = static_cast<unsigned char>(filename_ending[index]); + + if (ch == dir_separator) { + AppendSegment(dir_separator, &segment, encoded_filename); + encoded_filename->append(1, dir_separator); + segment.clear(); + } else { + // & is common in URLs and is legal filename syntax, but is also + // a special Unix shell character, so let's avoid making + // filenames with &, as well as ?. 
It's probably better to + // blow up query-params than it is to make it hard to work with + // the files in shell-scripts. + if ((ch == 0x5F) || (ch == 0x2E) || // underscore period + (ch == 0x25) || (ch == 0x3D) || // percent equals + (ch == 0x2B) || (ch == 0x2D) || // plus dash + ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9] + ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z] + ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z] + encoded[0] = ch; + encoded_len = 1; + } else { + encoded[0] = kEscapeChar; + encoded[1] = ch / 16; + encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; + encoded[2] = ch % 16; + encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; + encoded_len = 3; + } + segment.append(encoded, encoded_len); + + // Note: We chop paths into medium sized 'chunks'. + // This is due to filename limits on Windows and Unix. + // The Windows limit appears to be 128 characters, and + // Unix is larger, but not as large as URLs with large + // numbers of query params. + if (segment.size() > kMaximumSubdirectoryLength) { + AppendSegment(dir_separator, &segment, encoded_filename); + encoded_filename->append(1, dir_separator); + } + } + } + + // Append "," to the leaf filename so the leaf can also be a branch., e.g. + // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and + // /a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed + // us over the 128 char limit, then we will need to append "/" and the + // remaining chars. + segment += kEscapeChar; + AppendSegment(dir_separator, &segment, encoded_filename); + if (!segment.empty()) { + // The last overflow segment is special, because we appended in + // kEscapeChar above. We won't need to check it again for size + // or further escaping. + encoded_filename->append(1, dir_separator); + encoded_filename->append(segment); + } +} + +// Note: this decoder is not the exact inverse of the EncodeSegment above, +// because it does not take into account a prefix. 
+bool UrlToFilenameEncoder::Decode(const string& encoded_filename, + char dir_separator, + string* decoded_url) { + enum State { + kStart, + kEscape, + kFirstDigit, + kTruncate, + kEscapeDot + }; + State state = kStart; + int char_code = 0; + char hex_buffer[3]; + hex_buffer[2] = '\0'; + for (size_t i = 0; i < encoded_filename.size(); ++i) { + char ch = encoded_filename[i]; + switch (state) { + case kStart: + if (ch == kEscapeChar) { + state = kEscape; + } else { + decoded_url->append(1, ch); + } + break; + case kEscape: + if (HexDigitsPrefix(&ch, 1) == 1) { + hex_buffer[0] = ch; + state = kFirstDigit; + } else if (ch == kTruncationChar) { + state = kTruncate; + } else if (ch == '.') { + decoded_url->append(1, '.'); + state = kEscapeDot; // Look for at most one more dot. + } else if (ch == dir_separator) { + // Consider url "//x". This will get encoded to "/,/x,". + // This code is what skips the first Escape. + decoded_url->append(1, ch); + state = kStart; + } else { + return false; + } + break; + case kFirstDigit: + if (HexDigitsPrefix(&ch, 1) == 1) { + hex_buffer[1] = ch; + uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0); + decoded_url->append(1, static_cast<char>(hex_value)); + char_code = 0; + state = kStart; + } else { + return false; + } + break; + case kTruncate: + if (ch == dir_separator) { + // Skip this separator, it was only put in to break up long + // path segments, but is not part of the URL. + state = kStart; + } else { + return false; + } + break; + case kEscapeDot: + decoded_url->append(1, ch); + state = kStart; + break; + } + } + + // All legal encoded filenames end in kEscapeChar. + return (state == kEscape); +} + +// Escapes the given input |path| and chop any individual components +// of the path which are greater than kMaximumSubdirectoryLength characters +// into two chunks. 
+// +// This legacy version has several issues with aliasing of different URLs, +// inability to represent both /a/b/c and /a/b/c/d, and inability to decode +// the filenames back into URLs. +// +// But there is a large body of slurped data which depends on this format, +// so leave it as the default for spdy_in_mem_edsm_server. +string UrlToFilenameEncoder::LegacyEscape(const string& path) { + string output; + + // Note: We also chop paths into medium sized 'chunks'. + // This is due to the incompetence of the windows + // filesystem, which still hasn't figured out how + // to deal with long filenames. + int last_slash = 0; + for (size_t index = 0; index < path.length(); index++) { + char ch = path[index]; + if (ch == 0x5C) + last_slash = index; + if ((ch == 0x2D) || // hyphen + (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore + ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9] + ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z] + ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z] + output.append(&path[index], 1); + } else { + char encoded[3]; + encoded[0] = 'x'; + encoded[1] = ch / 16; + encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; + encoded[2] = ch % 16; + encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; + output.append(encoded, 3); + } + if (index - last_slash > kMaximumSubdirectoryLength) { +#ifdef WIN32 + char slash = '\\'; +#else + char slash = '/'; +#endif + output.append(&slash, 1); + last_slash = index; + } + } + return output; +} + +} // namespace net + diff --git a/net/tools/dump_cache/url_to_filename_encoder.h b/net/tools/dump_cache/url_to_filename_encoder.h index 4b9e6c5..b5cac37 100644 --- a/net/tools/dump_cache/url_to_filename_encoder.h +++ b/net/tools/dump_cache/url_to_filename_encoder.h @@ -1,7 +1,77 @@ -// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +// URL filename encoder goals: +// +// 1. Allow URLs with arbitrary path-segment length, generating filenames +// with a maximum of 128 characters. +// 2. Provide somewhat human-readable filenames, for easy debugging flow. +// 3. Provide reverse-mapping from filenames back to URLs. +// 4. Be able to distinguish http://x from http://x/ from http://x/index.html. +// Those can all be different URLs. +// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen +// with Facebook Connect. +// +// We need an escape-character for representing characters that are legal +// in URL paths, but not in filenames, such as '?'. Illegal characters +// in Windows are <>:"/\|?*. For reference, see +// http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx +// +// We can pick any legal character as an escape, as long as we escape it too. +// But as we have a goal of having filenames that humans can correlate with +// URLs, we should pick one that doesn't show up frequently in URLs. Candidates +// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are +// shell escapes, and characters that occur frequently in URLs. +// +// .#&%-=_+ occur frequently in URLs. +// ~`!$^&(){}[] are special to Unix shells +// +// @ might seem like a reasonable option, but some build tools don't appreciate +// filenames with @ in testdata. Perforce does not appreciate # in a filename. +// +// Though a web-site http://www.vias.org/linux-knowhow/lnag_05_05_09.html +// identifies ^ as a special shell character, it did not appear to be an +// issue to use it unquoted as a filename in bash or tcsh. +// +// Here are some frequencies of some special characters in a data set from Fall +// '09. We find only 3 occurrences of "x5E" (^ is ascii 0x5E): +// ^ 3 build tools don't like ^ in testdata filenames +// @ 10 build tools don't like @ in testdata filenames +// . 
1676 too frequent in URLs +// , 76 THE WINNER +// # 0 build tools don't like it +// & 487 Prefer to avoid shell escapes +// % 374 g4 doesn't like it +// = 579 very frequent in URLs -- leave unmodified +// - 464 very frequent in URLs -- leave unmodified +// _ 798 very frequent in URLs -- leave unmodified +// +// It is interesting that there were no slurped URLs with #, but I suspect this +// might be due to the slurping methodology. So let's stick with the relatively +// rare ','. +// +// Here's the escaping methodology: +// +// URL File +// / /, +// /. /., +// // /,/, +// /./ /,./, +// /../ /,../, +// /, /,2C, +// /,/ /,2C/, +// /a/b /a/b, (, at the end of a name indicates a leaf). +// /a/b/ /a/b/, +// +// path segments greater than 128 characters (after escape expansion) are +// suffixed with ,- so we can know that the next "/" is not part of the URL: +// +// /verylongname/ /verylong,-/name + +// NOTE: we avoid using some classes here (like FilePath and GURL) because we +// share this code with other projects externally. + #ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_ #define NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_ @@ -10,25 +80,31 @@ #include "base/file_path.h" #include "base/file_util.h" #include "base/string_util.h" -#include "googleurl/src/gurl.h" +#include "net/tools/dump_cache/url_utilities.h" namespace net { // Helper class for converting a URL into a filename. class UrlToFilenameEncoder { public: - // Given a |url| and a |base_path|, returns a FilePath which represents this + // Given a |url| and a |base_path|, returns a string which represents this // |url|. - static FilePath Encode(const std::string& url, FilePath base_path) { + // |legacy_escape| indicates that this function should use the old-style + // of encoding. + // TODO(mbelshe): delete the legacy_escape code. 
+ static std::string Encode(const std::string& url, std::string base_path, + bool legacy_escape) { std::string clean_url(url); if (clean_url.length() && clean_url[clean_url.length()-1] == '/') clean_url.append("index.html"); - GURL gurl(clean_url); - FilePath filename(base_path); - filename = filename.AppendASCII(gurl.host()); + std::string host = UrlUtilities::GetUrlHost(clean_url); + std::string filename(base_path); + filename.append("\\"); + filename = filename.append(host); + filename.append("\\"); - std::string url_filename = gurl.PathForRequest(); + std::string url_filename = UrlUtilities::GetUrlPath(clean_url); // Strip the leading '/' if (url_filename[0] == '/') url_filename = url_filename.substr(1); @@ -40,59 +116,71 @@ class UrlToFilenameEncoder { StripDoubleSlashes(&url_filename); // Save path as filesystem-safe characters - url_filename = Escape(url_filename); - filename = filename.AppendASCII(url_filename); + if (legacy_escape) { + url_filename = LegacyEscape(url_filename); + } else { + url_filename = Escape(url_filename); + } + filename = filename.append(url_filename); + +#ifndef WIN32 + // Last step - convert to native slashes! + const std::string slash("/"); + const std::string backslash("\\"); + ReplaceAll(&filename, backslash, slash); +#endif return filename; } - private: - // This is the length at which we chop individual subdirectories. - // Technically, we shouldn't need to do this, but I found that - // even with long-filename support, windows had trouble creating - // long subdirectories, and making them shorter helps. - static const size_t kMaximumSubdirectoryLength = 128; + // Rewrite HTML in a form that the SPDY in-memory server + // can read. + // |filename_prefix| is prepended without escaping. + // |filename_ending| is the URL to be encoded into a filename. + // |dir_separator| is "/" on Unix, "\" on Windows. + // |encoded_filename| is the resultant filename. 
+ static void EncodeSegment( + const std::string& filename_prefix, + const std::string& filename_ending, + char dir_separator, + std::string* encoded_filename); + + // Decodes a filename that was encoded with EncodeSegment, + // yielding back the original URL. + static bool Decode(const std::string& encoded_filename, + char dir_separator, + std::string* decoded_url); - // Escape the given input |path| and chop any individual components + private: + // Appends a segment of the path, special-casing ".", "..", and "", and + // ensuring that the segment does not exceed the path length. If it does, + // it chops the end off the segment, writes the segment with a separator of + // ",-/", and then rewrites segment to contain just the truncated piece so + // it can be used in the next iteration. + // |dir_separator| is "/" on Unix, "\" on Windows. + // |segment| is a read/write parameter containing segment to write + static void AppendSegment( + char dir_separator, + std::string* segment, + std::string* dest); + + // Escapes the given input |path| and chop any individual components // of the path which are greater than kMaximumSubdirectoryLength characters // into two chunks. static std::string Escape(const std::string& path) { std::string output; - int last_slash = 0; - for (size_t index = 0; index < path.length(); index++) { - char ch = path[index]; - if (ch == 0x5C) - last_slash = index; - if ((ch == 0x2D) || // hyphen - (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore - ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9] - ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z] - ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z] - output.append(&path[index],1); - } else { - char encoded[3]; - encoded[0] = 'x'; - encoded[1] = ch / 16; - encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; - encoded[2] = ch % 16; - encoded[2] += (encoded[2] >= 10) ? 
'A' - 10 : '0'; - output.append(encoded, 3); - } - if (index - last_slash > kMaximumSubdirectoryLength) { - char backslash = '\\'; - output.append(&backslash, 1); - last_slash = index; - } - } + EncodeSegment("", path, '\\', &output); return output; } + // Allow reading of old slurped files. + static std::string LegacyEscape(const std::string& path); + // Replace all instances of |from| within |str| as |to|. - static void ReplaceAll(const std::string& from, - const std::string& to, - std::string* str) { + static void ReplaceAll(std::string* str, const std::string& from, + const std::string& to) { std::string::size_type pos(0); - while((pos = str->find(from, pos)) != std::string::npos) { + while ((pos = str->find(from, pos)) != std::string::npos) { str->replace(pos, from.size(), to); pos += from.size(); } @@ -100,21 +188,20 @@ class UrlToFilenameEncoder { // Replace all instances of "/" with "\" in |path|. static void ConvertToSlashes(std::string* path) { - static const char slash[] = { '/', '\0' }; - static const char backslash[] = { '\\', '\0' }; - ReplaceAll(slash, backslash, path); + const std::string slash("/"); + const std::string backslash("\\"); + ReplaceAll(path, slash, backslash); } // Replace all instances of "\\" with "%5C%5C" in |path|. 
static void StripDoubleSlashes(std::string* path) { - static const char doubleslash[] = { '\\', '\\', '\0' }; - static const char escaped_doubleslash[] = - { '%', '5', 'C', '%', '5', 'C','\0' }; - ReplaceAll(doubleslash, escaped_doubleslash, path); + const std::string doubleslash("\\\\"); + const std::string escaped_doubleslash("%5C%5C"); + ReplaceAll(path, doubleslash, escaped_doubleslash); } }; -} // namespace net +} // namespace net -#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H__ +#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_ diff --git a/net/tools/dump_cache/url_to_filename_encoder_unittest.cc b/net/tools/dump_cache/url_to_filename_encoder_unittest.cc new file mode 100644 index 0000000..32cef99 --- /dev/null +++ b/net/tools/dump_cache/url_to_filename_encoder_unittest.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "net/tools/dump_cache/url_to_filename_encoder.h" + +#include <string> +#include <vector> +#include "base/string_piece.h" +#include "base/string_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +using base::StringPiece; +using std::string; + +namespace net { + +// The escape character choice is made here -- all code and tests in this +// directory are based off of this constant. However, our test data +// has tons of dependencies on this, so it cannot be changed without +// re-running those tests and fixing them. 
+const char kTruncationChar = '-'; +const char kEscapeChar = ','; +const size_t kMaximumSubdirectoryLength = 128; + +class UrlToFilenameEncoderTest : public ::testing::Test { + protected: + UrlToFilenameEncoderTest() : escape_(1, kEscapeChar) {} + + void CheckSegmentLength(const StringPiece& escaped_word) { + std::vector<StringPiece> components; + Tokenize(escaped_word, StringPiece("/"), &components); + for (size_t i = 0; i < components.size(); ++i) { + EXPECT_GE(kMaximumSubdirectoryLength, + components[i].size()); + } + } + + void CheckValidChars(const StringPiece& escaped_word) { + // These characters are invalid in Windows. We will + // ignore / for this test, but add in ', as that's pretty + // inconvenient in a Unix filename. + // + // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx + static const char kInvalidChars[] = "<>:\"\\|?*'"; + for (size_t i = 0; i < escaped_word.size(); ++i) { + char c = escaped_word[i]; + EXPECT_EQ(NULL, strchr(kInvalidChars, c)); + EXPECT_NE('\0', c); // only invalid character in Posix + EXPECT_GT(0x7E, c); // only English printable characters + } + } + + void Validate(const string& in_word, const string& gold_word) { + string escaped_word, url; + UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word); + EXPECT_EQ(gold_word, escaped_word); + CheckSegmentLength(escaped_word); + CheckValidChars(escaped_word); + UrlToFilenameEncoder::Decode(escaped_word, '/', &url); + EXPECT_EQ(in_word, url); + } + + void ValidateAllSegmentsSmall(const string& in_word) { + string escaped_word, url; + UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word); + CheckSegmentLength(escaped_word); + CheckValidChars(escaped_word); + UrlToFilenameEncoder::Decode(escaped_word, '/', &url); + EXPECT_EQ(in_word, url); + } + + void ValidateNoChange(const string& word) { + // We always suffix the leaf with kEscapeChar, unless the leaf is empty. 
+ Validate(word, word + escape_); + } + + void ValidateEscaped(unsigned char ch) { + // We always suffix the leaf with kEscapeChar, unless the leaf is empty. + char escaped[100]; + const char escape = kEscapeChar; + base::snprintf(escaped, sizeof(escaped), "%c%02X%c", escape, ch, escape); + Validate(string(1, ch), escaped); + } + + string escape_; +}; + +TEST_F(UrlToFilenameEncoderTest, DoesNotEscape) { + ValidateNoChange(""); + ValidateNoChange("abcdefg"); + ValidateNoChange("abcdefghijklmnopqrstuvwxyz"); + ValidateNoChange("ZYXWVUT"); + ValidateNoChange("ZYXWVUTSRQPONMLKJIHGFEDCBA"); + ValidateNoChange("01234567689"); + ValidateNoChange("/-_"); + ValidateNoChange("abcdefghijklmnopqrstuvwxyzZYXWVUTSRQPONMLKJIHGFEDCBA" + "01234567689/-_"); + ValidateNoChange("index.html"); + ValidateNoChange("/"); + ValidateNoChange("/."); + ValidateNoChange("."); + ValidateNoChange(".."); + ValidateNoChange("%"); + ValidateNoChange("="); + ValidateNoChange("+"); + ValidateNoChange("_"); +} + +TEST_F(UrlToFilenameEncoderTest, Escapes) { + ValidateEscaped('!'); + ValidateEscaped('"'); + ValidateEscaped('#'); + ValidateEscaped('$'); + ValidateEscaped('&'); + ValidateEscaped('('); + ValidateEscaped(')'); + ValidateEscaped('*'); + ValidateEscaped(','); + ValidateEscaped(':'); + ValidateEscaped(';'); + ValidateEscaped('<'); + ValidateEscaped('>'); + ValidateEscaped('@'); + ValidateEscaped('['); + ValidateEscaped('\''); + ValidateEscaped('\\'); + ValidateEscaped(']'); + ValidateEscaped('^'); + ValidateEscaped('`'); + ValidateEscaped('{'); + ValidateEscaped('|'); + ValidateEscaped('}'); + ValidateEscaped('~'); + + // check non-printable characters + ValidateEscaped('\0'); + for (int i = 127; i < 256; ++i) { + ValidateEscaped(static_cast<char>(i)); + } +} + +TEST_F(UrlToFilenameEncoderTest, DoesEscapeCorrectly) { + Validate("mysite.com&x", "mysite.com" + escape_ + "26x" + escape_); + Validate("/./", "/" + escape_ + "./" + escape_); + Validate("/../", "/" + escape_ + "../" + escape_); + 
Validate("//", "/" + escape_ + "/" + escape_); + Validate("/./leaf", "/" + escape_ + "./leaf" + escape_); + Validate("/../leaf", "/" + escape_ + "../leaf" + escape_); + Validate("//leaf", "/" + escape_ + "/leaf" + escape_); + Validate("mysite/u?param1=x&param2=y", + "mysite/u" + escape_ + "3Fparam1=x" + escape_ + "26param2=y" + + escape_); + Validate("search?q=dogs&go=&form=QBLH&qs=n", // from Latency Labs bing test. + "search" + escape_ + "3Fq=dogs" + escape_ + "26go=" + escape_ + + "26form=QBLH" + escape_ + "26qs=n" + escape_); + Validate("~joebob/my_neeto-website+with_stuff.asp?id=138&content=true", + "" + escape_ + "7Ejoebob/my_neeto-website+with_stuff.asp" + escape_ + + "3Fid=138" + escape_ + "26content=true" + escape_); +} + +TEST_F(UrlToFilenameEncoderTest, LongTail) { + static char long_word[] = + "~joebob/briggs/12345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890123456789012345678901234567890"; + + // the long lines in the string below are 64 characters, so we can see + // the slashes every 128. 
+ string gold_long_word = + escape_ + "7Ejoebob/briggs/" + "1234567890123456789012345678901234567890123456789012345678901234" + "56789012345678901234567890123456789012345678901234567890123456" + + escape_ + "-/" + "7890123456789012345678901234567890123456789012345678901234567890" + "12345678901234567890123456789012345678901234567890123456789012" + + escape_ + "-/" + "3456789012345678901234567890123456789012345678901234567890123456" + "78901234567890123456789012345678901234567890123456789012345678" + + escape_ + "-/" + "9012345678901234567890" + escape_; + EXPECT_LT(kMaximumSubdirectoryLength, + sizeof(long_word)); + Validate(long_word, gold_long_word); +} + +TEST_F(UrlToFilenameEncoderTest, LongTailQuestion) { + // Here the '?' in the last path segment expands to @3F, making + // it hit 128 chars before the input segment gets that big. + static char long_word[] = + "~joebob/briggs/1234567?1234567?1234567?1234567?1234567?" + "1234567?1234567?1234567?1234567?1234567?1234567?1234567?" + "1234567?1234567?1234567?1234567?1234567?1234567?1234567?" + "1234567?1234567?1234567?1234567?1234567?1234567?1234567?" + "1234567?1234567?1234567?1234567?1234567?1234567?1234567?" + "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"; + + // Notice that at the end of the third segment, we avoid splitting + // the (escape_ + "3F") that was generated from the "?", so that segment is + // only 127 characters. 
+ string pattern = "1234567" + escape_ + "3F"; // 10 characters + string gold_long_word = + escape_ + "7Ejoebob/briggs/" + + pattern + pattern + pattern + pattern + pattern + pattern + "1234" + "567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern + + "123456" + escape_ + "-/" + "7" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern + + pattern + pattern + pattern + pattern + pattern + pattern + pattern + + "12" + + escape_ + "-/" + "34567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern + + "1234567" + escape_ + "3F" + pattern + pattern + pattern + pattern + + pattern + "1234567" + + escape_ + "-/" + + escape_ + "3F" + pattern + pattern + escape_; + EXPECT_LT(kMaximumSubdirectoryLength, + sizeof(long_word)); + Validate(long_word, gold_long_word); +} + +TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenNoEscape) { + // hit corner cases, +/- 4 characters from kMaxLen + for (int i = -4; i <= 4; ++i) { + string input; + input.append(i + kMaximumSubdirectoryLength, 'x'); + ValidateAllSegmentsSmall(input); + } +} + +TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenWithEscape) { + // hit corner cases, +/- 4 characters from kMaxLen. This time we + // leave off the last 'x' and put in a '.', which ensures that we + // are truncating with '/' *after* the expansion. + for (int i = -4; i <= 4; ++i) { + string input; + input.append(i + kMaximumSubdirectoryLength - 1, 'x'); + input.append(1, '.'); // this will expand to 3 characters. 
+ ValidateAllSegmentsSmall(input); + } +} + +TEST_F(UrlToFilenameEncoderTest, LeafBranchAlias) { + Validate("/a/b/c", "/a/b/c" + escape_); // c is leaf file "c," + Validate("/a/b/c/d", "/a/b/c/d" + escape_); // c is directory "c" + Validate("/a/b/c/d/", "/a/b/c/d/" + escape_); +} + + +TEST_F(UrlToFilenameEncoderTest, BackslashSeparator) { + string long_word; + string escaped_word; + long_word.append(kMaximumSubdirectoryLength + 1, 'x'); + UrlToFilenameEncoder::EncodeSegment("", long_word, '\\', &escaped_word); + + // check that one backslash, plus the escape ",-", and the ending , got added. + EXPECT_EQ(long_word.size() + 4, escaped_word.size()); + ASSERT_LT(kMaximumSubdirectoryLength, + escaped_word.size()); + // Check that the backslash got inserted at the correct spot. + EXPECT_EQ('\\', escaped_word[ + kMaximumSubdirectoryLength]); +} + +} // namespace + diff --git a/net/tools/dump_cache/url_utilities.h b/net/tools/dump_cache/url_utilities.h new file mode 100644 index 0000000..4de95dc --- /dev/null +++ b/net/tools/dump_cache/url_utilities.h @@ -0,0 +1,64 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_ +#define NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_ + +#include <string> + +namespace net { + +namespace UrlUtilities { + +// Gets the host from an url, strips the port number as well if the url +// has one. 
+// For example: calling GetUrlHost(www.foo.com:8080/boo) returns www.foo.com +static std::string GetUrlHost(const std::string& url) { + size_t b = url.find("//"); + if (b == std::string::npos) + b = 0; + else + b += 2; + size_t next_slash = url.find_first_of('/', b); + size_t next_colon = url.find_first_of(':', b); + if (next_slash != std::string::npos + && next_colon != std::string::npos + && next_colon < next_slash) { + return std::string(url, b, next_colon - b); + } + if (next_slash == std::string::npos) { + if (next_colon != std::string::npos) { + return std::string(url, next_colon - b); + } else { + next_slash = url.size(); + } + } + return std::string(url, b, next_slash - b); +} + +// Gets the path portion of an url. +// e.g http://www.foo.com/path +// returns /path +static std::string GetUrlPath(const std::string& url) { + size_t b = url.find("//"); + if (b == std::string::npos) + b = 0; + else + b += 2; + b = url.find("/", b); + if (b == std::string::npos) + return "/"; + + size_t e = url.find("#", b+1); + if (e != std::string::npos) + return std::string(url, b, (e - b)); + return std::string(url, b); +} + +} // namespace UrlUtilities + +} // namespace net + +#endif // NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_ + |