author    mbelshe@chromium.org <mbelshe@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2010-06-07 14:57:17 +0000
committer mbelshe@chromium.org <mbelshe@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2010-06-07 14:57:17 +0000
commit    196f286c045d4bf22364f00801c5a9e1d59e8a40 (patch)
tree      31e165e8898eac61b2426c86b2aa4bc25358f44e
parent    01540765be8ae546c7f2e2105621bd431f8462b1 (diff)
download  chromium_src-196f286c045d4bf22364f00801c5a9e1d59e8a40.zip
          chromium_src-196f286c045d4bf22364f00801c5a9e1d59e8a40.tar.gz
          chromium_src-196f286c045d4bf22364f00801c5a9e1d59e8a40.tar.bz2
Import the server-side code for URL encoding & unittest.

BUG=none
TEST=UrlToFilenameEncoderTest
Review URL: http://codereview.chromium.org/2511001

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@49056 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--  net/net.gyp                                                  5
-rw-r--r--  net/spdy/spdy_session.cc                                     1
-rw-r--r--  net/tools/dump_cache/cache_dumper.cc                         5
-rw-r--r--  net/tools/dump_cache/url_to_filename_encoder.cc            302
-rw-r--r--  net/tools/dump_cache/url_to_filename_encoder.h             199
-rw-r--r--  net/tools/dump_cache/url_to_filename_encoder_unittest.cc   268
-rw-r--r--  net/tools/dump_cache/url_utilities.h                        64
7 files changed, 786 insertions(+), 58 deletions(-)
diff --git a/net/net.gyp b/net/net.gyp
index 5645177..eb6686a 100644
--- a/net/net.gyp
+++ b/net/net.gyp
@@ -732,6 +732,9 @@
'spdy/spdy_session_unittest.cc',
'spdy/spdy_stream_unittest.cc',
'spdy/spdy_test_util.h',
+ 'tools/dump_cache/url_to_filename_encoder.cc',
+ 'tools/dump_cache/url_to_filename_encoder.h',
+ 'tools/dump_cache/url_to_filename_encoder_unittest.cc',
'url_request/url_request_unittest.cc',
'url_request/url_request_unittest.h',
'url_request/view_cache_helper_unittest.cc',
@@ -1041,7 +1044,9 @@
'tools/dump_cache/dump_cache.cc',
'tools/dump_cache/dump_files.cc',
'tools/dump_cache/upgrade.cc',
+ 'tools/dump_cache/url_to_filename_encoder.cc',
'tools/dump_cache/url_to_filename_encoder.h',
+ 'tools/dump_cache/url_utilities.h',
],
},
],
diff --git a/net/spdy/spdy_session.cc b/net/spdy/spdy_session.cc
index 25cafec..21b74bf 100644
--- a/net/spdy/spdy_session.cc
+++ b/net/spdy/spdy_session.cc
@@ -29,7 +29,6 @@
#include "net/spdy/spdy_protocol.h"
#include "net/spdy/spdy_settings_storage.h"
#include "net/spdy/spdy_stream.h"
-#include "net/tools/dump_cache/url_to_filename_encoder.h"
namespace {
diff --git a/net/tools/dump_cache/cache_dumper.cc b/net/tools/dump_cache/cache_dumper.cc
index 47602e6..74f1482 100644
--- a/net/tools/dump_cache/cache_dumper.cc
+++ b/net/tools/dump_cache/cache_dumper.cc
@@ -67,7 +67,10 @@ bool DiskDumper::CreateEntry(const std::string& key,
// The URL may not start with a valid protocol; search for it.
int urlpos = key.find("http");
std::string url = urlpos > 0 ? key.substr(urlpos) : key;
- entry_path_ = net::UrlToFilenameEncoder::Encode(url, path);
+ std::string base_path = WideToASCII(path_);
+ std::string new_path =
+ net::UrlToFilenameEncoder::Encode(url, base_path, false);
+ entry_path_ = FilePath(ASCIIToWide(new_path));
#ifdef WIN32_LARGE_FILENAME_SUPPORT
// In order for long filenames to work, we'll need to prepend
diff --git a/net/tools/dump_cache/url_to_filename_encoder.cc b/net/tools/dump_cache/url_to_filename_encoder.cc
new file mode 100644
index 0000000..89a1ca4
--- /dev/null
+++ b/net/tools/dump_cache/url_to_filename_encoder.cc
@@ -0,0 +1,302 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "net/base/net_util.h"
+#include "net/tools/dump_cache/url_to_filename_encoder.h"
+
+using std::string;
+
+namespace {
+
+inline bool IsHexDigit(unsigned char c) {
+ return (('0' <= c && c <= '9') || ('A' <= c && c <= 'F') ||
+ ('a' <= c && c <= 'f'));
+}
+
+// Returns 1 if buf is prefixed by "num_digits" of hex digits;
+// returns 0 otherwise.
+// The function checks for '\0' for string termination.
+int HexDigitsPrefix(const char* buf, int num_digits) {
+ for (int i = 0; i < num_digits; i++)
+ if (!IsHexDigit(buf[i]))
+ return 0; // This also detects end of string as '\0' is not xdigit.
+ return 1;
+}
+
+#ifdef WIN32
+#define strtoull _strtoui64
+#endif
+
+// A simple parser for long long values. Returns the parsed value if a
+// valid integer is found; otherwise returns deflt.
+// UInt64 and Int64 cannot handle decimal numbers with leading 0s.
+uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
+ char *error = NULL;
+ const uint64 value = strtoull(str, &error, 16);
+ return (error == str) ? deflt : value;
+}
+
+}
+
+namespace net {
+
+// The escape character choice is made here -- all code and tests in this
+// directory are based off of this constant. However, our test data
+// has tons of dependencies on this, so it cannot be changed without
+// re-running those tests and fixing them.
+const char kTruncationChar = '-';
+const char kEscapeChar = ',';
+const size_t kMaximumSubdirectoryLength = 128;
+
+void UrlToFilenameEncoder::AppendSegment(
+ char dir_separator, string* segment, string* dest) {
+ if (segment->empty() || (*segment == ".") || (*segment == "..")) {
+ dest->append(1, kEscapeChar);
+ dest->append(*segment);
+ segment->clear();
+ } else {
+ size_t segment_size = segment->size();
+ if (segment_size > kMaximumSubdirectoryLength) {
+ // We need to inject ",-" at the end of the segment to signify that
+ // we are inserting an artificial '/'. This means we have to chop
+ // off at least two characters to make room.
+ segment_size = kMaximumSubdirectoryLength - 2;
+
+ // But we don't want to break up an escape sequence that happens to lie at
+ // the end: an escape sequence is the escape char plus two hex digits, so
+ // looking back two characters is enough.
+ if ((*segment)[segment_size - 1] == kEscapeChar) {
+ segment_size -= 1;
+ } else if ((*segment)[segment_size - 2] == kEscapeChar) {
+ segment_size -= 2;
+ }
+ dest->append(segment->data(), segment_size);
+ dest->append(1, kEscapeChar);
+ dest->append(1, kTruncationChar);
+ segment->erase(0, segment_size);
+
+ // At this point, if we had segment_size=3, and segment="abcd",
+ // then after this erase, we will have written "abc,-" and set segment="d"
+ } else {
+ dest->append(*segment);
+ segment->clear();
+ }
+ }
+}
+
+void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
+ const string& filename_ending,
+ char dir_separator,
+ string* encoded_filename) {
+ char encoded[3];
+ int encoded_len;
+ string segment;
+
+ // TODO(jmarantz): This code would be a bit simpler if Instaweb did not
+ // allow filename_prefix to omit the trailing "/". We could then change
+ // this routine to just take one input string.
+ size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
+ if (start_of_segment == string::npos) {
+ segment = filename_prefix;
+ } else {
+ segment = filename_prefix.substr(start_of_segment + 1);
+ *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
+ }
+
+ size_t index = 0;
+ // Special case the first / to avoid adding a leading kEscapeChar.
+ if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
+ encoded_filename->append(segment);
+ segment.clear();
+ encoded_filename->append(1, dir_separator);
+ ++index;
+ }
+
+ for (; index < filename_ending.length(); ++index) {
+ unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
+
+ if (ch == dir_separator) {
+ AppendSegment(dir_separator, &segment, encoded_filename);
+ encoded_filename->append(1, dir_separator);
+ segment.clear();
+ } else {
+ // & is common in URLs and is legal filename syntax, but is also
+ // a special Unix shell character, so let's avoid making
+ // filenames with &, as well as ?. It's probably better to
+ // blow up query-params than it is to make it hard to work with
+ // the files in shell-scripts.
+ if ((ch == 0x5F) || (ch == 0x2E) || // underscore period
+ (ch == 0x25) || (ch == 0x3D) || // percent equals
+ (ch == 0x2B) || (ch == 0x2D) || // plus dash
+ ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9]
+ ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z]
+ ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z]
+ encoded[0] = ch;
+ encoded_len = 1;
+ } else {
+ encoded[0] = kEscapeChar;
+ encoded[1] = ch / 16;
+ encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
+ encoded[2] = ch % 16;
+ encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
+ encoded_len = 3;
+ }
+ segment.append(encoded, encoded_len);
+
+ // Note: We chop paths into medium sized 'chunks'.
+ // This is due to filename limits on Windows and Unix.
+ // The Windows limit appears to be 128 characters, and
+ // Unix is larger, but not as large as URLs with large
+ // numbers of query params.
+ if (segment.size() > kMaximumSubdirectoryLength) {
+ AppendSegment(dir_separator, &segment, encoded_filename);
+ encoded_filename->append(1, dir_separator);
+ }
+ }
+ }
+
+ // Append "," to the leaf filename so the leaf can also be a branch, e.g.
+ // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
+ // "/a/b/c/d,". So we will rename the "d" here to "d,". If doing that pushed
+ // us over the 128 char limit, then we will need to append "/" and the
+ // remaining chars.
+ segment += kEscapeChar;
+ AppendSegment(dir_separator, &segment, encoded_filename);
+ if (!segment.empty()) {
+ // The last overflow segment is special, because we appended in
+ // kEscapeChar above. We won't need to check it again for size
+ // or further escaping.
+ encoded_filename->append(1, dir_separator);
+ encoded_filename->append(segment);
+ }
+}
+
+// Note: this decoder is not the exact inverse of the EncodeSegment above,
+// because it does not take into account a prefix.
+bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
+ char dir_separator,
+ string* decoded_url) {
+ enum State {
+ kStart,
+ kEscape,
+ kFirstDigit,
+ kTruncate,
+ kEscapeDot
+ };
+ State state = kStart;
+ int char_code = 0;
+ char hex_buffer[3];
+ hex_buffer[2] = '\0';
+ for (size_t i = 0; i < encoded_filename.size(); ++i) {
+ char ch = encoded_filename[i];
+ switch (state) {
+ case kStart:
+ if (ch == kEscapeChar) {
+ state = kEscape;
+ } else {
+ decoded_url->append(1, ch);
+ }
+ break;
+ case kEscape:
+ if (HexDigitsPrefix(&ch, 1) == 1) {
+ hex_buffer[0] = ch;
+ state = kFirstDigit;
+ } else if (ch == kTruncationChar) {
+ state = kTruncate;
+ } else if (ch == '.') {
+ decoded_url->append(1, '.');
+ state = kEscapeDot; // Look for at most one more dot.
+ } else if (ch == dir_separator) {
+ // Consider url "//x". This will get encoded to "/,/x,".
+ // This code is what skips the first Escape.
+ decoded_url->append(1, ch);
+ state = kStart;
+ } else {
+ return false;
+ }
+ break;
+ case kFirstDigit:
+ if (HexDigitsPrefix(&ch, 1) == 1) {
+ hex_buffer[1] = ch;
+ uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
+ decoded_url->append(1, static_cast<char>(hex_value));
+ char_code = 0;
+ state = kStart;
+ } else {
+ return false;
+ }
+ break;
+ case kTruncate:
+ if (ch == dir_separator) {
+ // Skip this separator, it was only put in to break up long
+ // path segments, but is not part of the URL.
+ state = kStart;
+ } else {
+ return false;
+ }
+ break;
+ case kEscapeDot:
+ decoded_url->append(1, ch);
+ state = kStart;
+ break;
+ }
+ }
+
+ // All legal encoded filenames end in kEscapeChar.
+ return (state == kEscape);
+}
+
+// Escapes the given input |path| and chops any individual components
+// of the path which are greater than kMaximumSubdirectoryLength characters
+// into two chunks.
+//
+// This legacy version has several issues with aliasing of different URLs,
+// inability to represent both /a/b/c and /a/b/c/d, and inability to decode
+// the filenames back into URLs.
+//
+// But there is a large body of slurped data which depends on this format,
+// so leave it as the default for spdy_in_mem_edsm_server.
+string UrlToFilenameEncoder::LegacyEscape(const string& path) {
+ string output;
+
+ // Note: We also chop paths into medium sized 'chunks'.
+ // This is due to the incompetence of the windows
+ // filesystem, which still hasn't figured out how
+ // to deal with long filenames.
+ int last_slash = 0;
+ for (size_t index = 0; index < path.length(); index++) {
+ char ch = path[index];
+ if (ch == 0x5C)
+ last_slash = index;
+ if ((ch == 0x2D) || // hyphen
+ (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore
+ ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9]
+ ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z]
+ ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z]
+ output.append(&path[index], 1);
+ } else {
+ char encoded[3];
+ encoded[0] = 'x';
+ encoded[1] = ch / 16;
+ encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
+ encoded[2] = ch % 16;
+ encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
+ output.append(encoded, 3);
+ }
+ if (index - last_slash > kMaximumSubdirectoryLength) {
+#ifdef WIN32
+ char slash = '\\';
+#else
+ char slash = '/';
+#endif
+ output.append(&slash, 1);
+ last_slash = index;
+ }
+ }
+ return output;
+}
+
+} // namespace net
+
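
As a reference for the file above, here is a minimal sketch of the round trip that EncodeSegment() and Decode() provide, assuming a standalone program built against the header added in this change; the expected strings follow the mapping table documented in url_to_filename_encoder.h.

#include <iostream>
#include <string>

#include "net/tools/dump_cache/url_to_filename_encoder.h"

int main() {
  std::string encoded;
  // '?' (0x3F) is not filename-safe, so it becomes ",3F"; the leaf is
  // terminated with "," so that /a/b/c and /a/b/c/d can coexist on disk.
  net::UrlToFilenameEncoder::EncodeSegment("", "/a/b/c?x=1", '/', &encoded);
  std::cout << encoded << std::endl;  // prints "/a/b/c,3Fx=1,"

  std::string decoded;
  if (net::UrlToFilenameEncoder::Decode(encoded, '/', &decoded))
    std::cout << decoded << std::endl;  // prints "/a/b/c?x=1"
  return 0;
}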
diff --git a/net/tools/dump_cache/url_to_filename_encoder.h b/net/tools/dump_cache/url_to_filename_encoder.h
index 4b9e6c5..b5cac37 100644
--- a/net/tools/dump_cache/url_to_filename_encoder.h
+++ b/net/tools/dump_cache/url_to_filename_encoder.h
@@ -1,7 +1,77 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+// URL filename encoder goals:
+//
+// 1. Allow URLs with arbitrary path-segment length, generating filenames
+// with a maximum of 128 characters.
+// 2. Provide somewhat human-readable filenames, for easy debugging.
+// 3. Provide reverse-mapping from filenames back to URLs.
+// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
+// Those can all be different URLs.
+// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
+// with Facebook Connect.
+//
+// We need an escape-character for representing characters that are legal
+// in URL paths, but not in filenames, such as '?'. Illegal characters
+// in Windows are <>:"/\|?*. For reference, see
+// http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
+//
+// We can pick any legal character as an escape, as long as we escape it too.
+// But as we have a goal of having filenames that humans can correlate with
+// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
+// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
+// shell escapes, and characters that occur frequently in URLs.
+//
+// .#&%-=_+ occur frequently in URLs.
+// ~`!$^&(){}[] are special to Unix shells
+//
+// @ might seem like a reasonable option, but some build tools don't appreciate
+// filenames with @ in testdata. Perforce does not appreciate # in a filename.
+//
+// Though a web-site http://www.vias.org/linux-knowhow/lnag_05_05_09.html
+// identifies ^ as a special shell character, it did not appear to be an
+// issue to use it unquoted as a filename in bash or tcsh.
+//
+// Here are some frequencies of some special characters in a data set from Fall
+// '09. We find only 3 occurrences of "x5E" (^ is ASCII 0x5E):
+// ^ 3 build tools don't like ^ in testdata filenames
+// @ 10 build tools don't like @ in testdata filenames
+// . 1676 too frequent in URLs
+// , 76 THE WINNER
+// # 0 build tools don't like it
+// & 487 Prefer to avoid shell escapes
+// % 374 g4 doesn't like it
+// = 579 very frequent in URLs -- leave unmodified
+// - 464 very frequent in URLs -- leave unmodified
+// _ 798 very frequent in URLs -- leave unmodified
+//
+// It is interesting that there were no slurped URLs with #, but I suspect this
+// might be due to the slurping methodology. So let's stick with the relatively
+// rare ','.
+//
+// Here's the escaping methodology:
+//
+// URL File
+// / /,
+// /. /.,
+// // /,/,
+// /./ /,./,
+// /../ /,../,
+// /, /,2C,
+// /,/ /,2C/,
+// /a/b /a/b, (, at the end of a name indicates a leaf).
+// /a/b/ /a/b/,
+//
+// Path segments greater than 128 characters (after escape expansion) are
+// suffixed with ",-" so we know that the next "/" is not part of the URL:
+//
+// /verylongname/ /verylong,-/name
+
+// NOTE: we avoid using some classes here (like FilePath and GURL) because we
+// share this code with other projects externally.
+
#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_
#define NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_
@@ -10,25 +80,31 @@
#include "base/file_path.h"
#include "base/file_util.h"
#include "base/string_util.h"
-#include "googleurl/src/gurl.h"
+#include "net/tools/dump_cache/url_utilities.h"
namespace net {
// Helper class for converting a URL into a filename.
class UrlToFilenameEncoder {
public:
- // Given a |url| and a |base_path|, returns a FilePath which represents this
+ // Given a |url| and a |base_path|, returns a string which represents this
// |url|.
- static FilePath Encode(const std::string& url, FilePath base_path) {
+ // |legacy_escape| indicates that this function should use the old-style
+ // of encoding.
+ // TODO(mbelshe): delete the legacy_escape code.
+ static std::string Encode(const std::string& url, std::string base_path,
+ bool legacy_escape) {
std::string clean_url(url);
if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
clean_url.append("index.html");
- GURL gurl(clean_url);
- FilePath filename(base_path);
- filename = filename.AppendASCII(gurl.host());
+ std::string host = UrlUtilities::GetUrlHost(clean_url);
+ std::string filename(base_path);
+ filename.append("\\");
+ filename = filename.append(host);
+ filename.append("\\");
- std::string url_filename = gurl.PathForRequest();
+ std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
// Strip the leading '/'
if (url_filename[0] == '/')
url_filename = url_filename.substr(1);
@@ -40,59 +116,71 @@ class UrlToFilenameEncoder {
StripDoubleSlashes(&url_filename);
// Save path as filesystem-safe characters
- url_filename = Escape(url_filename);
- filename = filename.AppendASCII(url_filename);
+ if (legacy_escape) {
+ url_filename = LegacyEscape(url_filename);
+ } else {
+ url_filename = Escape(url_filename);
+ }
+ filename = filename.append(url_filename);
+
+#ifndef WIN32
+ // Last step - convert to native slashes!
+ const std::string slash("/");
+ const std::string backslash("\\");
+ ReplaceAll(&filename, backslash, slash);
+#endif
return filename;
}
- private:
- // This is the length at which we chop individual subdirectories.
- // Technically, we shouldn't need to do this, but I found that
- // even with long-filename support, windows had trouble creating
- // long subdirectories, and making them shorter helps.
- static const size_t kMaximumSubdirectoryLength = 128;
+ // Encodes a URL portion into a filename in a form that the SPDY in-memory
+ // server can read.
+ // |filename_prefix| is prepended without escaping.
+ // |filename_ending| is the URL to be encoded into a filename.
+ // |dir_separator| is "/" on Unix, "\" on Windows.
+ // |encoded_filename| is the resultant filename.
+ static void EncodeSegment(
+ const std::string& filename_prefix,
+ const std::string& filename_ending,
+ char dir_separator,
+ std::string* encoded_filename);
+
+ // Decodes a filename that was encoded with EncodeSegment,
+ // yielding back the original URL.
+ static bool Decode(const std::string& encoded_filename,
+ char dir_separator,
+ std::string* decoded_url);
- // Escape the given input |path| and chop any individual components
+ private:
+ // Appends a segment of the path, special-casing ".", "..", and "", and
+ // ensuring that the segment does not exceed kMaximumSubdirectoryLength.
+ // If it does, it chops the end off the segment, writes the segment with a
+ // separator of ",-/", and then rewrites segment to contain just the
+ // truncated piece so it can be used in the next iteration.
+ // |dir_separator| is "/" on Unix, "\" on Windows.
+ // |segment| is a read/write parameter containing the segment to write.
+ static void AppendSegment(
+ char dir_separator,
+ std::string* segment,
+ std::string* dest);
+
+ // Escapes the given input |path| and chops any individual components
// of the path which are greater than kMaximumSubdirectoryLength characters
// into two chunks.
static std::string Escape(const std::string& path) {
std::string output;
- int last_slash = 0;
- for (size_t index = 0; index < path.length(); index++) {
- char ch = path[index];
- if (ch == 0x5C)
- last_slash = index;
- if ((ch == 0x2D) || // hyphen
- (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore
- ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9]
- ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z]
- ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z]
- output.append(&path[index],1);
- } else {
- char encoded[3];
- encoded[0] = 'x';
- encoded[1] = ch / 16;
- encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
- encoded[2] = ch % 16;
- encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
- output.append(encoded, 3);
- }
- if (index - last_slash > kMaximumSubdirectoryLength) {
- char backslash = '\\';
- output.append(&backslash, 1);
- last_slash = index;
- }
- }
+ EncodeSegment("", path, '\\', &output);
return output;
}
+ // Allow reading of old slurped files.
+ static std::string LegacyEscape(const std::string& path);
+
// Replace all instances of |from| within |str| with |to|.
- static void ReplaceAll(const std::string& from,
- const std::string& to,
- std::string* str) {
+ static void ReplaceAll(std::string* str, const std::string& from,
+ const std::string& to) {
std::string::size_type pos(0);
- while((pos = str->find(from, pos)) != std::string::npos) {
+ while ((pos = str->find(from, pos)) != std::string::npos) {
str->replace(pos, from.size(), to);
pos += from.size();
}
@@ -100,21 +188,20 @@ class UrlToFilenameEncoder {
// Replace all instances of "/" with "\" in |path|.
static void ConvertToSlashes(std::string* path) {
- static const char slash[] = { '/', '\0' };
- static const char backslash[] = { '\\', '\0' };
- ReplaceAll(slash, backslash, path);
+ const std::string slash("/");
+ const std::string backslash("\\");
+ ReplaceAll(path, slash, backslash);
}
// Replace all instances of "\\" with "%5C%5C" in |path|.
static void StripDoubleSlashes(std::string* path) {
- static const char doubleslash[] = { '\\', '\\', '\0' };
- static const char escaped_doubleslash[] =
- { '%', '5', 'C', '%', '5', 'C','\0' };
- ReplaceAll(doubleslash, escaped_doubleslash, path);
+ const std::string doubleslash("\\\\");
+ const std::string escaped_doubleslash("%5C%5C");
+ ReplaceAll(path, doubleslash, escaped_doubleslash);
}
};
-} // namespace net
+} // namespace net
-#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H__
+#endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILE_ENCODER_H_
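
The updated Encode() entry point now takes the base path as a string plus a legacy_escape flag instead of a FilePath, mirroring the call-site change in cache_dumper.cc above. A minimal sketch of a caller, using a hypothetical helper name DumpPathForUrl:

#include <string>

#include "net/tools/dump_cache/url_to_filename_encoder.h"

// Hypothetical helper: maps a cached URL to a file path under |base|.
// Passing false selects the new reversible encoding; passing true keeps
// the legacy format used by previously slurped data.
std::string DumpPathForUrl(const std::string& url, const std::string& base) {
  return net::UrlToFilenameEncoder::Encode(url, base, false);
}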
diff --git a/net/tools/dump_cache/url_to_filename_encoder_unittest.cc b/net/tools/dump_cache/url_to_filename_encoder_unittest.cc
new file mode 100644
index 0000000..32cef99
--- /dev/null
+++ b/net/tools/dump_cache/url_to_filename_encoder_unittest.cc
@@ -0,0 +1,268 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/tools/dump_cache/url_to_filename_encoder.h"
+
+#include <string>
+#include <vector>
+#include "base/string_piece.h"
+#include "base/string_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using base::StringPiece;
+using std::string;
+
+namespace net {
+
+// The escape character choice is made here -- all code and tests in this
+// directory are based off of this constant. However, our test data
+// has tons of dependencies on this, so it cannot be changed without
+// re-running those tests and fixing them.
+const char kTruncationChar = '-';
+const char kEscapeChar = ',';
+const size_t kMaximumSubdirectoryLength = 128;
+
+class UrlToFilenameEncoderTest : public ::testing::Test {
+ protected:
+ UrlToFilenameEncoderTest() : escape_(1, kEscapeChar) {}
+
+ void CheckSegmentLength(const StringPiece& escaped_word) {
+ std::vector<StringPiece> components;
+ Tokenize(escaped_word, StringPiece("/"), &components);
+ for (size_t i = 0; i < components.size(); ++i) {
+ EXPECT_GE(kMaximumSubdirectoryLength,
+ components[i].size());
+ }
+ }
+
+ void CheckValidChars(const StringPiece& escaped_word) {
+ // These characters are invalid in Windows. We will
+ // ignore / for this test, but add in ', as that's pretty
+ // inconvenient in a Unix filename.
+ //
+ // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
+ static const char kInvalidChars[] = "<>:\"\\|?*'";
+ for (size_t i = 0; i < escaped_word.size(); ++i) {
+ char c = escaped_word[i];
+ EXPECT_EQ(NULL, strchr(kInvalidChars, c));
+ EXPECT_NE('\0', c); // only invalid character in Posix
+ EXPECT_GT(0x7E, c); // only English printable characters
+ }
+ }
+
+ void Validate(const string& in_word, const string& gold_word) {
+ string escaped_word, url;
+ UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word);
+ EXPECT_EQ(gold_word, escaped_word);
+ CheckSegmentLength(escaped_word);
+ CheckValidChars(escaped_word);
+ UrlToFilenameEncoder::Decode(escaped_word, '/', &url);
+ EXPECT_EQ(in_word, url);
+ }
+
+ void ValidateAllSegmentsSmall(const string& in_word) {
+ string escaped_word, url;
+ UrlToFilenameEncoder::EncodeSegment("", in_word, '/', &escaped_word);
+ CheckSegmentLength(escaped_word);
+ CheckValidChars(escaped_word);
+ UrlToFilenameEncoder::Decode(escaped_word, '/', &url);
+ EXPECT_EQ(in_word, url);
+ }
+
+ void ValidateNoChange(const string& word) {
+ // We always suffix the leaf with kEscapeChar, unless the leaf is empty.
+ Validate(word, word + escape_);
+ }
+
+ void ValidateEscaped(unsigned char ch) {
+ // We always suffix the leaf with kEscapeChar, unless the leaf is empty.
+ char escaped[100];
+ const char escape = kEscapeChar;
+ base::snprintf(escaped, sizeof(escaped), "%c%02X%c", escape, ch, escape);
+ Validate(string(1, ch), escaped);
+ }
+
+ string escape_;
+};
+
+TEST_F(UrlToFilenameEncoderTest, DoesNotEscape) {
+ ValidateNoChange("");
+ ValidateNoChange("abcdefg");
+ ValidateNoChange("abcdefghijklmnopqrstuvwxyz");
+ ValidateNoChange("ZYXWVUT");
+ ValidateNoChange("ZYXWVUTSRQPONMLKJIHGFEDCBA");
+ ValidateNoChange("01234567689");
+ ValidateNoChange("/-_");
+ ValidateNoChange("abcdefghijklmnopqrstuvwxyzZYXWVUTSRQPONMLKJIHGFEDCBA"
+ "01234567689/-_");
+ ValidateNoChange("index.html");
+ ValidateNoChange("/");
+ ValidateNoChange("/.");
+ ValidateNoChange(".");
+ ValidateNoChange("..");
+ ValidateNoChange("%");
+ ValidateNoChange("=");
+ ValidateNoChange("+");
+ ValidateNoChange("_");
+}
+
+TEST_F(UrlToFilenameEncoderTest, Escapes) {
+ ValidateEscaped('!');
+ ValidateEscaped('"');
+ ValidateEscaped('#');
+ ValidateEscaped('$');
+ ValidateEscaped('&');
+ ValidateEscaped('(');
+ ValidateEscaped(')');
+ ValidateEscaped('*');
+ ValidateEscaped(',');
+ ValidateEscaped(':');
+ ValidateEscaped(';');
+ ValidateEscaped('<');
+ ValidateEscaped('>');
+ ValidateEscaped('@');
+ ValidateEscaped('[');
+ ValidateEscaped('\'');
+ ValidateEscaped('\\');
+ ValidateEscaped(']');
+ ValidateEscaped('^');
+ ValidateEscaped('`');
+ ValidateEscaped('{');
+ ValidateEscaped('|');
+ ValidateEscaped('}');
+ ValidateEscaped('~');
+
+ // check non-printable characters
+ ValidateEscaped('\0');
+ for (int i = 127; i < 256; ++i) {
+ ValidateEscaped(static_cast<char>(i));
+ }
+}
+
+TEST_F(UrlToFilenameEncoderTest, DoesEscapeCorrectly) {
+ Validate("mysite.com&x", "mysite.com" + escape_ + "26x" + escape_);
+ Validate("/./", "/" + escape_ + "./" + escape_);
+ Validate("/../", "/" + escape_ + "../" + escape_);
+ Validate("//", "/" + escape_ + "/" + escape_);
+ Validate("/./leaf", "/" + escape_ + "./leaf" + escape_);
+ Validate("/../leaf", "/" + escape_ + "../leaf" + escape_);
+ Validate("//leaf", "/" + escape_ + "/leaf" + escape_);
+ Validate("mysite/u?param1=x&param2=y",
+ "mysite/u" + escape_ + "3Fparam1=x" + escape_ + "26param2=y" +
+ escape_);
+ Validate("search?q=dogs&go=&form=QBLH&qs=n", // from Latency Labs bing test.
+ "search" + escape_ + "3Fq=dogs" + escape_ + "26go=" + escape_ +
+ "26form=QBLH" + escape_ + "26qs=n" + escape_);
+ Validate("~joebob/my_neeto-website+with_stuff.asp?id=138&content=true",
+ "" + escape_ + "7Ejoebob/my_neeto-website+with_stuff.asp" + escape_ +
+ "3Fid=138" + escape_ + "26content=true" + escape_);
+}
+
+TEST_F(UrlToFilenameEncoderTest, LongTail) {
+ static char long_word[] =
+ "~joebob/briggs/12345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890"
+ "1234567890123456789012345678901234567890123456789012345678901234567890";
+
+ // the long lines in the string below are 64 characters, so we can see
+ // the slashes every 128.
+ string gold_long_word =
+ escape_ + "7Ejoebob/briggs/"
+ "1234567890123456789012345678901234567890123456789012345678901234"
+ "56789012345678901234567890123456789012345678901234567890123456" +
+ escape_ + "-/"
+ "7890123456789012345678901234567890123456789012345678901234567890"
+ "12345678901234567890123456789012345678901234567890123456789012" +
+ escape_ + "-/"
+ "3456789012345678901234567890123456789012345678901234567890123456"
+ "78901234567890123456789012345678901234567890123456789012345678" +
+ escape_ + "-/"
+ "9012345678901234567890" + escape_;
+ EXPECT_LT(kMaximumSubdirectoryLength,
+ sizeof(long_word));
+ Validate(long_word, gold_long_word);
+}
+
+TEST_F(UrlToFilenameEncoderTest, LongTailQuestion) {
+ // Here the '?' in the last path segment expands to ",3F", making
+ // it hit 128 chars before the input segment gets that big.
+ static char long_word[] =
+ "~joebob/briggs/1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?"
+ "1234567?1234567?1234567?1234567?1234567?1234567?1234567?";
+
+ // Notice that at the end of the third segment, we avoid splitting
+ // the (escape_ + "3F") that was generated from the "?", so that segment is
+ // only 127 characters.
+ string pattern = "1234567" + escape_ + "3F"; // 10 characters
+ string gold_long_word =
+ escape_ + "7Ejoebob/briggs/" +
+ pattern + pattern + pattern + pattern + pattern + pattern + "1234"
+ "567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
+ "123456" + escape_ + "-/"
+ "7" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern +
+ pattern + pattern + pattern + pattern + pattern + pattern + pattern +
+ "12" +
+ escape_ + "-/"
+ "34567" + escape_ + "3F" + pattern + pattern + pattern + pattern + pattern
+ + "1234567" + escape_ + "3F" + pattern + pattern + pattern + pattern
+ + pattern + "1234567" +
+ escape_ + "-/" +
+ escape_ + "3F" + pattern + pattern + escape_;
+ EXPECT_LT(kMaximumSubdirectoryLength,
+ sizeof(long_word));
+ Validate(long_word, gold_long_word);
+}
+
+TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenNoEscape) {
+ // hit corner cases, +/- 4 characters from kMaxLen
+ for (int i = -4; i <= 4; ++i) {
+ string input;
+ input.append(i + kMaximumSubdirectoryLength, 'x');
+ ValidateAllSegmentsSmall(input);
+ }
+}
+
+TEST_F(UrlToFilenameEncoderTest, CornerCasesNearMaxLenWithEscape) {
+ // hit corner cases, +/- 4 characters from kMaxLen. This time we
+ // leave off the last 'x' and put in a '.', which ensures that we
+ // are truncating with '/' *after* the expansion.
+ for (int i = -4; i <= 4; ++i) {
+ string input;
+ input.append(i + kMaximumSubdirectoryLength - 1, 'x');
+ input.append(1, '.'); // this will expand to 3 characters.
+ ValidateAllSegmentsSmall(input);
+ }
+}
+
+TEST_F(UrlToFilenameEncoderTest, LeafBranchAlias) {
+ Validate("/a/b/c", "/a/b/c" + escape_); // c is leaf file "c,"
+ Validate("/a/b/c/d", "/a/b/c/d" + escape_); // c is directory "c"
+ Validate("/a/b/c/d/", "/a/b/c/d/" + escape_);
+}
+
+
+TEST_F(UrlToFilenameEncoderTest, BackslashSeparator) {
+ string long_word;
+ string escaped_word;
+ long_word.append(kMaximumSubdirectoryLength + 1, 'x');
+ UrlToFilenameEncoder::EncodeSegment("", long_word, '\\', &escaped_word);
+
+ // Check that one backslash, plus the escape ",-", and the ending ',' got added.
+ EXPECT_EQ(long_word.size() + 4, escaped_word.size());
+ ASSERT_LT(kMaximumSubdirectoryLength,
+ escaped_word.size());
+ // Check that the backslash got inserted at the correct spot.
+ EXPECT_EQ('\\', escaped_word[
+ kMaximumSubdirectoryLength]);
+}
+
+}  // namespace net
+
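
The LongTail tests above exercise the ",-/" truncation marker. A small sketch of that behavior, assuming a standalone program built against the encoder header; the segment lengths follow kMaximumSubdirectoryLength = 128 as defined in url_to_filename_encoder.cc.

#include <iostream>
#include <string>

#include "net/tools/dump_cache/url_to_filename_encoder.h"

int main() {
  std::string encoded;
  std::string long_segment(130, 'x');  // longer than one subdirectory allows
  net::UrlToFilenameEncoder::EncodeSegment("", long_segment, '/', &encoded);
  // The first 126 'x's are written, ",-/" marks an artificial break that
  // Decode() strips back out, and the remaining "xxxx," forms the leaf.
  std::cout << encoded << std::endl;

  std::string decoded;
  net::UrlToFilenameEncoder::Decode(encoded, '/', &decoded);
  std::cout << (decoded == long_segment) << std::endl;  // prints 1
  return 0;
}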
diff --git a/net/tools/dump_cache/url_utilities.h b/net/tools/dump_cache/url_utilities.h
new file mode 100644
index 0000000..4de95dc
--- /dev/null
+++ b/net/tools/dump_cache/url_utilities.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_
+#define NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_
+
+#include <string>
+
+namespace net {
+
+namespace UrlUtilities {
+
+// Gets the host from a URL, stripping the port number as well if the URL
+// has one.
+// For example: calling GetUrlHost(www.foo.com:8080/boo) returns www.foo.com
+static std::string GetUrlHost(const std::string& url) {
+ size_t b = url.find("//");
+ if (b == std::string::npos)
+ b = 0;
+ else
+ b += 2;
+ size_t next_slash = url.find_first_of('/', b);
+ size_t next_colon = url.find_first_of(':', b);
+ if (next_slash != std::string::npos
+ && next_colon != std::string::npos
+ && next_colon < next_slash) {
+ return std::string(url, b, next_colon - b);
+ }
+ if (next_slash == std::string::npos) {
+ if (next_colon != std::string::npos) {
+ return std::string(url, b, next_colon - b);
+ } else {
+ next_slash = url.size();
+ }
+ }
+ return std::string(url, b, next_slash - b);
+}
+
+// Gets the path portion of a URL,
+// e.g. http://www.foo.com/path
+// returns /path
+static std::string GetUrlPath(const std::string& url) {
+ size_t b = url.find("//");
+ if (b == std::string::npos)
+ b = 0;
+ else
+ b += 2;
+ b = url.find("/", b);
+ if (b == std::string::npos)
+ return "/";
+
+ size_t e = url.find("#", b+1);
+ if (e != std::string::npos)
+ return std::string(url, b, (e - b));
+ return std::string(url, b);
+}
+
+} // namespace UrlUtilities
+
+} // namespace net
+
+#endif // NET_TOOLS_DUMP_CACHE_URL_UTILITIES_H_
+
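
The two helpers in url_utilities.h replace the GURL calls that Encode() used before this change. A minimal usage sketch, assuming a standalone program built against the new header:

#include <iostream>
#include <string>

#include "net/tools/dump_cache/url_utilities.h"

int main() {
  const std::string url = "http://www.foo.com:8080/bar?q=1";
  // The port is stripped from the host; the path keeps the query string.
  std::cout << net::UrlUtilities::GetUrlHost(url) << std::endl;  // www.foo.com
  std::cout << net::UrlUtilities::GetUrlPath(url) << std::endl;  // /bar?q=1
  return 0;
}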