net/tools/dump_cache/url_to_filename_encoder.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211

// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// URL filename encoder goals:
//
// 1. Allow URLs with arbitrary path-segment length, generating filenames
//    with a maximum of 128 characters.
// 2. Provide a somewhat human readable filenames, for easy debugging flow.
// 3. Provide reverse-mapping from filenames back to URLs.
// 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
//    Those can all be different URLs.
// 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
//    with Facebook Connect.
//
// We need an escape-character for representing characters that are legal
// in URL paths, but not in filenames, such as '?'.
//
// We can pick any legal character as an escape, as long as we escape it too.
// But as we have a goal of having filenames that humans can correlate with
// URLs, we should pick one that doesn't show up frequently in URLs. Candidates
// are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
// shell escapes or that various build tools use.
//
// .#&%-=_+ occur frequently in URLs.
// <>:"/\|?* are illegal in Windows
//   See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
// ~`!$^&(){}[]'; are special to Unix shells
// In addition, build tools do not like ^@#%
//
// Josh took a quick look at the frequency of some special characters in
// Sadeesh's slurped directory from Fall 09 and found the following occurances:
//
//   ^   3               build tool doesn't like ^ in testdata filenames
//   @   10              build tool doesn't like @ in testdata filenames
//   .   1676            too frequent in URLs
//   ,   76              THE WINNER
//   #   0               build tool doesn't like it
//   &   487             Prefer to avoid shell escapes
//   %   374             g4 doesn't like it
//   =   579             very frequent in URLs -- leave unmodified
//   -   464             very frequent in URLs -- leave unmodified
//   _   798             very frequent in URLs -- leave unmodified
//
//
// The escaping algorithm is:
//  1) Escape all unfriendly symbols as ,XX where XX is the hex code.
//  2) Add a ',' at the end (We do not allow ',' at end of any directory name,
//     so this assures that e.g. /a and /a/b can coexist in the filesystem).
//  3) Go through the path segment by segment (where a segment is one directory
//     or leaf in the path) and
//     3a) If the segment is empty, escape the second slash. i.e. if it was
//         www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
//     3a) If it is "." or ".." prepend with ',' (so that we have a non-
//         empty and non-reserved filename).
//     3b) If it is over 128 characters, break it up into smaller segments by
//         inserting ,-/ (Windows limits paths to 128 chars, other OSes also
//         have limits that would restrict us)
//
// For example:
//     URL               File
//     /                 /,
//     /index.html       /index.html,
//     /.                /.,
//     /a/b              /a/b,
//     /a/b/             /a/b/,
//     /a/b/c            /a/b/c,   Note: no prefix problem
//     /u?foo=bar        /u,3Ffoo=bar,
//     //                /,2F,
//     /./               /,./,
//     /../              /,../,
//     /,                /,2C,
//     /,./              /,2C./,
//     /very...longname/ /very...long,-/name   If very...long is about 126 long.

// NOTE: we avoid using some classes here (like FilePath and GURL) because we
//       share this code with other projects externally.

#ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
#define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_

#include <string>

#include "base/strings/string_util.h"
#include "net/tools/dump_cache/url_utilities.h"

namespace net {

// Helper class for converting a URL into a filename.
class UrlToFilenameEncoder {
 public:
  // Given a |url| and a |base_path|, returns a filename which represents this
  // |url|. |url| may include URL escaping such as %21 for !
  // |legacy_escape| indicates that this function should use the old-style
  // of encoding.
  // TODO(mbelshe): delete the legacy_escape code.
  static std::string Encode(const std::string& url, std::string base_path,
                            bool legacy_escape) {
    std::string filename;
    if (!legacy_escape) {
      std::string url_no_scheme = UrlUtilities::GetUrlHostPath(url);
      EncodeSegment(base_path, url_no_scheme, '/', &filename);
#ifdef WIN32
      ReplaceAll(&filename, "/", "\\");
#endif
    } else {
      std::string clean_url(url);
      if (clean_url.length() && clean_url[clean_url.length()-1] == '/')
        clean_url.append("index.html");

      std::string host = UrlUtilities::GetUrlHost(clean_url);
      filename.append(base_path);
      filename.append(host);
#ifdef WIN32
      filename.append("\\");
#else
      filename.append("/");
#endif

      std::string url_filename = UrlUtilities::GetUrlPath(clean_url);
      // Strip the leading '/'.
      if (url_filename[0] == '/')
        url_filename = url_filename.substr(1);

      // Replace '/' with '\'.
      ConvertToSlashes(&url_filename);

      // Strip double back-slashes ("\\\\").
      StripDoubleSlashes(&url_filename);

      // Save path as filesystem-safe characters.
      url_filename = LegacyEscape(url_filename);
      filename.append(url_filename);

#ifndef WIN32
      // Last step - convert to native slashes.
      const std::string slash("/");
      const std::string backslash("\\");
      ReplaceAll(&filename, backslash, slash);
#endif
    }

    return filename;
  }

  // Rewrite HTML in a form that the SPDY in-memory server
  // can read.
  // |filename_prefix| is prepended without escaping.
  // |escaped_ending| is the URL to be encoded into a filename. It may have URL
  // escaped characters (like %21 for !).
  // |dir_separator| is "/" on Unix, "\" on Windows.
  // |encoded_filename| is the resultant filename.
  static void EncodeSegment(
      const std::string& filename_prefix,
      const std::string& escaped_ending,
      char dir_separator,
      std::string* encoded_filename);

  // Decodes a filename that was encoded with EncodeSegment,
  // yielding back the original URL.
  static bool Decode(const std::string& encoded_filename,
                     char dir_separator,
                     std::string* decoded_url);

  static const char kEscapeChar;
  static const char kTruncationChar;
  static const size_t kMaximumSubdirectoryLength;

  friend class UrlToFilenameEncoderTest;

 private:
  // Appends a segment of the path, special-casing "." and "..", and
  // ensuring that the segment does not exceed the path length.  If it does,
  // it chops the end off the segment, writes the segment with a separator of
  // ",-/", and then rewrites segment to contain just the truncated piece so
  // it can be used in the next iteration.
  // |segment| is a read/write parameter containing segment to write
  // Note: this should not be called with empty segment.
  static void AppendSegment(std::string* segment, std::string* dest);

  // Allow reading of old slurped files.
  static std::string LegacyEscape(const std::string& path);

  // Replace all instances of |from| within |str| as |to|.
  static void ReplaceAll(std::string* str, const std::string& from,
                         const std::string& to) {
    std::string::size_type pos(0);
    while ((pos = str->find(from, pos)) != std::string::npos) {
      str->replace(pos, from.size(), to);
      pos += from.size();
    }
  }

  // Replace all instances of "/" with "\" in |path|.
  static void ConvertToSlashes(std::string* path) {
    const std::string slash("/");
    const std::string backslash("\\");
    ReplaceAll(path, slash, backslash);
  }

  // Replace all instances of "\\" with "%5C%5C" in |path|.
  static void StripDoubleSlashes(std::string* path) {
    const std::string doubleslash("\\\\");
    const std::string escaped_doubleslash("%5C%5C");
    ReplaceAll(path, doubleslash, escaped_doubleslash);
  }
};

}  // namespace net

#endif  // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_