summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorbrettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98>2008-07-29 16:36:36 +0000
committerbrettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98>2008-07-29 16:36:36 +0000
commita9c217fc61aaa2aff80877836264528cf9870ae8 (patch)
tree09485ea0a3134b2efbfce170f73b063447ed473a /net
parent4e1fd027999bf8148458d5c14ac6345d79227cfd (diff)
downloadchromium_src-a9c217fc61aaa2aff80877836264528cf9870ae8.zip
chromium_src-a9c217fc61aaa2aff80877836264528cf9870ae8.tar.gz
chromium_src-a9c217fc61aaa2aff80877836264528cf9870ae8.tar.bz2
Puts back the optional unescaping of control characters and URL parse-affecting characters. That patch was reverted due to build problems.
This is heavily modified from the original patch. That patch required an additional function and a bunch of internal boolean flags. This one uses the new flags enum I wrote to add this to the existing functionality more cleanly. BUG=1271340 BUG=1258819 Review URL: http://chrome-reviews.prom.corp.google.com/804 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@66 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net')
-rw-r--r--net/base/data_url.cc6
-rw-r--r--net/base/escape.cc44
-rw-r--r--net/base/escape.h21
-rw-r--r--net/base/escape_unittest.cc31
-rw-r--r--net/base/net_util.cc8
-rw-r--r--net/url_request/url_request_ftp_job.cc4
6 files changed, 86 insertions, 28 deletions
diff --git a/net/base/data_url.cc b/net/base/data_url.cc
index cf8e239..95d31e7 100644
--- a/net/base/data_url.cc
+++ b/net/base/data_url.cc
@@ -97,7 +97,8 @@ bool DataURL::Parse(const GURL& url, std::string* mime_type,
// could be part of the payload, so don't strip it.
if (base64_encoded) {
temp_data = UnescapeURLComponent(temp_data,
- UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS |
+ UnescapeRule::CONTROL_CHARS);
}
// Strip whitespace.
@@ -110,7 +111,8 @@ bool DataURL::Parse(const GURL& url, std::string* mime_type,
if (!base64_encoded) {
temp_data = UnescapeURLComponent(temp_data,
- UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS |
+ UnescapeRule::CONTROL_CHARS);
}
if (base64_encoded)
diff --git a/net/base/escape.cc b/net/base/escape.cc
index bd4aa95..330a3ed 100644
--- a/net/base/escape.cc
+++ b/net/base/escape.cc
@@ -81,7 +81,6 @@ class Charmap {
uint32 map_[8];
};
-
// Given text to escape and a Charmap defining which values to escape,
// return an escaped string. If use_plus is true, spaces are converted
// to +, otherwise, if spaces are in the charmap, they are converted to
@@ -105,6 +104,32 @@ const std::string Escape(const std::string& text, const Charmap& charmap,
return escaped;
}
+// Lookup table indexed by 7-bit ASCII value. An entry is nonzero when the
+// corresponding character can always be unescaped in normal URLs. Characters
+// with a zero entry may change the parsing of a URL, so they stay escaped
+// unless the caller passes the matching UnescapeRule flag. In many cases we
+// won't want to unescape spaces, but that is controlled by the rules
+// parameter to Unescape*.
+//
+// The basic rule is that we can't unescape anything that would change parsing,
+// like # or ?. We also can't unescape &, =, or + since they could be part of a
+// query and unescaping them could change the server's parsing of the query.
+//
+// NOTE(review): '/' is marked 1 (always unescaped) here, while the
+// URL_SPECIAL_CHARS comment in escape.h lists '/' among the characters that
+// change the meaning of a URL -- confirm which one is intended.
+const char kUrlUnescape[128] = {
+// NUL and the other control chars (0x00-0x1F)...
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+// ' ' ! " # $ % & ' ( ) * + , - . /
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
+// @ A B C D E F G H I J K L M N O
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+// P Q R S T U V W X Y Z [ \ ] ^ _
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+// ` a b c d e f g h i j k l m n o
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+// p q r s t u v w x y z { | } ~ <DEL>
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
+};
+
std::string UnescapeURLImpl(const std::string& escaped_text,
UnescapeRule::Type rules) {
// The output of the unescaping is always smaller than the input, so we can
@@ -121,21 +146,34 @@ std::string UnescapeURLImpl(const std::string& escaped_text,
if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) {
unsigned char value = HexToInt(most_sig_digit) * 16 +
HexToInt(least_sig_digit);
- if (((rules & UnescapeRule::PERCENTS) || value != '%') &&
- ((rules & UnescapeRule::SPACES) || value != ' ')) {
+ if (value >= 0x80 || // Unescape all high-bit characters.
+ // For 7-bit characters, the lookup table tells us all valid chars.
+ (kUrlUnescape[value] ||
+ // ...and we allow some additional unescaping when flags are set.
+ (value == ' ' && (rules & UnescapeRule::SPACES)) ||
+ // Allow any of the prohibited but non-control characters when
+ // we're doing "special" chars.
+ (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
+ // Additionally allow control characters if requested.
+ (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
// Use the unescaped version of the character.
result.push_back(value);
i += 2;
} else {
+ // Keep escaped. Append a percent and we'll pick up the following two
+ // digits on the next iterations of the loop.
result.push_back('%');
}
} else {
+ // Invalid escape sequence, just pass the percent through and continue
+ // right after it.
result.push_back('%');
}
} else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
escaped_text[i] == '+') {
result.push_back(' ');
} else {
+ // Normal case for unescaped characters.
result.push_back(escaped_text[i]);
}
}
diff --git a/net/base/escape.h b/net/base/escape.h
index 220eebc..4b86a64 100644
--- a/net/base/escape.h
+++ b/net/base/escape.h
@@ -77,17 +77,20 @@ class UnescapeRule {
// by other applications.
SPACES = 1,
- // Unescapes "%25" to "%". This must not be used when the resulting string
- // will need to be interpreted as a URL again, since we won't know what
- // should be escaped and what shouldn't. For example, "%2520" would be
- // converted to "%20" which would have different meaning than the origina.
- // This flag is used when generating final output like filenames for URLs
- // where we won't be interpreting as a URL and want to do as much unescaping
- // as possible.
- PERCENTS = 2,
+ // Unescapes various characters that will change the meaning of URLs,
+ // including '%', '+', '&', '/', '#'. If we unescape these characters, the
+ // resulting URL won't be the same as the source one. This flag is used when
+ // generating final output like filenames for URLs where we won't be
+ // interpreting as a URL and want to do as much unescaping as possible.
+ URL_SPECIAL_CHARS = 2,
+
+ // Unescapes control characters such as %01. This INCLUDES NULLs! This is
+ // used for rare cases such as data: URL decoding where the result is binary
+ // data. You should not use this for normal URLs!
+ CONTROL_CHARS = 4,
// URL queries use "+" for space. This flag controls that replacement.
- REPLACE_PLUS_WITH_SPACE = 4,
+ REPLACE_PLUS_WITH_SPACE = 8,
};
};
diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc
index d2d0288..53100a91 100644
--- a/net/base/escape_unittest.cc
+++ b/net/base/escape_unittest.cc
@@ -121,11 +121,18 @@ TEST(Escape, UnescapeURLComponent) {
{"Invalid %escape %2", UnescapeRule::NORMAL, "Invalid %escape %2"},
{"Some%20random text %25%3bOK", UnescapeRule::NORMAL, "Some%20random text %25;OK"},
{"Some%20random text %25%3bOK", UnescapeRule::SPACES, "Some random text %25;OK"},
- {"Some%20random text %25%3bOK", UnescapeRule::PERCENTS, "Some%20random text %;OK"},
- {"Some%20random text %25%3bOK", UnescapeRule::SPACES | UnescapeRule::PERCENTS, "Some random text %;OK"},
- {"%01%02%03%04%05%06%07%08%09", UnescapeRule::NORMAL, "\x01\x02\x03\x04\x05\x06\x07\x08\x09"},
+ {"Some%20random text %25%3bOK", UnescapeRule::URL_SPECIAL_CHARS, "Some%20random text %;OK"},
+ {"Some%20random text %25%3bOK", UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, "Some random text %;OK"},
{"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, "\xA0\xB1\xC2\xD3\xE4\xF5"},
- {"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, "\xAa\xBb\xCc\xDd\xEe\xFf"}
+ {"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, "\xAa\xBb\xCc\xDd\xEe\xFf"},
+ // Certain URL-sensitive characters should not be unescaped unless asked.
+ {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES, "Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"},
+ {"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::URL_SPECIAL_CHARS, "Hello%20%13%10world ## ?? == && %% ++"},
+ // Control characters.
+ {"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS, "%01%02%03%04%05%06%07%08%09 %"},
+ {"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS, "\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"},
+ {"Hello%20%13%10%02", UnescapeRule::SPACES, "Hello %13%10%02"},
+ {"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS, "Hello%20\x13\x10\x02"},
};
for (int i = 0; i < arraysize(unescape_cases); i++) {
@@ -134,17 +141,23 @@ TEST(Escape, UnescapeURLComponent) {
UnescapeURLComponent(str, unescape_cases[i].rules));
}
- // test the NULL character escaping (which wouldn't work above since those
- // are just char pointers)
+ // Test the NULL character unescaping (which wouldn't work above since those
+ // are just char pointers).
std::string input("Null");
input.push_back(0); // Also have a NULL in the input.
input.append("%00%39Test");
+ // When we're unescaping NULLs
std::string expected("Null");
expected.push_back(0);
expected.push_back(0);
expected.append("9Test");
+ EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS));
+ // When we're not unescaping NULLs.
+ expected = "Null";
+ expected.push_back(0);
+ expected.append("%009Test");
EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL));
}
@@ -178,9 +191,9 @@ TEST(Escape, UnescapeAndDecodeURLComponent) {
"Some random text %25;OK",
L"Some random text %25;OK"},
{"UTF8", "%01%02%03%04%05%06%07%08%09",
- "\x01\x02\x03\x04\x05\x06\x07\x08\x09",
- "\x01\x02\x03\x04\x05\x06\x07\x08\x09",
- L"\x01\x02\x03\x04\x05\x06\x07\x08\x09"},
+ "%01%02%03%04%05%06%07%08%09",
+ "%01%02%03%04%05%06%07%08%09",
+ L"%01%02%03%04%05%06%07%08%09"},
{"UTF8", "%E4%BD%A0+%E5%A5%BD",
"\xE4\xBD\xA0+\xE5\xA5\xBD",
"\xE4\xBD\xA0 \xE5\xA5\xBD",
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
index 416252c..68570006 100644
--- a/net/base/net_util.cc
+++ b/net/base/net_util.cc
@@ -715,7 +715,7 @@ bool FileURLToFilePath(const GURL& url, std::wstring* file_path) {
// GURL stores strings as percent-encoded UTF-8, this will undo if possible.
path = UnescapeURLComponent(path,
- UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
if (!IsStringUTF8(path.c_str())) {
// Not UTF-8, assume encoding is native codepage and we're done. We know we
@@ -937,9 +937,11 @@ std::wstring GetSuggestedFilename(const GURL& url,
TrimString(filename, L".", &filename);
}
if (filename.empty()) {
- if (url.is_valid())
+ if (url.is_valid()) {
filename = UnescapeAndDecodeUTF8URLComponent(
- url.ExtractFileName(), UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+ url.ExtractFileName(),
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
+ }
}
// Trim '.' once more.
diff --git a/net/url_request/url_request_ftp_job.cc b/net/url_request/url_request_ftp_job.cc
index f619609..202d1fb 100644
--- a/net/url_request/url_request_ftp_job.cc
+++ b/net/url_request/url_request_ftp_job.cc
@@ -61,7 +61,7 @@ static bool UnescapeAndValidatePath(const URLRequest* request,
// we need to identify the encoding and convert to that encoding.
static const std::string kInvalidChars("\x00\x0d\x0a", 3);
*unescaped_path = UnescapeURLComponent(request->url().path(),
- UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
if (unescaped_path->find_first_of(kInvalidChars) != std::string::npos) {
SetLastError(ERROR_INTERNET_INVALID_URL);
// GURL path should not contain '%00' which is NULL(0x00) when unescaped.
@@ -416,7 +416,7 @@ void URLRequestFtpJob::OnStartDirectoryTraversal() {
// Unescape the URL path and pass the raw 8bit directly to the browser.
string html = net_util::GetDirectoryListingHeader(
UnescapeURLComponent(request_->url().path(),
- UnescapeRule::SPACES | UnescapeRule::PERCENTS));
+ UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
// If this isn't top level directory (i.e. the path isn't "/",) add a link to
// the parent directory.