summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorskanuj@chromium.org <skanuj@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2014-02-28 10:42:33 +0000
committerskanuj@chromium.org <skanuj@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2014-02-28 10:42:33 +0000
commit2cd905494ba700a8e2097d614ed39ad36d1519f2 (patch)
treecffb99be0eeb5f2c5b329353f751d7447e804c1d /net
parente2dc73dede643a6b812c5e3a01dac779041a9e30 (diff)
downloadchromium_src-2cd905494ba700a8e2097d614ed39ad36d1519f2.zip
chromium_src-2cd905494ba700a8e2097d614ed39ad36d1519f2.tar.gz
chromium_src-2cd905494ba700a8e2097d614ed39ad36d1519f2.tar.bz2
Don't unescape BiDi control characters in URL components
As per http://tools.ietf.org/html/rfc3987#section-4.1, the BiDi control characters are not allowed in IRI. Add constants for the new BiDi control characters from http://www.unicode.org/reports/tr9/ in rtl.h. BUG=337746 TBR=rsleevi Review URL: https://codereview.chromium.org/181483008 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@254091 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net')
-rw-r--r--net/base/escape.cc116
-rw-r--r--net/base/escape_unittest.cc25
-rw-r--r--net/base/net_util_unittest.cc4
3 files changed, 116 insertions, 29 deletions
diff --git a/net/base/escape.cc b/net/base/escape.cc
index 134a986..6f67a5f 100644
--- a/net/base/escape.cc
+++ b/net/base/escape.cc
@@ -97,6 +97,29 @@ const char kUrlUnescape[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
};
+// Attempts to unescape the sequence at |index| within |escaped_text|. If
+// successful, sets |value| to the unescaped value. Returns whether
+// unescaping succeeded.
+template<typename STR>
+bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
+ size_t index,
+ unsigned char* value) {
+ if ((index + 2) >= escaped_text.size())
+ return false;
+ if (escaped_text[index] != '%')
+ return false;
+ const typename STR::value_type most_sig_digit(
+ static_cast<typename STR::value_type>(escaped_text[index + 1]));
+ const typename STR::value_type least_sig_digit(
+ static_cast<typename STR::value_type>(escaped_text[index + 2]));
+ if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
+ *value = HexDigitToInt(most_sig_digit) * 16 +
+ HexDigitToInt(least_sig_digit);
+ return true;
+ }
+ return false;
+}
+
template<typename STR>
STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
UnescapeRule::Type rules,
@@ -125,37 +148,72 @@ STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
continue;
}
- char current_char = static_cast<char>(escaped_text[i]);
- if (current_char == '%' && i + 2 < max) {
- const typename STR::value_type most_sig_digit(
- static_cast<typename STR::value_type>(escaped_text[i + 1]));
- const typename STR::value_type least_sig_digit(
- static_cast<typename STR::value_type>(escaped_text[i + 2]));
- if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
- unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
- HexDigitToInt(least_sig_digit);
- if (value >= 0x80 || // Unescape all high-bit characters.
- // For 7-bit characters, the lookup table tells us all valid chars.
- (kUrlUnescape[value] ||
- // ...and we allow some additional unescaping when flags are set.
- (value == ' ' && (rules & UnescapeRule::SPACES)) ||
- // Allow any of the prohibited but non-control characters when
- // we're doing "special" chars.
- (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
- // Additionally allow control characters if requested.
- (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
- // Use the unescaped version of the character.
- adjustments.push_back(i);
- result.push_back(value);
- i += 2;
- } else {
- // Keep escaped. Append a percent and we'll get the following two
- // digits on the next loops through.
- result.push_back('%');
+ unsigned char first_byte;
+ if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
+ // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
+ // control characters are not allowed to appear unescaped in URLs:
+ //
+ // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)
+ // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)
+ // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
+ // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
+ // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
+ // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
+ // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
+ //
+ // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
+ // 3987 above has since added some new BiDi control characters.
+ // http://www.unicode.org/reports/tr9
+ //
+ // U+061C ARABIC LETTER MARK (%D8%9C)
+ // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
+ // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
+ // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
+ // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
+
+ unsigned char second_byte;
+ // Check for ALM.
+ if ((first_byte == 0xD8) &&
+ UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&
+ (second_byte == 0x9c)) {
+ result.append(escaped_text, i, 6);
+ i += 5;
+ continue;
+ }
+
+ // Check for other BiDi control characters.
+ if ((first_byte == 0xE2) &&
+ UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&
+ ((second_byte == 0x80) || (second_byte == 0x81))) {
+ unsigned char third_byte;
+ if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) &&
+ ((second_byte == 0x80) ?
+ ((third_byte == 0x8E) || (third_byte == 0x8F) ||
+ ((third_byte >= 0xAA) && (third_byte <= 0xAE))) :
+ ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) {
+ result.append(escaped_text, i, 9);
+ i += 8;
+ continue;
}
+ }
+
+ if (first_byte >= 0x80 || // Unescape all high-bit characters.
+ // For 7-bit characters, the lookup table tells us all valid chars.
+ (kUrlUnescape[first_byte] ||
+ // ...and we allow some additional unescaping when flags are set.
+ (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
+ // Allow any of the prohibited but non-control characters when
+ // we're doing "special" chars.
+ (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
+ // Additionally allow control characters if requested.
+ (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
+ // Use the unescaped version of the character.
+ adjustments.push_back(i);
+ result.push_back(first_byte);
+ i += 2;
} else {
- // Invalid escape sequence, just pass the percent through and continue
- // right after it.
+ // Keep escaped. Append a percent and we'll get the following two
+ // digits on the next loops through.
result.push_back('%');
}
} else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc
index 90c246a..77d9fc2 100644
--- a/net/base/escape_unittest.cc
+++ b/net/base/escape_unittest.cc
@@ -218,6 +218,31 @@ TEST(EscapeTest, UnescapeURLComponent) {
L"Some%20random text %25%2dOK"},
{L"Some%20random text %25%2dOK", UnescapeRule::NORMAL,
L"Some%20random text %25-OK"},
+ {L"Some%20random text %25%E2%80", UnescapeRule::NORMAL,
+ L"Some%20random text %25\xE2\x80"},
+ {L"Some%20random text %25%E2%80OK", UnescapeRule::NORMAL,
+ L"Some%20random text %25\xE2\x80OK"},
+ {L"Some%20random text %25%E2%80%84OK", UnescapeRule::NORMAL,
+ L"Some%20random text %25\xE2\x80\x84OK"},
+
+ // BiDi Control characters should not be unescaped.
+ {L"Some%20random text %25%D8%9COK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%D8%9COK"},
+ {L"Some%20random text %25%E2%80%8EOK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%80%8EOK"},
+ {L"Some%20random text %25%E2%80%8FOK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%80%8FOK"},
+ {L"Some%20random text %25%E2%80%AAOK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%80%AAOK"},
+ {L"Some%20random text %25%E2%80%ABOK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%80%ABOK"},
+ {L"Some%20random text %25%E2%80%AEOK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%80%AEOK"},
+ {L"Some%20random text %25%E2%81%A6OK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%81%A6OK"},
+ {L"Some%20random text %25%E2%81%A9OK", UnescapeRule::NORMAL,
+ L"Some%20random text %25%E2%81%A9OK"},
+
{L"Some%20random text %25%2dOK", UnescapeRule::SPACES,
L"Some random text %25-OK"},
{L"Some%20random text %25%2dOK", UnescapeRule::URL_SPECIAL_CHARS,
diff --git a/net/base/net_util_unittest.cc b/net/base/net_util_unittest.cc
index 7e4c77e..a0f466d 100644
--- a/net/base/net_util_unittest.cc
+++ b/net/base/net_util_unittest.cc
@@ -2548,6 +2548,10 @@ TEST(NetUtilTest, FormatUrl) {
L"http://xn--qcka1pmc.jp/\x30B0\x30FC\x30B0\x30EB"
L"?q=\x30B0\x30FC\x30B0\x30EB", 7},
+ {"Unescape normally with BiDi control character",
+ "http://example.com/%E2%80%AEabc?q=%E2%80%8Fxy", "en", default_format_type,
+ UnescapeRule::NORMAL, L"http://example.com/%E2%80%AEabc?q=%E2%80%8Fxy", 7},
+
{"Unescape normally including unescape spaces",
"http://www.google.com/search?q=Hello%20World", "en", default_format_type,
UnescapeRule::SPACES, L"http://www.google.com/search?q=Hello World", 7},