diff options
author | meacer <meacer@chromium.org> | 2014-10-21 17:44:52 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2014-10-22 04:03:12 +0000 |
commit | c62e2eeb225afbc123db27c2d3500a7bbbab6a48 (patch) | |
tree | c78a981e910f764292c09e2ee7e7da1eb49a504b /net/base | |
parent | 82f6f9885248ba9ece56af554fe8a087b74bdf02 (diff) | |
download | chromium_src-c62e2eeb225afbc123db27c2d3500a7bbbab6a48.zip chromium_src-c62e2eeb225afbc123db27c2d3500a7bbbab6a48.tar.gz chromium_src-c62e2eeb225afbc123db27c2d3500a7bbbab6a48.tar.bz2 |
Unescape BiDi control chars while parsing data: urls
The fix for bug 337746 prevented unescaping of BiDi control
characters in URLs. This breaks the loading of data: URLs
because BiDi control chars appear escaped in the loaded HTML.
This patch adds a special case for the parsing of BiDi control
chars. This shouldn't change the way URLs are shown in the omnibox
or any other UI. URLs with BiDi control characters are always
displayed as escaped in the omnibox. This behavior is also
consistent with Firefox.
BUG=423901
Review URL: https://codereview.chromium.org/643963004
Cr-Commit-Position: refs/heads/master@{#300584}
Diffstat (limited to 'net/base')
-rw-r--r-- | net/base/data_url_unittest.cc | 38 | ||||
-rw-r--r-- | net/base/escape.cc | 74 | ||||
-rw-r--r-- | net/base/escape.h | 5 | ||||
-rw-r--r-- | net/base/escape_unittest.cc | 28 |
4 files changed, 122 insertions, 23 deletions
diff --git a/net/base/data_url_unittest.cc b/net/base/data_url_unittest.cc index 3876301..bcb2b49 100644 --- a/net/base/data_url_unittest.cc +++ b/net/base/data_url_unittest.cc @@ -184,6 +184,44 @@ TEST(DataURLTest, Parse) { "", "" }, + // BiDi control characters should be unescaped and preserved as is, and + // should not be replaced with % versions. In the below case, \xE2\x80\x8F + // is the RTL mark and the parsed text should preserve it as is. + { + "data:text/plain;charset=utf-8,\xE2\x80\x8Ftest", + true, + "text/plain", + "utf-8", + "\xE2\x80\x8Ftest"}, + + // Same as above but with Arabic text after RTL mark. + { + "data:text/plain;charset=utf-8," + "\xE2\x80\x8F\xD8\xA7\xD8\xAE\xD8\xAA\xD8\xA8\xD8\xA7\xD8\xB1", + true, + "text/plain", + "utf-8", + "\xE2\x80\x8F\xD8\xA7\xD8\xAE\xD8\xAA\xD8\xA8\xD8\xA7\xD8\xB1"}, + + // RTL mark encoded as %E2%80%8F should be unescaped too. Note that when + // wrapped in a GURL, this URL and the next effectively become the same as + // the previous two URLs. + { + "data:text/plain;charset=utf-8,%E2%80%8Ftest", + true, + "text/plain", + "utf-8", + "\xE2\x80\x8Ftest"}, + + // Same as above but with Arabic text after RTL mark. + { + "data:text/plain;charset=utf-8," + "%E2%80%8F\xD8\xA7\xD8\xAE\xD8\xAA\xD8\xA8\xD8\xA7\xD8\xB1", + true, + "text/plain", + "utf-8", + "\xE2\x80\x8F\xD8\xA7\xD8\xAE\xD8\xAA\xD8\xA8\xD8\xA7\xD8\xB1"} + // TODO(darin): add more interesting tests }; diff --git a/net/base/escape.cc b/net/base/escape.cc index 1798b6c..3c8adc6 100644 --- a/net/base/escape.cc +++ b/net/base/escape.cc @@ -120,6 +120,44 @@ bool UnescapeUnsignedCharAtIndex(const STR& escaped_text, return false; } +// Returns true if there is an Arabic Language Mark at |index|. |first_byte| +// is the byte at |index|. +template<typename STR> +bool HasArabicLanguageMarkAtIndex(const STR& escaped_text, + unsigned char first_byte, + size_t index) { + if (first_byte != 0xD8) + return false; + unsigned char second_byte; + if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte)) + return false; + return second_byte == 0x9c; +} + +// Returns true if there is a BiDi control char at |index|. |first_byte| is the +// byte at |index|. +template<typename STR> +bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text, + unsigned char first_byte, + size_t index) { + if (first_byte != 0xE2) + return false; + unsigned char second_byte; + if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte)) + return false; + if (second_byte != 0x80 && second_byte != 0x81) + return false; + unsigned char third_byte; + if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte)) + return false; + if (second_byte == 0x80) { + return third_byte == 0x8E || + third_byte == 0x8F || + (third_byte >= 0xAA && third_byte <= 0xAE); + } + return third_byte >= 0xA6 && third_byte <= 0xA9; +} + // Unescapes |escaped_text| according to |rules|, returning the resulting // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects // the alterations done to the string that are not one-character-to-one- @@ -172,27 +210,21 @@ STR UnescapeURLWithAdjustmentsImpl( // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) - - unsigned char second_byte; - // Check for ALM. - if ((first_byte == 0xD8) && - UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && - (second_byte == 0x9c)) { - result.append(escaped_text, i, 6); - i += 5; - continue; - } - - // Check for other BiDi control characters. - if ((first_byte == 0xE2) && - UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && - ((second_byte == 0x80) || (second_byte == 0x81))) { - unsigned char third_byte; - if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && - ((second_byte == 0x80) ? - ((third_byte == 0x8E) || (third_byte == 0x8F) || - ((third_byte >= 0xAA) && (third_byte <= 0xAE))) : - ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { + // + // However, some schemes such as data: and file: need to parse the exact + // binary data when loading the URL. For that reason, CONTROL_CHARS allows + // unescaping BiDi control characters. + // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed + // in the UI. + if (!(rules & UnescapeRule::CONTROL_CHARS)) { + if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) { + // Keep Arabic Language Mark escaped. + result.append(escaped_text, i, 6); + i += 5; + continue; + } + if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) { + // Keep BiDi control char escaped. result.append(escaped_text, i, 9); i += 8; continue; diff --git a/net/base/escape.h b/net/base/escape.h index 1915d24..bc58439 100644 --- a/net/base/escape.h +++ b/net/base/escape.h @@ -88,7 +88,10 @@ class UnescapeRule { // Unescapes control characters such as %01. This INCLUDES NULLs. This is // used for rare cases such as data: URL decoding where the result is binary - // data. You should not use this for normal URLs! + // data. This flag also unescapes BiDi control characters. + // + // DO NOT use CONTROL_CHARS if the URL is going to be displayed in the UI + // for security reasons. CONTROL_CHARS = 8, // URL queries use "+" for space. This flag controls that replacement. diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc index 74ae293..1e82239 100644 --- a/net/base/escape_unittest.cc +++ b/net/base/escape_unittest.cc @@ -232,7 +232,8 @@ TEST(EscapeTest, UnescapeURLComponent) { {L"Some%20random text %25%E2%80%84OK", UnescapeRule::NORMAL, L"Some%20random text %25\xE2\x80\x84OK"}, - // BiDi Control characters should not be unescaped. + // BiDi Control characters should not be unescaped unless explicity told to + // do so with UnescapeRule::CONTROL_CHARS {L"Some%20random text %25%D8%9COK", UnescapeRule::NORMAL, L"Some%20random text %25%D8%9COK"}, {L"Some%20random text %25%E2%80%8EOK", UnescapeRule::NORMAL, @@ -249,6 +250,31 @@ TEST(EscapeTest, UnescapeURLComponent) { L"Some%20random text %25%E2%81%A6OK"}, {L"Some%20random text %25%E2%81%A9OK", UnescapeRule::NORMAL, L"Some%20random text %25%E2%81%A9OK"}, + // UnescapeRule::CONTROL_CHARS should unescape BiDi Control characters. + {L"Some%20random text %25%D8%9COK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xD8\x9COK"}, + {L"Some%20random text %25%E2%80%8EOK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x80\x8EOK"}, + {L"Some%20random text %25%E2%80%8FOK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x80\x8FOK"}, + {L"Some%20random text %25%E2%80%AAOK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x80\xAAOK"}, + {L"Some%20random text %25%E2%80%ABOK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x80\xABOK"}, + {L"Some%20random text %25%E2%80%AEOK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x80\xAEOK"}, + {L"Some%20random text %25%E2%81%A6OK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x81\xA6OK"}, + {L"Some%20random text %25%E2%81%A9OK", + UnescapeRule::NORMAL | UnescapeRule::CONTROL_CHARS, + L"Some%20random text %25\xE2\x81\xA9OK"}, {L"Some%20random text %25%2dOK", UnescapeRule::SPACES, L"Some random text %25-OK"}, |