summaryrefslogtreecommitdiffstats
path: root/net/base/escape.cc
diff options
context:
space:
mode:
authormeacer <meacer@chromium.org>2014-10-21 17:44:52 -0700
committerCommit bot <commit-bot@chromium.org>2014-10-22 04:03:12 +0000
commitc62e2eeb225afbc123db27c2d3500a7bbbab6a48 (patch)
treec78a981e910f764292c09e2ee7e7da1eb49a504b /net/base/escape.cc
parent82f6f9885248ba9ece56af554fe8a087b74bdf02 (diff)
downloadchromium_src-c62e2eeb225afbc123db27c2d3500a7bbbab6a48.zip
chromium_src-c62e2eeb225afbc123db27c2d3500a7bbbab6a48.tar.gz
chromium_src-c62e2eeb225afbc123db27c2d3500a7bbbab6a48.tar.bz2
Unescape BiDi control chars while parsing data: urls
The fix for bug 337746 prevented unescaping of BiDi control characters in URLs. This breaks the loading of data: URLs because BiDi control chars appear escaped in the loaded HTML. This patch adds a special case for the parsing of BiDi control chars. This shouldn't change the way URLs are shown in the omnibox or any other UI. URLs with BiDi control characters are always displayed as escaped in the omnibox. This behavior is also consistent with Firefox. BUG=423901 Review URL: https://codereview.chromium.org/643963004 Cr-Commit-Position: refs/heads/master@{#300584}
Diffstat (limited to 'net/base/escape.cc')
-rw-r--r--net/base/escape.cc74
1 files changed, 53 insertions, 21 deletions
diff --git a/net/base/escape.cc b/net/base/escape.cc
index 1798b6c..3c8adc6 100644
--- a/net/base/escape.cc
+++ b/net/base/escape.cc
@@ -120,6 +120,44 @@ bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
return false;
}
+// Returns true if there is an Arabic Language Mark at |index|. |first_byte|
+// is the byte at |index|.
+template<typename STR>
+bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
+ unsigned char first_byte,
+ size_t index) {
+ if (first_byte != 0xD8)
+ return false;
+ unsigned char second_byte;
+ if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
+ return false;
+ return second_byte == 0x9c;
+}
+
+// Returns true if there is a BiDi control char at |index|. |first_byte| is the
+// byte at |index|.
+template<typename STR>
+bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
+ unsigned char first_byte,
+ size_t index) {
+ if (first_byte != 0xE2)
+ return false;
+ unsigned char second_byte;
+ if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
+ return false;
+ if (second_byte != 0x80 && second_byte != 0x81)
+ return false;
+ unsigned char third_byte;
+ if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
+ return false;
+ if (second_byte == 0x80) {
+ return third_byte == 0x8E ||
+ third_byte == 0x8F ||
+ (third_byte >= 0xAA && third_byte <= 0xAE);
+ }
+ return third_byte >= 0xA6 && third_byte <= 0xA9;
+}
+
// Unescapes |escaped_text| according to |rules|, returning the resulting
// string. Fills in an |adjustments| parameter, if non-NULL, so it reflects
// the alterations done to the string that are not one-character-to-one-
@@ -172,27 +210,21 @@ STR UnescapeURLWithAdjustmentsImpl(
// U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
// U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
// U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
-
- unsigned char second_byte;
- // Check for ALM.
- if ((first_byte == 0xD8) &&
- UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&
- (second_byte == 0x9c)) {
- result.append(escaped_text, i, 6);
- i += 5;
- continue;
- }
-
- // Check for other BiDi control characters.
- if ((first_byte == 0xE2) &&
- UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&
- ((second_byte == 0x80) || (second_byte == 0x81))) {
- unsigned char third_byte;
- if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) &&
- ((second_byte == 0x80) ?
- ((third_byte == 0x8E) || (third_byte == 0x8F) ||
- ((third_byte >= 0xAA) && (third_byte <= 0xAE))) :
- ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) {
+ //
+ // However, some schemes such as data: and file: need to parse the exact
+ // binary data when loading the URL. For that reason, CONTROL_CHARS allows
+ // unescaping BiDi control characters.
+ // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed
+ // in the UI.
+ if (!(rules & UnescapeRule::CONTROL_CHARS)) {
+ if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
+ // Keep Arabic Language Mark escaped.
+ result.append(escaped_text, i, 6);
+ i += 5;
+ continue;
+ }
+ if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {
+ // Keep BiDi control char escaped.
result.append(escaped_text, i, 9);
i += 8;
continue;