summaryrefslogtreecommitdiffstats
path: root/url
diff options
context:
space:
mode:
authorbrettw <brettw@chromium.org>2015-11-25 16:29:35 -0800
committerCommit bot <commit-bot@chromium.org>2015-11-26 00:30:55 +0000
commit1141951b9418bf41586643be378d7f5e5c87bfca (patch)
treeeebdf0854a3d46c2e5d1079fdd742965e661ffb6 /url
parent575d1deb85ba41095599809f83dc08ffce9fd443 (diff)
downloadchromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.zip
chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.gz
chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.bz2
Handle non-ASCII escaped characters from ICU.
ICU can apparently generate non-ASCII escaped characters if name-preping generates a new escape sequence. Previously we would get confused by these and would assert. This new code marks the error and generates an escaped invalid URL. BUG=456271 NOPRESUBMIT=True (for long lines) Review URL: https://codereview.chromium.org/1471693005 Cr-Commit-Position: refs/heads/master@{#361757}
Diffstat (limited to 'url')
-rw-r--r--url/url_canon_host.cc23
-rw-r--r--url/url_canon_unittest.cc18
2 files changed, 40 insertions, 1 deletions
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc
index fce4d3a..d4cdfd5 100644
--- a/url/url_canon_host.cc
+++ b/url/url_canon_host.cc
@@ -165,6 +165,8 @@ bool DoSimpleHost(const INCHAR* host,
// Canonicalizes a host that requires IDN conversion. Returns true on success
bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
+ int original_output_len = output->length(); // So we can rewind below.
+
// We need to escape URL before doing IDN conversion, since punicode strings
// cannot be escaped after they are created.
RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
@@ -187,7 +189,26 @@ bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
bool success = DoSimpleHost(wide_output.data(),
wide_output.length(),
output, &has_non_ascii);
- DCHECK(!has_non_ascii);
+ if (has_non_ascii) {
+ // ICU generated something that DoSimpleHost didn't think looked like
+ // ASCII. This is quite rare, but ICU might convert some characters to
+ // percent signs which might generate new escape sequences which might in
+ // turn be invalid. An example is U+FE6A "small percent" which ICU will
+ // name prep into an ASCII percent and then we can interpret the following
+ // characters as escaped characters.
+ //
+ // If DoSimpleHost didn't think the output was ASCII, just escape the
+ // thing we gave ICU and give up. DoSimpleHost will have handled a further
+ // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
+ // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
+ // do more (like handle escaped non-ASCII sequences). Handling the escaped
+ // ASCII isn't strictly necessary, but DoSimpleHost handles this case
+ // anyway so we handle it/
+ output->set_length(original_output_len);
+ AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
+ output);
+ return false;
+ }
return success;
}
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index b382a7c..6c44e78 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc
@@ -321,6 +321,17 @@ TEST(URLCanonTest, Host) {
// ...%00 in fullwidth should fail (also as escaped UTF-8 input)
{"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
{"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ // ICU will convert weird percents into ASCII percents, but not unescape
+ // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
+ // "small percent". At this point we should be within our rights to mark
+ // anything as invalid since the URL is corrupt or malicious. The code
+ // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
+ // and kept as valid, so we validate that behavior here, but this level
+ // of fixing the input shouldn't be seen as required. "%81" is invalid.
+ {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+ {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+ {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
// Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
{"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
// See http://unicode.org/cldr/utility/idna.jsp for other
@@ -1316,6 +1327,13 @@ TEST(URLCanonTest, CanonicalizeStandardURL) {
{"wss://foo:81/", "wss://foo:81/", true},
{"wss://foo:443/", "wss://foo/", true},
{"wss://foo:815/", "wss://foo:815/", true},
+
+ // This particular code path ends up "backing up" to replace an invalid
+ // host ICU generated with an escaped version. Test that in the context
+ // of a full URL to make sure the backing up doesn't mess up the non-host
+ // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
+ // ICU will convert to an ASCII one, generating "%81".
+ {"ws:)W\x1eW\xef\xb9\xaa""81:80/", "ws://%29w%1ew%81/", false},
};
for (size_t i = 0; i < arraysize(cases); i++) {