diff options
author | brettw <brettw@chromium.org> | 2015-11-25 16:29:35 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-11-26 00:30:55 +0000 |
commit | 1141951b9418bf41586643be378d7f5e5c87bfca (patch) | |
tree | eebdf0854a3d46c2e5d1079fdd742965e661ffb6 /url | |
parent | 575d1deb85ba41095599809f83dc08ffce9fd443 (diff) | |
download | chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.zip chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.gz chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.bz2 |
Handle non-ASCII escaped characters from ICU.
ICU can apparently generate non-ASCII escaped characters if name-preping generates a new escape sequence. Previously we would get confused by these and would assert.
This new code marks the error and generates an escaped invalid URL.
BUG=456271
NOPRESUBMIT=True (for long lines)
Review URL: https://codereview.chromium.org/1471693005
Cr-Commit-Position: refs/heads/master@{#361757}
Diffstat (limited to 'url')
-rw-r--r-- | url/url_canon_host.cc | 23 | ||||
-rw-r--r-- | url/url_canon_unittest.cc | 18 |
2 files changed, 40 insertions, 1 deletions
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc index fce4d3a..d4cdfd5 100644 --- a/url/url_canon_host.cc +++ b/url/url_canon_host.cc @@ -165,6 +165,8 @@ bool DoSimpleHost(const INCHAR* host, // Canonicalizes a host that requires IDN conversion. Returns true on success bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) { + int original_output_len = output->length(); // So we can rewind below. + // We need to escape URL before doing IDN conversion, since punicode strings // cannot be escaped after they are created. RawCanonOutputW<kTempHostBufferLen> url_escaped_host; @@ -187,7 +189,26 @@ bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) { bool success = DoSimpleHost(wide_output.data(), wide_output.length(), output, &has_non_ascii); - DCHECK(!has_non_ascii); + if (has_non_ascii) { + // ICU generated something that DoSimpleHost didn't think looked like + // ASCII. This is quite rare, but ICU might convert some characters to + // percent signs which might generate new escape sequences which might in + // turn be invalid. An example is U+FE6A "small percent" which ICU will + // name prep into an ASCII percent and then we can interpret the following + // characters as escaped characters. + // + // If DoSimpleHost didn't think the output was ASCII, just escape the + // thing we gave ICU and give up. DoSimpleHost will have handled a further + // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates + // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't + // do more (like handle escaped non-ASCII sequences). Handling the escaped + // ASCII isn't strictly necessary, but DoSimpleHost handles this case + // anyway so we handle it/ + output->set_length(original_output_len); + AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(), + output); + return false; + } return success; } diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc index b382a7c..6c44e78 100644 --- a/url/url_canon_unittest.cc +++ b/url/url_canon_unittest.cc @@ -321,6 +321,17 @@ TEST(URLCanonTest, Host) { // ...%00 in fullwidth should fail (also as escaped UTF-8 input) {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + // ICU will convert weird percents into ASCII percents, but not unescape + // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a + // "small percent". At this point we should be within our rights to mark + // anything as invalid since the URL is corrupt or malicious. The code + // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped + // and kept as valid, so we validate that behavior here, but this level + // of fixing the input shouldn't be seen as required. "%81" is invalid. + {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""}, + {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, + {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""}, // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""}, // See http://unicode.org/cldr/utility/idna.jsp for other @@ -1316,6 +1327,13 @@ TEST(URLCanonTest, CanonicalizeStandardURL) { {"wss://foo:81/", "wss://foo:81/", true}, {"wss://foo:443/", "wss://foo/", true}, {"wss://foo:815/", "wss://foo:815/", true}, + + // This particular code path ends up "backing up" to replace an invalid + // host ICU generated with an escaped version. Test that in the context + // of a full URL to make sure the backing up doesn't mess up the non-host + // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that + // ICU will convert to an ASCII one, generating "%81". + {"ws:)W\x1eW\xef\xb9\xaa""81:80/", "ws://%29w%1ew%81/", false}, }; for (size_t i = 0; i < arraysize(cases); i++) { |