Handle non-ASCII escaped characters from ICU.

ICU can apparently generate non-ASCII escaped characters if name-preping generates a new escape sequence. Previously we would get confused by these and would assert. This new code marks the error and generates an escaped invalid URL. BUG=456271 NOPRESUBMIT=True (for long lines) Review URL: https://codereview.chromium.org/1471693005 Cr-Commit-Position: refs/heads/master@{#361757}
author: brettw <brettw@chromium.org> 2015-11-25 16:29:35 -0800
committer: Commit bot <commit-bot@chromium.org> 2015-11-26 00:30:55 +0000
commit: 1141951b9418bf41586643be378d7f5e5c87bfca (patch)
tree: eebdf0854a3d46c2e5d1079fdd742965e661ffb6 /url
parent: 575d1deb85ba41095599809f83dc08ffce9fd443 (diff)
download: chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.zip
chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.gz
chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.bz2
2 files changed, 40 insertions, 1 deletions
diff --git a/url/url_canon_host.cc b/url/url_canon_host.cc
index fce4d3a..d4cdfd5 100644
--- a/url/url_canon_host.cc
+++ b/url/url_canon_host.cc
@@ -165,6 +165,8 @@ bool DoSimpleHost(const INCHAR* host,
 
 // Canonicalizes a host that requires IDN conversion. Returns true on success
 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
+  int original_output_len = output->length();  // So we can rewind below.
+
   // We need to escape URL before doing IDN conversion, since punicode strings
   // cannot be escaped after they are created.
   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
@@ -187,7 +189,26 @@ bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
   bool success = DoSimpleHost(wide_output.data(),
                               wide_output.length(),
                               output, &has_non_ascii);
-  DCHECK(!has_non_ascii);
+  if (has_non_ascii) {
+    // ICU generated something that DoSimpleHost didn't think looked like
+    // ASCII. This is quite rare, but ICU might convert some characters to
+    // percent signs which might generate new escape sequences which might in
+    // turn be invalid. An example is U+FE6A "small percent" which ICU will
+    // name prep into an ASCII percent and then we can interpret the following
+    // characters as escaped characters.
+    //
+    // If DoSimpleHost didn't think the output was ASCII, just escape the
+    // thing we gave ICU and give up. DoSimpleHost will have handled a further
+    // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
+    // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
+    // do more (like handle escaped non-ASCII sequences). Handling the escaped
+    // ASCII isn't strictly necessary, but DoSimpleHost handles this case
+    // anyway so we handle it/
+    output->set_length(original_output_len);
+    AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
+                              output);
+    return false;
+  }
   return success;
 }
 
diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc
index b382a7c..6c44e78 100644
--- a/url/url_canon_unittest.cc
+++ b/url/url_canon_unittest.cc
@@ -321,6 +321,17 @@ TEST(URLCanonTest, Host) {
       // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
     {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
     {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com", "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+      // ICU will convert weird percents into ASCII percents, but not unescape
+      // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
+      // "small percent". At this point we should be within our rights to mark
+      // anything as invalid since the URL is corrupt or malicious. The code
+      // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
+      // and kept as valid, so we validate that behavior here, but this level
+      // of fixing the input shouldn't be seen as required. "%81" is invalid.
+    {"\xef\xb9\xaa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+    {"%ef%b9%aa" "41.com", L"\xfe6a" L"41.com", "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
+    {"\xef\xb9\xaa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
+    {"%ef%b9%aa" "81.com", L"\xfe6a" L"81.com", "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
       // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
     {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14), CanonHostInfo::NEUTRAL, -1, ""},
       // See http://unicode.org/cldr/utility/idna.jsp for other
@@ -1316,6 +1327,13 @@ TEST(URLCanonTest, CanonicalizeStandardURL) {
     {"wss://foo:81/", "wss://foo:81/", true},
     {"wss://foo:443/", "wss://foo/", true},
     {"wss://foo:815/", "wss://foo:815/", true},
+
+      // This particular code path ends up "backing up" to replace an invalid
+      // host ICU generated with an escaped version. Test that in the context
+      // of a full URL to make sure the backing up doesn't mess up the non-host
+      // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
+      // ICU will convert to an ASCII one, generating "%81".
+    {"ws:)W\x1eW\xef\xb9\xaa""81:80/", "ws://%29w%1ew%81/", false},
   };
 
   for (size_t i = 0; i < arraysize(cases); i++) {
author	brettw <brettw@chromium.org>	2015-11-25 16:29:35 -0800
committer	Commit bot <commit-bot@chromium.org>	2015-11-26 00:30:55 +0000
commit	1141951b9418bf41586643be378d7f5e5c87bfca (patch)
tree	eebdf0854a3d46c2e5d1079fdd742965e661ffb6 /url
parent	575d1deb85ba41095599809f83dc08ffce9fd443 (diff)
download	chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.zip chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.gz chromium_src-1141951b9418bf41586643be378d7f5e5c87bfca.tar.bz2