Improve parsing of the Content-Disposition header

Previously, we were using GetHeaderParamValue to parse the Content-Disposition header, which describes itself as a "quick and dirty implementation." After this patch, we use more of our normal HTTP parsing machinery, making our parsing much less quirky and better aligned with RFC 6266. Some notes: 1) Many of the test cases for parsing the Content-Disposition header included the string "Content-Disposition: " in the input. I've looked through all of the callers of these functions, and that seems to be completely bogus. The old parser wasn't careful enough to see that as a problem, but the new one follows the spec more closely. I've updated the test cases to remove this string. 2) After this patch, there's a bunch of code in net_util.cc that really should be moved to http_content_disposition.cc. I didn't move that code in this patch because I didn't want this patch to be too large. I'll move it in a future patch. 3) In a future patch, I'll audit the codebase for callers of GetHeaderParamValue. With any luck, we'll be able to remove them all and delete this less-than-amazing function. BUG=65423 Review URL: http://codereview.chromium.org/9234055 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@119378 0039d316-1c4b-4281-b951-d872f2087c98
author: abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2012-01-27 06:03:16 +0000
committer: abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2012-01-27 06:03:16 +0000
commit: 56eab2f216ce86217149753a407f981cb6d5de6c (patch)
tree: bf4b45a8fbb845b5ec177332df0ebafcfaedb98f /net/http/http_content_disposition_unittest.cc
parent: 9b961c91eeb8ba43be7918563a07432cbcb1c4df (diff)
download: chromium_src-56eab2f216ce86217149753a407f981cb6d5de6c.zip
chromium_src-56eab2f216ce86217149753a407f981cb6d5de6c.tar.gz
chromium_src-56eab2f216ce86217149753a407f981cb6d5de6c.tar.bz2
1 files changed, 201 insertions, 0 deletions
diff --git a/net/http/http_content_disposition_unittest.cc b/net/http/http_content_disposition_unittest.cc
new file mode 100644
index 0000000..3d8750e
--- /dev/null
+++ b/net/http/http_content_disposition_unittest.cc
@@ -0,0 +1,201 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "net/http/http_content_disposition.h"
+
+#include "base/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace net {
+
+namespace {
+
+struct FileNameCDCase {  // One row of the Filename test table below.
+  const char* header;  // Header value passed to HttpContentDisposition.
+  const char* referrer_charset;  // Charset passed as the second ctor argument.
+  const wchar_t* expected;  // Expected filename(), compared after UTF8ToWide().
+};
+
+}  // anonymous namespace
+
+TEST(HttpContentDispositionTest, Filename) {  // Table-driven filename checks.
+  const FileNameCDCase tests[] = {  // {header, referrer_charset, expected}
+    // Test various forms of C-D header fields emitted by web servers.
+    {"inline; filename=\"abcde.pdf\"", "", L"abcde.pdf"},
+    {"inline; name=\"abcde.pdf\"", "", L"abcde.pdf"},
+    {"attachment; filename=abcde.pdf", "", L"abcde.pdf"},
+    {"attachment; name=abcde.pdf", "", L"abcde.pdf"},
+    {"attachment; filename=abc,de.pdf", "", L"abc,de.pdf"},
+    {"filename=abcde.pdf", "", L"abcde.pdf"},
+    {"filename= abcde.pdf", "", L"abcde.pdf"},
+    {"filename =abcde.pdf", "", L"abcde.pdf"},
+    {"filename = abcde.pdf", "", L"abcde.pdf"},
+    {"filename\t=abcde.pdf", "", L"abcde.pdf"},
+    {"filename \t\t  =abcde.pdf", "", L"abcde.pdf"},
+    {"name=abcde.pdf", "", L"abcde.pdf"},
+    {"inline; filename=\"abc%20de.pdf\"", "",
+     L"abc de.pdf"},
+    // Unbalanced quotation mark
+    {"filename=\"abcdef.pdf", "", L"abcdef.pdf"},
+    // Whitespaces are converted to a space.
+    {"inline; filename=\"abc  \t\nde.pdf\"", "",
+     L"abc    de.pdf"},
+    // %-escaped UTF-8
+    {"attachment; filename=\"%EC%98%88%EC%88%A0%20"
+     "%EC%98%88%EC%88%A0.jpg\"", "", L"\xc608\xc220 \xc608\xc220.jpg"},
+    {"attachment; filename=\"%F0%90%8C%B0%F0%90%8C%B1"
+     "abc.jpg\"", "", L"\U00010330\U00010331abc.jpg"},
+    {"attachment; filename=\"%EC%98%88%EC%88%A0 \n"
+     "%EC%98%88%EC%88%A0.jpg\"", "", L"\xc608\xc220  \xc608\xc220.jpg"},
+    // RFC 2047 with various charsets and Q/B encodings
+    {"attachment; filename=\"=?EUC-JP?Q?=B7=DD=BD="
+     "D13=2Epng?=\"", "", L"\x82b8\x8853" L"3.png"},
+    {"attachment; filename==?eUc-Kr?b?v7m8+iAzLnBuZw==?=",
+     "", L"\xc608\xc220 3.png"},
+    {"attachment; filename==?utf-8?Q?=E8=8A=B8=E8"
+     "=A1=93_3=2Epng?=", "", L"\x82b8\x8853 3.png"},
+    {"attachment; filename==?utf-8?Q?=F0=90=8C=B0"
+     "_3=2Epng?=", "", L"\U00010330 3.png"},
+    {"inline; filename=\"=?iso88591?Q?caf=e9_=2epng?=\"",
+     "", L"caf\x00e9 .png"},
+    // Space after an encoded word should be removed.
+    {"inline; filename=\"=?iso88591?Q?caf=E9_?= .png\"",
+     "", L"caf\x00e9 .png"},
+    // Two encoded words with different charsets (not very likely to be emitted
+    // by web servers in the wild). Spaces between them are removed.
+    {"inline; filename=\"=?euc-kr?b?v7m8+iAz?="
+     " =?ksc5601?q?=BF=B9=BC=FA=2Epng?=\"", "",
+     L"\xc608\xc220 3\xc608\xc220.png"},
+    {"attachment; filename=\"=?windows-1252?Q?caf=E9?="
+     "  =?iso-8859-7?b?4eI=?= .png\"", "", L"caf\x00e9\x03b1\x03b2.png"},
+    // Non-ASCII string is passed through and treated as UTF-8 as long as
+    // it's valid as UTF-8 and regardless of |referrer_charset|.
+    {"attachment; filename=caf\xc3\xa9.png",
+     "iso-8859-1", L"caf\x00e9.png"},
+    {"attachment; filename=caf\xc3\xa9.png",
+     "", L"caf\x00e9.png"},
+    // Non-ASCII/Non-UTF-8 string. Fall back to the referrer charset.
+    {"attachment; filename=caf\xe5.png",
+     "windows-1253", L"caf\x03b5.png"},
+#if 0
+    // Non-ASCII/Non-UTF-8 string. Fall back to the native codepage.
+    // TODO(jungshik): We need to set the OS default codepage
+    // to a specific value before testing. On Windows, we can use
+    // SetThreadLocale().
+    {"attachment; filename=\xb0\xa1\xb0\xa2.png",
+     "", L"\xac00\xac01.png"},
+#endif
+    // Failure cases
+    // Invalid hex-digit "G". NOTE(review): is "iiso88591" a typo? Confirm.
+    {"attachment; filename==?iiso88591?Q?caf=EG?=", "",
+     L""},
+    // Incomplete RFC 2047 encoded-word (missing '=' at the end)
+    {"attachment; filename==?iso88591?Q?caf=E3?", "", L""},
+    // Extra character at the end of an encoded word
+    {"attachment; filename==?iso88591?Q?caf=E3?==",
+     "", L""},
+    // Extra token at the end of an encoded word
+    {"attachment; filename==?iso88591?Q?caf=E3?=?",
+     "", L""},
+    {"attachment; filename==?iso88591?Q?caf=E3?=?=",
+     "",  L""},
+    // Incomplete hex-escaped chars
+    {"attachment; filename==?windows-1252?Q?=63=61=E?=",
+     "", L""},
+    {"attachment; filename=%EC%98%88%EC%88%A", "", L""},
+    // %-escaped non-UTF-8 encoding is an "error"
+    {"attachment; filename=%B7%DD%BD%D1.png", "", L""},
+    // Two RFC 2047 encoded words in a row without a space is an error.
+    {"attachment; filename==?windows-1252?Q?caf=E3?="
+     "=?iso-8859-7?b?4eIucG5nCg==?=", "", L""},
+
+    // RFC 5987 tests with Filename*  : see http://tools.ietf.org/html/rfc5987
+    {"attachment; filename*=foo.html", "", L""},
+    {"attachment; filename*=foo'.html", "", L""},
+    {"attachment; filename*=''foo'.html", "", L""},
+    {"attachment; filename*=''foo.html'", "", L""},
+    {"attachment; filename*=''f\"oo\".html'", "", L""},
+    {"attachment; filename*=bogus_charset''foo.html'",
+     "", L""},
+    {"attachment; filename*='en'foo.html'", "", L""},
+    {"attachment; filename*=iso-8859-1'en'foo.html", "",
+      L"foo.html"},
+    {"attachment; filename*=utf-8'en'foo.html", "",
+      L"foo.html"},
+    // charset cannot be omitted.
+    {"attachment; filename*='es'f\xfa.html'", "", L""},
+    // Non-ASCII bytes are not allowed.
+    {"attachment; filename*=iso-8859-1'es'f\xfa.html", "",
+      L""},
+    {"attachment; filename*=utf-8'es'f\xce\xba.html", "",
+      L""},
+    // TODO(jshin): Space should be %-encoded, but currently, we allow
+    // spaces.
+    {"inline; filename*=iso88591''cafe foo.png", "",
+      L"cafe foo.png"},
+
+    // Filename* tests converted from Q-encoded tests above.
+    {"attachment; filename*=EUC-JP''%B7%DD%BD%D13%2Epng",
+     "", L"\x82b8\x8853" L"3.png"},
+    {"attachment; filename*=utf-8''"
+      "%E8%8A%B8%E8%A1%93%203%2Epng", "", L"\x82b8\x8853 3.png"},
+    {"attachment; filename*=utf-8''%F0%90%8C%B0 3.png", "",
+      L"\U00010330 3.png"},
+    {"inline; filename*=Euc-Kr'ko'%BF%B9%BC%FA%2Epng", "",
+     L"\xc608\xc220.png"},
+    {"attachment; filename*=windows-1252''caf%E9.png", "",
+      L"caf\x00e9.png"},
+
+    // http://greenbytes.de/tech/tc2231/ filename* test cases.
+    // attwithisofn2231iso
+    {"attachment; filename*=iso-8859-1''foo-%E4.html", "",
+      L"foo-\xe4.html"},
+    // attwithfn2231utf8
+    {"attachment; filename*="
+      "UTF-8''foo-%c3%a4-%e2%82%ac.html", "", L"foo-\xe4-\x20ac.html"},
+    // attwithfn2231noc : no encoding specified but UTF-8 is used.
+    {"attachment; filename*=''foo-%c3%a4-%e2%82%ac.html",
+      "", L""},
+    // attwithfn2231utf8comp
+    {"attachment; filename*=UTF-8''foo-a%cc%88.html", "",
+      L"foo-\xe4.html"},
+#ifdef ICU_SHOULD_FAIL_CONVERSION_ON_INVALID_CHARACTER
+    // This does not work because we treat ISO-8859-1 synonymous with
+    // Windows-1252 per HTML5. For HTTP, in theory, we're not
+    // supposed to.
+    // attwithfn2231utf8-bad
+    {"attachment; filename*="
+      "iso-8859-1''foo-%c3%a4-%e2%82%ac.html", "", L""},
+#endif
+    // attwithfn2231ws1
+    {"attachment; filename *=UTF-8''foo-%c3%a4.html", "",
+      L""},
+    // attwithfn2231ws2
+    {"attachment; filename*= UTF-8''foo-%c3%a4.html", "",
+      L"foo-\xe4.html"},
+    // attwithfn2231ws3
+    {"attachment; filename* =UTF-8''foo-%c3%a4.html", "",
+      L"foo-\xe4.html"},
+    // attwithfn2231quot
+    {"attachment; filename*=\"UTF-8''foo-%c3%a4.html\"",
+      "", L""},
+    // attfnboth
+    {"attachment; filename=\"foo-ae.html\"; "
+      "filename*=UTF-8''foo-%c3%a4.html", "", L"foo-\xe4.html"},
+    // attfnboth2
+    {"attachment; filename*=UTF-8''foo-%c3%a4.html; "
+      "filename=\"foo-ae.html\"", "", L"foo-\xe4.html"},
+    // attnewandfn
+    {"attachment; foobar=x; filename=\"foo.html\"", "",
+      L"foo.html"},
+  };
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(tests); ++i) {
+    HttpContentDisposition header(tests[i].header, tests[i].referrer_charset);
+    EXPECT_EQ(tests[i].expected,
+        UTF8ToWide(header.filename()))  // widen to match |expected|
+        << "Failed on input: " << tests[i].header;
+  }
+}
+
+}  // namespace net
author	abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2012-01-27 06:03:16 +0000
committer	abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2012-01-27 06:03:16 +0000
commit	56eab2f216ce86217149753a407f981cb6d5de6c (patch)
tree	bf4b45a8fbb845b5ec177332df0ebafcfaedb98f /net/http/http_content_disposition_unittest.cc
parent	9b961c91eeb8ba43be7918563a07432cbcb1c4df (diff)
download	chromium_src-56eab2f216ce86217149753a407f981cb6d5de6c.zip chromium_src-56eab2f216ce86217149753a407f981cb6d5de6c.tar.gz chromium_src-56eab2f216ce86217149753a407f981cb6d5de6c.tar.bz2