diff options
author | darin@chromium.org <darin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-10-08 07:13:44 +0000 |
---|---|---|
committer | darin@chromium.org <darin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-10-08 07:13:44 +0000 |
commit | 344128dc6d961ef925071050cd360c09c7310488 (patch) | |
tree | f7844fd41ee30c8f123b8a6d93c6b1523cc2139c /net/base | |
parent | ccf7d64a4fa04ac14d4cc3060eac4f5e80413422 (diff) | |
download | chromium_src-344128dc6d961ef925071050cd360c09c7310488.zip chromium_src-344128dc6d961ef925071050cd360c09c7310488.tar.gz chromium_src-344128dc6d961ef925071050cd360c09c7310488.tar.bz2 |
Optimize the mime sniffer based on UMA data and move closer to the HTML 5 spec.
Removed over 50% of the heuristics while returning the same result 99.996% of
the time. Each heuristic we remove reduces the attack surface that honest sites
have to worry about when they serve third-party content.
Patch by Adam Barth
R=darin
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@2997 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/base')
-rw-r--r-- | net/base/mime_sniffer.cc | 102 | ||||
-rw-r--r-- | net/base/mime_sniffer_unittest.cc | 3 |
2 files changed, 30 insertions, 75 deletions
diff --git a/net/base/mime_sniffer.cc b/net/base/mime_sniffer.cc index 0d9539d..e17a603 100644 --- a/net/base/mime_sniffer.cc +++ b/net/base/mime_sniffer.cc @@ -144,25 +144,15 @@ static const MagicNumber kMagicNumbers[] = { MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") MAGIC_NUMBER("image/bmp", "BM") // Source: Mozilla - MAGIC_NUMBER("application/postscript", "%! PS-Adobe-") - // Mozilla uses "\x4a47????00" for image/x-jg, but we use stronger pattern - MAGIC_NUMBER("image/x-icon", "\x00\x00\x10\x00") - MAGIC_NUMBER("image/x-icon", "\x00\x00\x20\x00") - MAGIC_NUMBER("image/x-xbitmap", "#define ") - MAGIC_NUMBER("text/plain", "#!") // Script - MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS - MAGIC_NUMBER("text/plain", "From") + MAGIC_NUMBER("text/plain", "#!") // Script + MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS + MAGIC_NUMBER("text/plain", "From") MAGIC_NUMBER("text/plain", ">From") // Chrome specific - MAGIC_NUMBER("image/x-rgb", "\x01\xDA\x01\x01\x00\x03") MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") - MAGIC_NUMBER("application/x-compress", "\x1F\x9D\x90") // tar.Z MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") MAGIC_NUMBER("video/x-ms-asf", "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") - MAGIC_NUMBER("application/winhlp", "?_\x03") - MAGIC_NUMBER("application/winhlp", "LN\x02\x00") - MAGIC_NUMBER("application/x-bzip2", "BZ") MAGIC_NUMBER("image/tiff", "I I") MAGIC_NUMBER("image/tiff", "II*") MAGIC_NUMBER("image/tiff", "MM\x00*") @@ -171,18 +161,9 @@ static const MagicNumber kMagicNumbers[] = { // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") - MAGIC_NUMBER("image/x-jg", "\x4A\x47\x03\x0E\x00\x00\x00") - MAGIC_NUMBER("image/x-jg", "\x4A\x47\x04\x0E\x00\x00\x00") - MAGIC_NUMBER("image/x-portable-graymap", "P4\x0A") MAGIC_NUMBER("application/zip", "PK\x03\x04") MAGIC_NUMBER("application/x-rar-compressed", 
"Rar!\x1A\x07\x00") - MAGIC_NUMBER("application/rtf", "{\\rtf1") - MAGIC_NUMBER("application/postscript", "\xC5\xD0\xD3\xC6") MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") - MAGIC_NUMBER("application/octet-stream", "\x7F" "ELF") // ELF - MAGIC_NUMBER("application/octet-stream", "\xE8") // COM, SYS - MAGIC_NUMBER("application/octet-stream", "\xE9") // COM, SYS - MAGIC_NUMBER("application/octet-stream", "\xEB") // COM, SYS MAGIC_NUMBER("application/octet-stream", "MZ") // EXE // Sniffing for Flash: // @@ -218,53 +199,23 @@ static const MagicNumber kSniffableTags[] = { MAGIC_NUMBER("text/xml", "<?xml") // Mozilla // DOCTYPEs MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec - // Sniffable tags, ordered by how often they occur in web documents with a - // sniffable mime type (as measured in 2007). + // Sniffable tags, ordered by how often they occur in sniffable documents. + MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla + MAGIC_HTML_TAG("!--") MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla - MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla - MAGIC_HTML_TAG("tr") - MAGIC_HTML_TAG("link") // Mozilla - MAGIC_HTML_TAG("meta") // Mozilla - MAGIC_HTML_TAG("title") // Mozilla - MAGIC_HTML_TAG("pre") // Mozilla - MAGIC_HTML_TAG("table") // Mozilla - MAGIC_HTML_TAG("basefont") - // Not HTML: "xml" - MAGIC_HTML_TAG("p") // Mozilla + MAGIC_HTML_TAG("iframe") // Mozilla + MAGIC_HTML_TAG("h1") // Mozilla MAGIC_HTML_TAG("div") // Mozilla - MAGIC_HTML_TAG("base") // Mozilla - // Not HTML: "metadata" - MAGIC_HTML_TAG("body") // Mozilla - // Not HTML: "asx" - MAGIC_HTML_TAG("frameset") // Mozilla - // Not HTML: "sami" + MAGIC_HTML_TAG("font") // Mozilla + MAGIC_HTML_TAG("table") // Mozilla MAGIC_HTML_TAG("a") // Mozilla MAGIC_HTML_TAG("style") // Mozilla - // Not HTML: "rss" - MAGIC_HTML_TAG("br") - MAGIC_HTML_TAG("center") // Mozilla + MAGIC_HTML_TAG("title") // Mozilla MAGIC_HTML_TAG("b") // Mozilla - MAGIC_HTML_TAG("iframe") 
// Mozilla - MAGIC_HTML_TAG("img") // Mozilla - MAGIC_HTML_TAG("h1") // Mozilla - MAGIC_HTML_TAG("td") - // Not HTML: "printer" - MAGIC_HTML_TAG("font") // Mozilla - // Not HTML: "htlm" - MAGIC_HTML_TAG("form") // Mozilla - // Not HTML: "master" - MAGIC_HTML_TAG("h3") // Mozilla - MAGIC_HTML_TAG("h2") // Mozilla - // Plus a long tail, but we need to stop somewhere. - // - // We also include all the other tags that Mozilla sniffs: - MAGIC_HTML_TAG("!--") - MAGIC_HTML_TAG("applet") - MAGIC_HTML_TAG("isindex") - MAGIC_HTML_TAG("h4") - MAGIC_HTML_TAG("h5") - MAGIC_HTML_TAG("h6") + MAGIC_HTML_TAG("body") // Mozilla + MAGIC_HTML_TAG("br") + MAGIC_HTML_TAG("p") // Mozilla }; static bool MatchMagicNumber(const char* content, size_t size, @@ -322,7 +273,7 @@ static bool SniffForHTML(const char* content, size_t size, if (!IsAsciiWhitespace(*pos)) break; } - static SnifferHistogram counter(L"mime_sniffer.kSniffableTags", + static SnifferHistogram counter(L"mime_sniffer.kSniffableTags2", arraysize(kSniffableTags)); // |pos| now points to first non-whitespace character (or at end). return CheckForMagicNumbers(pos, end - pos, @@ -333,7 +284,7 @@ static bool SniffForHTML(const char* content, size_t size, static bool SniffForMagicNumbers(const char* content, size_t size, std::string* result) { // Check our big table of Magic Numbers - static SnifferHistogram counter(L"mime_sniffer.kMagicNumbers", + static SnifferHistogram counter(L"mime_sniffer.kMagicNumbers2", arraysize(kMagicNumbers)); return CheckForMagicNumbers(content, size, kMagicNumbers, arraysize(kMagicNumbers), @@ -369,7 +320,7 @@ static bool SniffXML(const char* content, size_t size, std::string* result) { // We want to skip XML processing instructions (of the form "<?xml ...") // and stop at the first "plain" tag, then make a decision on the mime-type // based on the name (or possibly attributes) of that tag. 
- static SnifferHistogram counter(L"mime_sniffer.kMagicXML", + static SnifferHistogram counter(L"mime_sniffer.kMagicXML2", arraysize(kMagicXML)); const int kMaxTagIterations = 5; for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { @@ -411,13 +362,12 @@ static const MagicNumber kByteOrderMark[] = { MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 - MAGIC_NUMBER("text/plain", "\x00\x00\xFE\xFF") // UCS-4BE }; // Whether a given byte looks like it might be part of binary content. // Source: HTML5 spec static char kByteLooksBinary[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, // 0x00 - 0x0F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F @@ -437,7 +387,7 @@ static char kByteLooksBinary[] = { static bool LooksBinary(const char* content, size_t size) { // First, we look for a BOM. - static SnifferHistogram counter(L"mime_sniffer.kByteOrderMark", + static SnifferHistogram counter(L"mime_sniffer.kByteOrderMark2", arraysize(kByteOrderMark)); std::string unused; if (CheckForMagicNumbers(content, size, @@ -460,6 +410,7 @@ static bool LooksBinary(const char* content, size_t size) { static bool IsUnknownMimeType(const std::string& mime_type) { // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. + // If we do, please be careful not to alter the semantics at all. static const char* kUnknownMimeTypes[] = { // Empty mime types are as unknown as they get. 
"", @@ -470,7 +421,7 @@ static bool IsUnknownMimeType(const std::string& mime_type) { // Firefox rejects a mime type if it is exactly */* "*/*", }; - static SnifferHistogram counter(L"mime_sniffer.kUnknownMimeTypes", + static SnifferHistogram counter(L"mime_sniffer.kUnknownMimeTypes2", arraysize(kUnknownMimeTypes) + 1); for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { if (mime_type == kUnknownMimeTypes[i]) { @@ -487,13 +438,17 @@ static bool IsUnknownMimeType(const std::string& mime_type) { } bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { + static SnifferHistogram should_sniff_counter( + L"mime_sniffer.ShouldSniffMimeType2", 2); // We are willing to sniff the mime type for HTTP, HTTPS, and FTP bool sniffable_scheme = url.is_empty() || url.SchemeIs("http") || url.SchemeIs("https") || url.SchemeIs("ftp"); - if (!sniffable_scheme) + if (!sniffable_scheme) { + should_sniff_counter.Add(0); return false; + } static const char* kSniffableTypes[] = { // Many web servers are misconfigured to send text/plain for many @@ -508,11 +463,12 @@ bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { "text/xml", "application/xml", }; - static SnifferHistogram counter(L"mime_sniffer.kSniffableTypes", + static SnifferHistogram counter(L"mime_sniffer.kSniffableTypes2", arraysize(kSniffableTypes) + 1); for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { if (mime_type == kSniffableTypes[i]) { counter.Add(i); + should_sniff_counter.Add(1); return true; } } @@ -520,8 +476,10 @@ bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { // The web server didn't specify a content type or specified a mime // type that we ignore. 
counter.Add(arraysize(kSniffableTypes)); + should_sniff_counter.Add(1); return true; } + should_sniff_counter.Add(0); return false; } diff --git a/net/base/mime_sniffer_unittest.cc b/net/base/mime_sniffer_unittest.cc index a3416e0..0ed87e8 100644 --- a/net/base/mime_sniffer_unittest.cc +++ b/net/base/mime_sniffer_unittest.cc @@ -101,9 +101,6 @@ TEST(MimeSnifferTest, MozillaCompatibleTest) { { "BMjlakdsfk", sizeof("BMjlakdsfk")-1, "http://www.example.com/foo", "", "image/bmp" }, - { "\x00\x00\x20\x00", sizeof("\x00\x00\x30\x00")-1, - "http://www.example.com/favicon", - "", "image/x-icon" }, { "\x00\x00\x30\x00", sizeof("\x00\x00\x30\x00")-1, "http://www.example.com/favicon.ico", "", "application/octet-stream" }, |