diff options
author | abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-16 06:34:56 +0000 |
---|---|---|
committer | abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-16 06:34:56 +0000 |
commit | 2e7aff66fe443c29b2fc14a776dca5512b0b4729 (patch) | |
tree | 0195994507e51a675c488718dd50e7bd997d889f /net | |
parent | 791199c674cd1aa2997e39079e33681596866fc8 (diff) | |
download | chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.zip chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.gz chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.bz2 |
Increase the mime sniffer's limit for searching for binary looking bytes from
512 to 1024 bytes.
Contributed by tonyg@chromium.org
BUG=15314
TEST=net_unittests --gtest_filter=MimeSnifferTest.LooksBinary
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@41687 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net')
-rw-r--r-- | net/base/mime_sniffer.cc | 169 | ||||
-rw-r--r-- | net/base/mime_sniffer.h | 8 | ||||
-rw-r--r-- | net/base/mime_sniffer_unittest.cc | 30 |
3 files changed, 142 insertions, 65 deletions
diff --git a/net/base/mime_sniffer.cc b/net/base/mime_sniffer.cc index 1950c4b..1961107 100644 --- a/net/base/mime_sniffer.cc +++ b/net/base/mime_sniffer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -105,9 +105,6 @@ namespace net { -// We aren't interested in looking at more than 512 bytes of content -static const size_t kMaxBytesToSniff = 512; - // The number of content bytes we need to use all our magic numbers. Feel free // to increase this number if you add a longer magic number. static const size_t kBytesRequiredForMagic = 42; @@ -224,7 +221,7 @@ static bool MatchMagicNumber(const char* content, size_t size, const size_t len = magic_entry->magic_len; // Keep kBytesRequiredForMagic honest. - DCHECK(len <= kBytesRequiredForMagic); + DCHECK_LE(len, kBytesRequiredForMagic); // To compare with magic strings, we need to compute strlen(content), but // content might not actually have a null terminator. In that case, we @@ -263,8 +260,29 @@ static bool CheckForMagicNumbers(const char* content, size_t size, return false; } -static bool SniffForHTML(const char* content, size_t size, +// Truncates |size| to |max_size| and returns true if |size| is at least +// |max_size|. +static bool TruncateSize(const size_t max_size, size_t* size) { + // Keep kMaxBytesToSniff honest. + DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff); + + if (*size >= max_size) { + *size = max_size; + return true; + } + return false; +} + +// Returns true and sets result if the content appears to be HTML. +// Clears have_enough_content if more data could possibly change the result. +static bool SniffForHTML(const char* content, + size_t size, + bool* have_enough_content, std::string* result) { + // For HTML, we are willing to consider up to 512 bytes. 
This may be overly + // conservative as IE only considers 256. + *have_enough_content &= TruncateSize(512, &size); + // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, // but with some modifications to better match the HTML5 spec. const char* const end = content + size; @@ -282,8 +300,14 @@ static bool SniffForHTML(const char* content, size_t size, counter.get(), result); } -static bool SniffForMagicNumbers(const char* content, size_t size, +// Returns true and sets result if the content matches any of kMagicNumbers. +// Clears have_enough_content if more data could possibly change the result. +static bool SniffForMagicNumbers(const char* content, + size_t size, + bool* have_enough_content, std::string* result) { + *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); + // Check our big table of Magic Numbers static scoped_refptr<Histogram> counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", @@ -305,18 +329,22 @@ static const MagicNumber kMagicXML[] = { MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 }; -// Sniff an XML document to judge whether it contains XHTML or a feed. -// Returns true if it has seen enough content to make a definitive decision. +// Returns true and sets result if the content appears to contain XHTML or a +// feed. +// Clears have_enough_content if more data could possibly change the result. +// // TODO(evanm): this is similar but more conservative than what Safari does, // while HTML5 has a different recommendation -- what should we do? // TODO(evanm): this is incorrect for documents whose encoding isn't a superset // of ASCII -- do we care? -static bool SniffXML(const char* content, size_t size, std::string* result) { - // We allow at most kFirstTagBytes bytes of content before we expect the - // opening tag. 
- const size_t kFeedAllowedHeaderBytes = 300; - const char* const end = content + std::min(size, kFeedAllowedHeaderBytes); +static bool SniffXML(const char* content, + size_t size, + bool* have_enough_content, + std::string* result) { + // We allow at most 300 bytes of content before we expect the opening tag. + *have_enough_content &= TruncateSize(300, &size); const char* pos = content; + const char* const end = content + size; // This loop iterates through tag-looking offsets in the file. // We want to skip XML processing instructions (of the form "<?xml ...") @@ -389,7 +417,22 @@ static char kByteLooksBinary[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF }; -static bool LooksBinary(const char* content, size_t size) { +// Returns true and sets result to "application/octet-stream" if the content +// appears to be binary data. Otherwise, returns false and sets "text/plain". +// Clears have_enough_content if more data could possibly change the result. +static bool SniffBinary(const char* content, + size_t size, + bool* have_enough_content, + std::string* result) { + // There is no consensus about exactly how to sniff for binary content. + // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. + // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. + // Here, we side with FF, but with a smaller buffer. This size was chosen + // because it is small enough to comfortably fit into a single packet (after + // allowing for headers) and yet large enough to account for binary formats + // that have a significant amount of ASCII at the beginning (crbug.com/15314). + const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); + // First, we look for a BOM. 
static scoped_refptr<Histogram> counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", @@ -399,17 +442,24 @@ static bool LooksBinary(const char* content, size_t size) { kByteOrderMark, arraysize(kByteOrderMark), counter.get(), &unused)) { // If there is BOM, we think the buffer is not binary. + result->assign("text/plain"); return false; } // Next we look to see if any of the bytes "look binary." for (size_t i = 0; i < size; ++i) { // If we see a binary-looking byte, we think the content is binary. - if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) + if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { + result->assign("application/octet-stream"); return true; + } } - // No evidence either way, default to non-binary. + // No evidence either way. Default to non-binary and, if the content was not + // truncated, clear have_enough_content because there could be a binary + // looking byte in data we have not seen yet. + *have_enough_content &= is_truncated; + result->assign("text/plain"); return false; } @@ -443,9 +493,15 @@ static bool IsUnknownMimeType(const std::string& mime_type) { return false; } -// Sniff a crx (chrome extension) file. -static bool SniffCRX(const char* content, size_t content_size, const GURL& url, - const std::string& type_hint, std::string* result) { +// Returns true and sets result if the content appears to be a crx (chrome +// extension) file. +// Clears have_enough_content if more data could possibly change the result. +static bool SniffCRX(const char* content, + size_t size, + const GURL& url, + const std::string& type_hint, + bool* have_enough_content, + std::string* result) { static scoped_refptr<Histogram> counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); @@ -456,13 +512,14 @@ static bool SniffCRX(const char* content, size_t content_size, const GURL& url, // // TODO(aa): If we ever have another magic number, we'll want to pass a // histogram into CheckForMagicNumbers(), below, to see which one matched. 
- const struct MagicNumber kCRXMagicNumbers[] = { + static const struct MagicNumber kCRXMagicNumbers[] = { MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") }; // Only consider files that have the extension ".crx". - const char kCRXExtension[] = ".crx"; - const int kExtensionLength = arraysize(kCRXExtension) - 1; // ignore null + static const char kCRXExtension[] = ".crx"; + // Ignore null by subtracting 1. + static const int kExtensionLength = arraysize(kCRXExtension) - 1; if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == url.path().size() - kExtensionLength) { counter->Add(1); @@ -470,7 +527,8 @@ static bool SniffCRX(const char* content, size_t content_size, const GURL& url, return false; } - if (CheckForMagicNumbers(content, content_size, + *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); + if (CheckForMagicNumbers(content, size, kCRXMagicNumbers, arraysize(kCRXMagicNumbers), NULL, result)) { counter->Add(2); @@ -535,17 +593,14 @@ bool SniffMimeType(const char* content, size_t content_size, DCHECK(content); DCHECK(result); + // By default, we assume we have enough content. + // Each sniff routine may unset this if it wasn't provided enough content. + bool have_enough_content = true; + // By default, we'll return the type hint. + // Each sniff routine may modify this if it has a better guess. result->assign(type_hint); - // Flag for tracking whether our decision was limited by content_size. We - // probably have enough content if we can use all our magic numbers. - const bool have_enough_content = content_size >= kBytesRequiredForMagic; - - // We have an upper limit on the number of bytes we will consider. 
- if (content_size > kMaxBytesToSniff) - content_size = kMaxBytesToSniff; - // Cache information about the type_hint const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint); @@ -554,34 +609,41 @@ bool SniffMimeType(const char* content, size_t content_size, // We're only willing to sniff HTML if the server has not supplied a mime // type, or if the type it did supply indicates that it doesn't know what // the type should be. - if (SniffForHTML(content, content_size, result)) + if (SniffForHTML(content, content_size, &have_enough_content, result)) return true; // We succeeded in sniffing HTML. No more content needed. } - // We'll reuse this information later + // We're only willing to sniff for binary in 3 cases: + // 1. The server has not supplied a mime type. + // 2. The type it did supply indicates that it doesn't know what the type + // should be. + // 3. The type is "text/plain" which is the default on some web servers and + // could be indicative of a mis-configuration that we shield the user from. const bool hint_is_text_plain = (type_hint == "text/plain"); - const bool looks_binary = LooksBinary(content, content_size); - - if (hint_is_text_plain && !looks_binary) { - // The server said the content was text/plain and we don't really have any - // evidence otherwise. - result->assign("text/plain"); - return have_enough_content; + if (hint_is_unknown_mime_type || hint_is_text_plain) { + if (!SniffBinary(content, content_size, &have_enough_content, result)) { + // If the server said the content was text/plain and it doesn't appear + // to be binary, then we trust it. + if (hint_is_text_plain) { + return have_enough_content; + } + } } // If we have plain XML, sniff XML subtypes. if (type_hint == "text/xml" || type_hint == "application/xml") { // We're not interested in sniffing these types for images and the like. - // Instead, we're looking explicitly for a feed. If we don't find one we're - // done and return early. 
- if (SniffXML(content, content_size, result)) + // Instead, we're looking explicitly for a feed. If we don't find one + // we're done and return early. + if (SniffXML(content, content_size, &have_enough_content, result)) return true; - return content_size >= kMaxBytesToSniff; + return have_enough_content; } // CRX files (chrome extensions) have a special sniffing algorithm. It is // tighter than the others because we don't have to match legacy behavior. - if (SniffCRX(content, content_size, url, type_hint, result)) + if (SniffCRX(content, content_size, url, type_hint, + &have_enough_content, result)) return true; // We're not interested in sniffing for magic numbers when the type_hint @@ -591,21 +653,10 @@ bool SniffMimeType(const char* content, size_t content_size, // Now we look in our large table of magic numbers to see if we can find // anything that matches the content. - if (SniffForMagicNumbers(content, content_size, result)) + if (SniffForMagicNumbers(content, content_size, + &have_enough_content, result)) return true; // We've matched a magic number. No more content needed. - // Having failed thus far, we're willing to override unknown mime types and - // text/plain. - if (hint_is_unknown_mime_type || hint_is_text_plain) { - if (looks_binary) - result->assign("application/octet-stream"); - else - result->assign("text/plain"); - // We could change our mind if a binary-looking byte appears later in - // the content, so we only have enough content if we have the max. - return content_size >= kMaxBytesToSniff; - } - return have_enough_content; } diff --git a/net/base/mime_sniffer.h b/net/base/mime_sniffer.h index 6fd7014..d0c4e78 100644 --- a/net/base/mime_sniffer.h +++ b/net/base/mime_sniffer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -11,6 +11,12 @@ class GURL; namespace net { +// The maximum number of bytes used by any internal mime sniffing routine. May +// be useful for callers to determine an efficient buffer size to pass to +// |SniffMimeType|. +// This must be updated if any internal sniffing routine needs more bytes. +const int kMaxBytesToSniff = 1024; + // Examine the URL and the mime_type and decide whether we should sniff a // replacement mime type from the content. // diff --git a/net/base/mime_sniffer_unittest.cc b/net/base/mime_sniffer_unittest.cc index 56dfd51..d70cb23 100644 --- a/net/base/mime_sniffer_unittest.cc +++ b/net/base/mime_sniffer_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -149,6 +149,9 @@ TEST(MimeSnifferTest, ChromeExtensionsTest) { { "Cr24\x02\x00\x00\x01", sizeof("Cr24\x02\x00\x00\x01")-1, "http://www.example.com/foo.crx?monkey", "", "application/octet-stream" }, + { "PADDING_Cr24\x02\x00\x00\x00", sizeof("PADDING_Cr24\x02\x00\x00\x00")-1, + "http://www.example.com/foo.crx?monkey", + "", "application/octet-stream" }, }; TestArray(tests, arraysize(tests)); @@ -359,16 +362,33 @@ TEST(MimeSnifferTest, XMLTest) { } -// Test content which is >= 512 bytes, and includes no open angle bracket. +// Test content which is >= 1024 bytes, and includes no open angle bracket. // http://code.google.com/p/chromium/issues/detail?id=3521 TEST(MimeSnifferTest, XMLTestLargeNoAngledBracket) { - // Make a large input, with 600 bytes of "x". + // Make a large input, with 1024 bytes of "x". std::string content; - content.resize(600); + content.resize(1024); std::fill(content.begin(), content.end(), 'x'); - // content.size() >= kMaxBytesToSniff (512) so the sniff is unambiguous. + // content.size() >= 1024 so the sniff is unambiguous. 
std::string mime_type; EXPECT_TRUE(net::SniffMimeType(content.data(), content.size(), GURL(), "text/xml", &mime_type)); + EXPECT_EQ("text/xml", mime_type); +} + +// Test content which is >= 1024 bytes, and includes a binary looking byte. +// http://code.google.com/p/chromium/issues/detail?id=15314 +TEST(MimeSnifferTest, LooksBinary) { + // Make a large input, with 1024 bytes of "x" and 1 byte of 0x01. + std::string content; + content.resize(1024); + std::fill(content.begin(), content.end(), 'x'); + content[1000] = 0x01; + + // content.size() >= 1024 so the sniff is unambiguous. + std::string mime_type; + EXPECT_TRUE(net::SniffMimeType(content.data(), content.size(), GURL(), + "text/plain", &mime_type)); + EXPECT_EQ("application/octet-stream", mime_type); } |