summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorabarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-03-16 06:34:56 +0000
committerabarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-03-16 06:34:56 +0000
commit2e7aff66fe443c29b2fc14a776dca5512b0b4729 (patch)
tree0195994507e51a675c488718dd50e7bd997d889f /net
parent791199c674cd1aa2997e39079e33681596866fc8 (diff)
downloadchromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.zip
chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.gz
chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.bz2
Increase the mime sniffer's limit for searching for binary looking bytes from
512 to 1024 bytes. Contributed by tonyg@chromium.org BUG=15314 TEST=net_unittests --gtest_filter=MimeSnifferTest.LooksBinary git-svn-id: svn://svn.chromium.org/chrome/trunk/src@41687 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net')
-rw-r--r--net/base/mime_sniffer.cc169
-rw-r--r--net/base/mime_sniffer.h8
-rw-r--r--net/base/mime_sniffer_unittest.cc30
3 files changed, 142 insertions, 65 deletions
diff --git a/net/base/mime_sniffer.cc b/net/base/mime_sniffer.cc
index 1950c4b..1961107 100644
--- a/net/base/mime_sniffer.cc
+++ b/net/base/mime_sniffer.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -105,9 +105,6 @@
namespace net {
-// We aren't interested in looking at more than 512 bytes of content
-static const size_t kMaxBytesToSniff = 512;
-
// The number of content bytes we need to use all our magic numbers. Feel free
// to increase this number if you add a longer magic number.
static const size_t kBytesRequiredForMagic = 42;
@@ -224,7 +221,7 @@ static bool MatchMagicNumber(const char* content, size_t size,
const size_t len = magic_entry->magic_len;
// Keep kBytesRequiredForMagic honest.
- DCHECK(len <= kBytesRequiredForMagic);
+ DCHECK_LE(len, kBytesRequiredForMagic);
// To compare with magic strings, we need to compute strlen(content), but
// content might not actually have a null terminator. In that case, we
@@ -263,8 +260,29 @@ static bool CheckForMagicNumbers(const char* content, size_t size,
return false;
}
-static bool SniffForHTML(const char* content, size_t size,
+// Truncates |size| to |max_size| and returns true if |size| is at least
+// |max_size|.
+static bool TruncateSize(const size_t max_size, size_t* size) {
+ // Keep kMaxBytesToSniff honest.
+ DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
+
+ if (*size >= max_size) {
+ *size = max_size;
+ return true;
+ }
+ return false;
+}
+
+// Returns true and sets result if the content appears to be HTML.
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffForHTML(const char* content,
+ size_t size,
+ bool* have_enough_content,
std::string* result) {
+ // For HTML, we are willing to consider up to 512 bytes. This may be overly
+ // conservative as IE only considers 256.
+ *have_enough_content &= TruncateSize(512, &size);
+
// We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
// but with some modifications to better match the HTML5 spec.
const char* const end = content + size;
@@ -282,8 +300,14 @@ static bool SniffForHTML(const char* content, size_t size,
counter.get(), result);
}
-static bool SniffForMagicNumbers(const char* content, size_t size,
+// Returns true and sets result if the content matches any of kMagicNumbers.
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffForMagicNumbers(const char* content,
+ size_t size,
+ bool* have_enough_content,
std::string* result) {
+ *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
+
// Check our big table of Magic Numbers
static scoped_refptr<Histogram> counter =
UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
@@ -305,18 +329,22 @@ static const MagicNumber kMagicXML[] = {
MAGIC_STRING("application/rss+xml", "<rss") // UTF-8
};
-// Sniff an XML document to judge whether it contains XHTML or a feed.
-// Returns true if it has seen enough content to make a definitive decision.
+// Returns true and sets result if the content appears to contain XHTML or a
+// feed.
+// Clears have_enough_content if more data could possibly change the result.
+//
// TODO(evanm): this is similar but more conservative than what Safari does,
// while HTML5 has a different recommendation -- what should we do?
// TODO(evanm): this is incorrect for documents whose encoding isn't a superset
// of ASCII -- do we care?
-static bool SniffXML(const char* content, size_t size, std::string* result) {
- // We allow at most kFirstTagBytes bytes of content before we expect the
- // opening tag.
- const size_t kFeedAllowedHeaderBytes = 300;
- const char* const end = content + std::min(size, kFeedAllowedHeaderBytes);
+static bool SniffXML(const char* content,
+ size_t size,
+ bool* have_enough_content,
+ std::string* result) {
+ // We allow at most 300 bytes of content before we expect the opening tag.
+ *have_enough_content &= TruncateSize(300, &size);
const char* pos = content;
+ const char* const end = content + size;
// This loop iterates through tag-looking offsets in the file.
// We want to skip XML processing instructions (of the form "<?xml ...")
@@ -389,7 +417,22 @@ static char kByteLooksBinary[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF
};
-static bool LooksBinary(const char* content, size_t size) {
+// Returns true and sets result to "application/octet-stream" if the content
+// appears to be binary data. Otherwise, returns false and sets result to
+// "text/plain".
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffBinary(const char* content,
+ size_t size,
+ bool* have_enough_content,
+ std::string* result) {
+ // There is no consensus about exactly how to sniff for binary content.
+ // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
+ // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
+ // Here, we side with FF, but with a smaller buffer. This size was chosen
+ // because it is small enough to comfortably fit into a single packet (after
+ // allowing for headers) and yet large enough to account for binary formats
+ // that have a significant amount of ASCII at the beginning (crbug.com/15314).
+ const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
+
// First, we look for a BOM.
static scoped_refptr<Histogram> counter =
UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
@@ -399,17 +442,24 @@ static bool LooksBinary(const char* content, size_t size) {
kByteOrderMark, arraysize(kByteOrderMark),
counter.get(), &unused)) {
// If there is BOM, we think the buffer is not binary.
+ result->assign("text/plain");
return false;
}
// Next we look to see if any of the bytes "look binary."
for (size_t i = 0; i < size; ++i) {
// If we see a binary-looking byte, we think the content is binary.
- if (kByteLooksBinary[static_cast<unsigned char>(content[i])])
+ if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
+ result->assign("application/octet-stream");
return true;
+ }
}
- // No evidence either way, default to non-binary.
+ // No evidence either way. Default to non-binary and, if fewer than
+ // kMaxBytesToSniff bytes were available, clear have_enough_content because
+ // a binary-looking byte could still appear in data we have not yet seen.
+ *have_enough_content &= is_truncated;
+ result->assign("text/plain");
return false;
}
@@ -443,9 +493,15 @@ static bool IsUnknownMimeType(const std::string& mime_type) {
return false;
}
-// Sniff a crx (chrome extension) file.
-static bool SniffCRX(const char* content, size_t content_size, const GURL& url,
- const std::string& type_hint, std::string* result) {
+// Returns true and sets result if the content appears to be a crx (chrome
+// extension) file.
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffCRX(const char* content,
+ size_t size,
+ const GURL& url,
+ const std::string& type_hint,
+ bool* have_enough_content,
+ std::string* result) {
static scoped_refptr<Histogram> counter =
UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
@@ -456,13 +512,14 @@ static bool SniffCRX(const char* content, size_t content_size, const GURL& url,
//
// TODO(aa): If we ever have another magic number, we'll want to pass a
// histogram into CheckForMagicNumbers(), below, to see which one matched.
- const struct MagicNumber kCRXMagicNumbers[] = {
+ static const struct MagicNumber kCRXMagicNumbers[] = {
MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
};
// Only consider files that have the extension ".crx".
- const char kCRXExtension[] = ".crx";
- const int kExtensionLength = arraysize(kCRXExtension) - 1; // ignore null
+ static const char kCRXExtension[] = ".crx";
+ // Ignore null by subtracting 1.
+ static const int kExtensionLength = arraysize(kCRXExtension) - 1;
if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
url.path().size() - kExtensionLength) {
counter->Add(1);
@@ -470,7 +527,8 @@ static bool SniffCRX(const char* content, size_t content_size, const GURL& url,
return false;
}
- if (CheckForMagicNumbers(content, content_size,
+ *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
+ if (CheckForMagicNumbers(content, size,
kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
NULL, result)) {
counter->Add(2);
@@ -535,17 +593,14 @@ bool SniffMimeType(const char* content, size_t content_size,
DCHECK(content);
DCHECK(result);
+ // By default, we assume we have enough content.
+ // Each sniff routine may unset this if it wasn't provided enough content.
+ bool have_enough_content = true;
+
// By default, we'll return the type hint.
+ // Each sniff routine may modify this if it has a better guess.
result->assign(type_hint);
- // Flag for tracking whether our decision was limited by content_size. We
- // probably have enough content if we can use all our magic numbers.
- const bool have_enough_content = content_size >= kBytesRequiredForMagic;
-
- // We have an upper limit on the number of bytes we will consider.
- if (content_size > kMaxBytesToSniff)
- content_size = kMaxBytesToSniff;
-
// Cache information about the type_hint
const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
@@ -554,34 +609,41 @@ bool SniffMimeType(const char* content, size_t content_size,
// We're only willing to sniff HTML if the server has not supplied a mime
// type, or if the type it did supply indicates that it doesn't know what
// the type should be.
- if (SniffForHTML(content, content_size, result))
+ if (SniffForHTML(content, content_size, &have_enough_content, result))
return true; // We succeeded in sniffing HTML. No more content needed.
}
- // We'll reuse this information later
+ // We're only willing to sniff for binary in 3 cases:
+ // 1. The server has not supplied a mime type.
+ // 2. The type it did supply indicates that it doesn't know what the type
+ // should be.
+ // 3. The type is "text/plain" which is the default on some web servers and
+ // could be indicative of a mis-configuration that we shield the user from.
const bool hint_is_text_plain = (type_hint == "text/plain");
- const bool looks_binary = LooksBinary(content, content_size);
-
- if (hint_is_text_plain && !looks_binary) {
- // The server said the content was text/plain and we don't really have any
- // evidence otherwise.
- result->assign("text/plain");
- return have_enough_content;
+ if (hint_is_unknown_mime_type || hint_is_text_plain) {
+ if (!SniffBinary(content, content_size, &have_enough_content, result)) {
+ // If the server said the content was text/plain and it doesn't appear
+ // to be binary, then we trust it.
+ if (hint_is_text_plain) {
+ return have_enough_content;
+ }
+ }
}
// If we have plain XML, sniff XML subtypes.
if (type_hint == "text/xml" || type_hint == "application/xml") {
// We're not interested in sniffing these types for images and the like.
- // Instead, we're looking explicitly for a feed. If we don't find one we're
- // done and return early.
- if (SniffXML(content, content_size, result))
+ // Instead, we're looking explicitly for a feed. If we don't find one
+ // we're done and return early.
+ if (SniffXML(content, content_size, &have_enough_content, result))
return true;
- return content_size >= kMaxBytesToSniff;
+ return have_enough_content;
}
// CRX files (chrome extensions) have a special sniffing algorithm. It is
// tighter than the others because we don't have to match legacy behavior.
- if (SniffCRX(content, content_size, url, type_hint, result))
+ if (SniffCRX(content, content_size, url, type_hint,
+ &have_enough_content, result))
return true;
// We're not interested in sniffing for magic numbers when the type_hint
@@ -591,21 +653,10 @@ bool SniffMimeType(const char* content, size_t content_size,
// Now we look in our large table of magic numbers to see if we can find
// anything that matches the content.
- if (SniffForMagicNumbers(content, content_size, result))
+ if (SniffForMagicNumbers(content, content_size,
+ &have_enough_content, result))
return true; // We've matched a magic number. No more content needed.
- // Having failed thus far, we're willing to override unknown mime types and
- // text/plain.
- if (hint_is_unknown_mime_type || hint_is_text_plain) {
- if (looks_binary)
- result->assign("application/octet-stream");
- else
- result->assign("text/plain");
- // We could change our mind if a binary-looking byte appears later in
- // the content, so we only have enough content if we have the max.
- return content_size >= kMaxBytesToSniff;
- }
-
return have_enough_content;
}
diff --git a/net/base/mime_sniffer.h b/net/base/mime_sniffer.h
index 6fd7014..d0c4e78 100644
--- a/net/base/mime_sniffer.h
+++ b/net/base/mime_sniffer.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -11,6 +11,12 @@ class GURL;
namespace net {
+// The maximum number of bytes used by any internal mime sniffing routine. May
+// be useful for callers to determine an efficient buffer size to pass to
+// |SniffMimeType|.
+// This must be updated if any internal sniffing routine needs more bytes.
+const int kMaxBytesToSniff = 1024;
+
// Examine the URL and the mime_type and decide whether we should sniff a
// replacement mime type from the content.
//
diff --git a/net/base/mime_sniffer_unittest.cc b/net/base/mime_sniffer_unittest.cc
index 56dfd51..d70cb23 100644
--- a/net/base/mime_sniffer_unittest.cc
+++ b/net/base/mime_sniffer_unittest.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -149,6 +149,9 @@ TEST(MimeSnifferTest, ChromeExtensionsTest) {
{ "Cr24\x02\x00\x00\x01", sizeof("Cr24\x02\x00\x00\x01")-1,
"http://www.example.com/foo.crx?monkey",
"", "application/octet-stream" },
+ { "PADDING_Cr24\x02\x00\x00\x00", sizeof("PADDING_Cr24\x02\x00\x00\x00")-1,
+ "http://www.example.com/foo.crx?monkey",
+ "", "application/octet-stream" },
};
TestArray(tests, arraysize(tests));
@@ -359,16 +362,33 @@ TEST(MimeSnifferTest, XMLTest) {
}
-// Test content which is >= 512 bytes, and includes no open angle bracket.
+// Test content which is >= 1024 bytes, and includes no open angle bracket.
// http://code.google.com/p/chromium/issues/detail?id=3521
TEST(MimeSnifferTest, XMLTestLargeNoAngledBracket) {
- // Make a large input, with 600 bytes of "x".
+ // Make a large input, with 1024 bytes of "x".
std::string content;
- content.resize(600);
+ content.resize(1024);
std::fill(content.begin(), content.end(), 'x');
- // content.size() >= kMaxBytesToSniff (512) so the sniff is unambiguous.
+ // content.size() >= 1024 so the sniff is unambiguous.
std::string mime_type;
EXPECT_TRUE(net::SniffMimeType(content.data(), content.size(), GURL(),
"text/xml", &mime_type));
+ EXPECT_EQ("text/xml", mime_type);
+}
+
+// Test content which is >= 1024 bytes, and includes a binary looking byte.
+// http://code.google.com/p/chromium/issues/detail?id=15314
+TEST(MimeSnifferTest, LooksBinary) {
+ // Make a large input of 1024 bytes: all "x" except a single 0x01 byte at
+ // offset 1000.
+ std::string content;
+ content.resize(1024);
+ std::fill(content.begin(), content.end(), 'x');
+ content[1000] = 0x01;
+
+ // content.size() >= 1024 so the sniff is unambiguous.
+ std::string mime_type;
+ EXPECT_TRUE(net::SniffMimeType(content.data(), content.size(), GURL(),
+ "text/plain", &mime_type));
+ EXPECT_EQ("application/octet-stream", mime_type);
}