Increase the mime sniffer's limit for searching for binary looking bytes from 512 to 1024 bytes.

Contributed by tonyg@chromium.org BUG=15314 TEST=net_unittests --gtest_filter=MimeSnifferTest.LooksBinary git-svn-id: svn://svn.chromium.org/chrome/trunk/src@41687 0039d316-1c4b-4281-b951-d872f2087c98
author: abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-03-16 06:34:56 +0000
committer: abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-03-16 06:34:56 +0000
commit: 2e7aff66fe443c29b2fc14a776dca5512b0b4729 (patch)
tree: 0195994507e51a675c488718dd50e7bd997d889f /net
parent: 791199c674cd1aa2997e39079e33681596866fc8 (diff)
download: chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.zip
chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.gz
chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.bz2
3 files changed, 142 insertions, 65 deletions
diff --git a/net/base/mime_sniffer.cc b/net/base/mime_sniffer.cc
index 1950c4b..1961107 100644
--- a/net/base/mime_sniffer.cc
+++ b/net/base/mime_sniffer.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -105,9 +105,6 @@
 
 namespace net {
 
-// We aren't interested in looking at more than 512 bytes of content
-static const size_t kMaxBytesToSniff = 512;
-
 // The number of content bytes we need to use all our magic numbers.  Feel free
 // to increase this number if you add a longer magic number.
 static const size_t kBytesRequiredForMagic = 42;
@@ -224,7 +221,7 @@ static bool MatchMagicNumber(const char* content, size_t size,
   const size_t len = magic_entry->magic_len;
 
   // Keep kBytesRequiredForMagic honest.
-  DCHECK(len <= kBytesRequiredForMagic);
+  DCHECK_LE(len, kBytesRequiredForMagic);
 
   // To compare with magic strings, we need to compute strlen(content), but
   // content might not actually have a null terminator.  In that case, we
@@ -263,8 +260,29 @@ static bool CheckForMagicNumbers(const char* content, size_t size,
   return false;
 }
 
-static bool SniffForHTML(const char* content, size_t size,
+// Truncates |size| to |max_size| and returns true if |size| is at least
+// |max_size|.
+static bool TruncateSize(const size_t max_size, size_t* size) {
+  // Keep kMaxBytesToSniff honest.
+  DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
+
+  if (*size >= max_size) {
+    *size = max_size;
+    return true;
+  }
+  return false;
+}
+
+// Returns true and sets result if the content appears to be HTML.
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffForHTML(const char* content,
+                         size_t size,
+                         bool* have_enough_content,
                          std::string* result) {
+  // For HTML, we are willing to consider up to 512 bytes. This may be overly
+  // conservative as IE only considers 256.
+  *have_enough_content &= TruncateSize(512, &size);
+
   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
   // but with some modifications to better match the HTML5 spec.
   const char* const end = content + size;
@@ -282,8 +300,14 @@ static bool SniffForHTML(const char* content, size_t size,
                               counter.get(), result);
 }
 
-static bool SniffForMagicNumbers(const char* content, size_t size,
+// Returns true and sets result if the content matches any of kMagicNumbers.
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffForMagicNumbers(const char* content,
+                                 size_t size,
+                                 bool* have_enough_content,
                                  std::string* result) {
+  *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
+
   // Check our big table of Magic Numbers
   static scoped_refptr<Histogram> counter =
       UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
@@ -305,18 +329,22 @@ static const MagicNumber kMagicXML[] = {
   MAGIC_STRING("application/rss+xml", "<rss")  // UTF-8
 };
 
-// Sniff an XML document to judge whether it contains XHTML or a feed.
-// Returns true if it has seen enough content to make a definitive decision.
+// Returns true and sets result if the content appears to contain XHTML or a
+// feed.
+// Clears have_enough_content if more data could possibly change the result.
+//
 // TODO(evanm): this is similar but more conservative than what Safari does,
 // while HTML5 has a different recommendation -- what should we do?
 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
 // of ASCII -- do we care?
-static bool SniffXML(const char* content, size_t size, std::string* result) {
-  // We allow at most kFirstTagBytes bytes of content before we expect the
-  // opening tag.
-  const size_t kFeedAllowedHeaderBytes = 300;
-  const char* const end = content + std::min(size, kFeedAllowedHeaderBytes);
+static bool SniffXML(const char* content,
+                     size_t size,
+                     bool* have_enough_content,
+                     std::string* result) {
+  // We allow at most 300 bytes of content before we expect the opening tag.
+  *have_enough_content &= TruncateSize(300, &size);
   const char* pos = content;
+  const char* const end = content + size;
 
   // This loop iterates through tag-looking offsets in the file.
   // We want to skip XML processing instructions (of the form "<?xml ...")
@@ -389,7 +417,22 @@ static char kByteLooksBinary[] = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
 };
 
-static bool LooksBinary(const char* content, size_t size) {
+// Returns true and sets result to "application/octet-stream" if the content
+// appears to be binary data. Otherwise, returns false and sets "text/plain".
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffBinary(const char* content,
+                        size_t size,
+                        bool* have_enough_content,
+                        std::string* result) {
+  // There is no consensus about exactly how to sniff for binary content.
+  // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
+  // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
+  // Here, we side with FF, but with a smaller buffer. This size was chosen
+  // because it is small enough to comfortably fit into a single packet (after
+  // allowing for headers) and yet large enough to account for binary formats
+  // that have a significant amount of ASCII at the beginning (crbug.com/15314).
+  const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
+
   // First, we look for a BOM.
   static scoped_refptr<Histogram> counter =
       UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
@@ -399,17 +442,24 @@ static bool LooksBinary(const char* content, size_t size) {
                            kByteOrderMark, arraysize(kByteOrderMark),
                            counter.get(), &unused)) {
     // If there is BOM, we think the buffer is not binary.
+    result->assign("text/plain");
     return false;
   }
 
   // Next we look to see if any of the bytes "look binary."
   for (size_t i = 0; i < size; ++i) {
     // If we a see a binary-looking byte, we think the content is binary.
-    if (kByteLooksBinary[static_cast<unsigned char>(content[i])])
+    if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) {
+      result->assign("application/octet-stream");
       return true;
+    }
   }
 
-  // No evidence either way, default to non-binary.
+  // No evidence either way. Default to non-binary and, unless the content was
+  // truncated (i.e. we already saw kMaxBytesToSniff bytes), clear
+  // have_enough_content because more data could contain a binary looking byte.
+  *have_enough_content &= is_truncated;
+  result->assign("text/plain");
   return false;
 }
 
@@ -443,9 +493,15 @@ static bool IsUnknownMimeType(const std::string& mime_type) {
   return false;
 }
 
-// Sniff a crx (chrome extension) file.
-static bool SniffCRX(const char* content, size_t content_size, const GURL& url,
-                     const std::string& type_hint, std::string* result) {
+// Returns true and sets result if the content appears to be a crx (chrome
+// extension) file.
+// Clears have_enough_content if more data could possibly change the result.
+static bool SniffCRX(const char* content,
+                     size_t size,
+                     const GURL& url,
+                     const std::string& type_hint,
+                     bool* have_enough_content,
+                     std::string* result) {
   static scoped_refptr<Histogram> counter =
       UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
 
@@ -456,13 +512,14 @@ static bool SniffCRX(const char* content, size_t content_size, const GURL& url,
   //
   // TODO(aa): If we ever have another magic number, we'll want to pass a
   // histogram into CheckForMagicNumbers(), below, to see which one matched.
-  const struct MagicNumber kCRXMagicNumbers[] = {
+  static const struct MagicNumber kCRXMagicNumbers[] = {
     MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
   };
 
   // Only consider files that have the extension ".crx".
-  const char kCRXExtension[] = ".crx";
-  const int kExtensionLength = arraysize(kCRXExtension) - 1;  // ignore null
+  static const char kCRXExtension[] = ".crx";
+  // Ignore null by subtracting 1.
+  static const int kExtensionLength = arraysize(kCRXExtension) - 1;
   if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
       url.path().size() - kExtensionLength) {
     counter->Add(1);
@@ -470,7 +527,8 @@ static bool SniffCRX(const char* content, size_t content_size, const GURL& url,
     return false;
   }
 
-  if (CheckForMagicNumbers(content, content_size,
+  *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
+  if (CheckForMagicNumbers(content, size,
                            kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
                            NULL, result)) {
     counter->Add(2);
@@ -535,17 +593,14 @@ bool SniffMimeType(const char* content, size_t content_size,
   DCHECK(content);
   DCHECK(result);
 
+  // By default, we assume we have enough content.
+  // Each sniff routine may unset this if it wasn't provided enough content.
+  bool have_enough_content = true;
+
   // By default, we'll return the type hint.
+  // Each sniff routine may modify this if it has a better guess.
   result->assign(type_hint);
 
-  // Flag for tracking whether our decision was limited by content_size.  We
-  // probably have enough content if we can use all our magic numbers.
-  const bool have_enough_content = content_size >= kBytesRequiredForMagic;
-
-  // We have an upper limit on the number of bytes we will consider.
-  if (content_size > kMaxBytesToSniff)
-    content_size = kMaxBytesToSniff;
-
   // Cache information about the type_hint
   const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
 
@@ -554,34 +609,41 @@ bool SniffMimeType(const char* content, size_t content_size,
     // We're only willing to sniff HTML if the server has not supplied a mime
     // type, or if the type it did supply indicates that it doesn't know what
     // the type should be.
-    if (SniffForHTML(content, content_size, result))
+    if (SniffForHTML(content, content_size, &have_enough_content, result))
       return true;  // We succeeded in sniffing HTML.  No more content needed.
   }
 
-  // We'll reuse this information later
+  // We're only willing to sniff for binary in 3 cases:
+  // 1. The server has not supplied a mime type.
+  // 2. The type it did supply indicates that it doesn't know what the type
+  //    should be.
+  // 3. The type is "text/plain" which is the default on some web servers and
+  //    could be indicative of a mis-configuration that we shield the user from.
   const bool hint_is_text_plain = (type_hint == "text/plain");
-  const bool looks_binary = LooksBinary(content, content_size);
-
-  if (hint_is_text_plain && !looks_binary) {
-    // The server said the content was text/plain and we don't really have any
-    // evidence otherwise.
-    result->assign("text/plain");
-    return have_enough_content;
+  if (hint_is_unknown_mime_type || hint_is_text_plain) {
+    if (!SniffBinary(content, content_size, &have_enough_content, result)) {
+      // If the server said the content was text/plain and it doesn't appear
+      // to be binary, then we trust it.
+      if (hint_is_text_plain) {
+        return have_enough_content;
+      }
+    }
   }
 
   // If we have plain XML, sniff XML subtypes.
   if (type_hint == "text/xml" || type_hint == "application/xml") {
     // We're not interested in sniffing these types for images and the like.
-    // Instead, we're looking explicitly for a feed.  If we don't find one we're
-    // done and return early.
-    if (SniffXML(content, content_size, result))
+    // Instead, we're looking explicitly for a feed.  If we don't find one
+    // we're done and return early.
+    if (SniffXML(content, content_size, &have_enough_content, result))
       return true;
-    return content_size >= kMaxBytesToSniff;
+    return have_enough_content;
   }
 
   // CRX files (chrome extensions) have a special sniffing algorithm. It is
   // tighter than the others because we don't have to match legacy behavior.
-  if (SniffCRX(content, content_size, url, type_hint, result))
+  if (SniffCRX(content, content_size, url, type_hint,
+               &have_enough_content, result))
     return true;
 
   // We're not interested in sniffing for magic numbers when the type_hint
@@ -591,21 +653,10 @@ bool SniffMimeType(const char* content, size_t content_size,
 
   // Now we look in our large table of magic numbers to see if we can find
   // anything that matches the content.
-  if (SniffForMagicNumbers(content, content_size, result))
+  if (SniffForMagicNumbers(content, content_size,
+                           &have_enough_content, result))
     return true;  // We've matched a magic number.  No more content needed.
 
-  // Having failed thus far, we're willing to override unknown mime types and
-  // text/plain.
-  if (hint_is_unknown_mime_type || hint_is_text_plain) {
-    if (looks_binary)
-      result->assign("application/octet-stream");
-    else
-      result->assign("text/plain");
-    // We could change our mind if a binary-looking byte appears later in
-    // the content, so we only have enough content if we have the max.
-    return content_size >= kMaxBytesToSniff;
-  }
-
   return have_enough_content;
 }
 
diff --git a/net/base/mime_sniffer.h b/net/base/mime_sniffer.h
index 6fd7014..d0c4e78 100644
--- a/net/base/mime_sniffer.h
+++ b/net/base/mime_sniffer.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -11,6 +11,12 @@ class GURL;
 
 namespace net {
 
+// The maximum number of bytes used by any internal mime sniffing routine. May
+// be useful for callers to determine an efficient buffer size to pass to
+// |SniffMimeType|.
+// This must be updated if any internal sniffing routine needs more bytes.
+const int kMaxBytesToSniff = 1024;
+
 // Examine the URL and the mime_type and decide whether we should sniff a
 // replacement mime type from the content.
 //
diff --git a/net/base/mime_sniffer_unittest.cc b/net/base/mime_sniffer_unittest.cc
index 56dfd51..d70cb23 100644
--- a/net/base/mime_sniffer_unittest.cc
+++ b/net/base/mime_sniffer_unittest.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -149,6 +149,9 @@ TEST(MimeSnifferTest, ChromeExtensionsTest) {
     { "Cr24\x02\x00\x00\x01", sizeof("Cr24\x02\x00\x00\x01")-1,
       "http://www.example.com/foo.crx?monkey",
       "", "application/octet-stream" },
+    { "PADDING_Cr24\x02\x00\x00\x00", sizeof("PADDING_Cr24\x02\x00\x00\x00")-1,
+      "http://www.example.com/foo.crx?monkey",
+      "", "application/octet-stream" },
   };
 
   TestArray(tests, arraysize(tests));
@@ -359,16 +362,33 @@ TEST(MimeSnifferTest, XMLTest) {
 
 }
 
-// Test content which is >= 512 bytes, and includes no open angle bracket.
+// Test content which is >= 1024 bytes, and includes no open angle bracket.
 // http://code.google.com/p/chromium/issues/detail?id=3521
 TEST(MimeSnifferTest, XMLTestLargeNoAngledBracket) {
-  // Make a large input, with 600 bytes of "x".
+  // Make a large input, with 1024 bytes of "x".
   std::string content;
-  content.resize(600);
+  content.resize(1024);
   std::fill(content.begin(), content.end(), 'x');
 
-  // content.size() >= kMaxBytesToSniff (512) so the sniff is unambiguous.
+  // content.size() >= 1024 so the sniff is unambiguous.
   std::string mime_type;
   EXPECT_TRUE(net::SniffMimeType(content.data(), content.size(), GURL(),
                                  "text/xml", &mime_type));
+  EXPECT_EQ("text/xml", mime_type);
+}
+
+// Test content which is >= 1024 bytes, and includes a binary looking byte.
+// http://code.google.com/p/chromium/issues/detail?id=15314
+TEST(MimeSnifferTest, LooksBinary) {
+  // Make a large input of 1024 bytes of "x", with byte 1000 set to 0x01.
+  std::string content;
+  content.resize(1024);
+  std::fill(content.begin(), content.end(), 'x');
+  content[1000] = 0x01;
+
+  // content.size() >= 1024 so the sniff is unambiguous.
+  std::string mime_type;
+  EXPECT_TRUE(net::SniffMimeType(content.data(), content.size(), GURL(),
+                                 "text/plain", &mime_type));
+  EXPECT_EQ("application/octet-stream", mime_type);
 }
author	abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-03-16 06:34:56 +0000
committer	abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-03-16 06:34:56 +0000
commit	2e7aff66fe443c29b2fc14a776dca5512b0b4729 (patch)
tree	0195994507e51a675c488718dd50e7bd997d889f /net
parent	791199c674cd1aa2997e39079e33681596866fc8 (diff)
download	chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.zip chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.gz chromium_src-2e7aff66fe443c29b2fc14a776dca5512b0b4729.tar.bz2