summary | refs | log | tree | commit | diff | stats
path: root/net/base/mime_sniffer.cc
diff options
context:
space:
mode:
author: abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2008-10-09 00:13:41 +0000
committer: abarth@chromium.org <abarth@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2008-10-09 00:13:41 +0000
commit: 02e2a6984d1f3333268bfaa4f3a86f1e4330b337 (patch)
tree: 84ed589b02849f49329c991e4af5fe7452bb4fd1 /net/base/mime_sniffer.cc
parent: 756ff1823a3495f0e145a5bd146f44e5fbfb59c7 (diff)
download: chromium_src-02e2a6984d1f3333268bfaa4f3a86f1e4330b337.zip
chromium_src-02e2a6984d1f3333268bfaa4f3a86f1e4330b337.tar.gz
chromium_src-02e2a6984d1f3333268bfaa4f3a86f1e4330b337.tar.bz2
Optimize the mime sniffer based on UMA data and move closer to the HTML 5 spec.
Removed over 50% of the heuristics while returning the same result 99.996% of the time. Each heuristic we remove reduces the attack surface that honest sites have to worry about when they serve third-party content. (with typo fix this time)
R=darin
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@3082 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/base/mime_sniffer.cc')
-rw-r--r-- net/base/mime_sniffer.cc 102
1 files changed, 30 insertions, 72 deletions
diff --git a/net/base/mime_sniffer.cc b/net/base/mime_sniffer.cc
index 0d9539d..53e765e 100644
--- a/net/base/mime_sniffer.cc
+++ b/net/base/mime_sniffer.cc
@@ -144,25 +144,15 @@ static const MagicNumber kMagicNumbers[] = {
MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
MAGIC_NUMBER("image/bmp", "BM")
// Source: Mozilla
- MAGIC_NUMBER("application/postscript", "%! PS-Adobe-")
- // Mozilla uses "\x4a47????00" for image/x-jg, but we use stronger pattern
- MAGIC_NUMBER("image/x-icon", "\x00\x00\x10\x00")
- MAGIC_NUMBER("image/x-icon", "\x00\x00\x20\x00")
- MAGIC_NUMBER("image/x-xbitmap", "#define ")
- MAGIC_NUMBER("text/plain", "#!") // Script
- MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS
- MAGIC_NUMBER("text/plain", "From")
+ MAGIC_NUMBER("text/plain", "#!") // Script
+ MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS
+ MAGIC_NUMBER("text/plain", "From")
MAGIC_NUMBER("text/plain", ">From")
// Chrome specific
- MAGIC_NUMBER("image/x-rgb", "\x01\xDA\x01\x01\x00\x03")
MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
- MAGIC_NUMBER("application/x-compress", "\x1F\x9D\x90") // tar.Z
MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
MAGIC_NUMBER("video/x-ms-asf",
"\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
- MAGIC_NUMBER("application/winhlp", "?_\x03")
- MAGIC_NUMBER("application/winhlp", "LN\x02\x00")
- MAGIC_NUMBER("application/x-bzip2", "BZ")
MAGIC_NUMBER("image/tiff", "I I")
MAGIC_NUMBER("image/tiff", "II*")
MAGIC_NUMBER("image/tiff", "MM\x00*")
@@ -171,18 +161,9 @@ static const MagicNumber kMagicNumbers[] = {
// MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB")
// MAGIC_NUMBER("audio/mpeg", "\xFF\xE")
// MAGIC_NUMBER("audio/mpeg", "\xFF\xF")
- MAGIC_NUMBER("image/x-jg", "\x4A\x47\x03\x0E\x00\x00\x00")
- MAGIC_NUMBER("image/x-jg", "\x4A\x47\x04\x0E\x00\x00\x00")
- MAGIC_NUMBER("image/x-portable-graymap", "P4\x0A")
MAGIC_NUMBER("application/zip", "PK\x03\x04")
MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
- MAGIC_NUMBER("application/rtf", "{\\rtf1")
- MAGIC_NUMBER("application/postscript", "\xC5\xD0\xD3\xC6")
MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
- MAGIC_NUMBER("application/octet-stream", "\x7F" "ELF") // ELF
- MAGIC_NUMBER("application/octet-stream", "\xE8") // COM, SYS
- MAGIC_NUMBER("application/octet-stream", "\xE9") // COM, SYS
- MAGIC_NUMBER("application/octet-stream", "\xEB") // COM, SYS
MAGIC_NUMBER("application/octet-stream", "MZ") // EXE
// Sniffing for Flash:
//
@@ -218,53 +199,23 @@ static const MagicNumber kSniffableTags[] = {
MAGIC_NUMBER("text/xml", "<?xml") // Mozilla
// DOCTYPEs
MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec
- // Sniffable tags, ordered by how often they occur in web documents with a
- // sniffable mime type (as measured in 2007).
+ // Sniffable tags, ordered by how often they occur in sniffable documents.
+ MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla
MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla
+ MAGIC_HTML_TAG("!--")
MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla
- MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla
- MAGIC_HTML_TAG("tr")
- MAGIC_HTML_TAG("link") // Mozilla
- MAGIC_HTML_TAG("meta") // Mozilla
- MAGIC_HTML_TAG("title") // Mozilla
- MAGIC_HTML_TAG("pre") // Mozilla
- MAGIC_HTML_TAG("table") // Mozilla
- MAGIC_HTML_TAG("basefont")
- // Not HTML: "xml"
- MAGIC_HTML_TAG("p") // Mozilla
+ MAGIC_HTML_TAG("iframe") // Mozilla
+ MAGIC_HTML_TAG("h1") // Mozilla
MAGIC_HTML_TAG("div") // Mozilla
- MAGIC_HTML_TAG("base") // Mozilla
- // Not HTML: "metadata"
- MAGIC_HTML_TAG("body") // Mozilla
- // Not HTML: "asx"
- MAGIC_HTML_TAG("frameset") // Mozilla
- // Not HTML: "sami"
+ MAGIC_HTML_TAG("font") // Mozilla
+ MAGIC_HTML_TAG("table") // Mozilla
MAGIC_HTML_TAG("a") // Mozilla
MAGIC_HTML_TAG("style") // Mozilla
- // Not HTML: "rss"
- MAGIC_HTML_TAG("br")
- MAGIC_HTML_TAG("center") // Mozilla
+ MAGIC_HTML_TAG("title") // Mozilla
MAGIC_HTML_TAG("b") // Mozilla
- MAGIC_HTML_TAG("iframe") // Mozilla
- MAGIC_HTML_TAG("img") // Mozilla
- MAGIC_HTML_TAG("h1") // Mozilla
- MAGIC_HTML_TAG("td")
- // Not HTML: "printer"
- MAGIC_HTML_TAG("font") // Mozilla
- // Not HTML: "htlm"
- MAGIC_HTML_TAG("form") // Mozilla
- // Not HTML: "master"
- MAGIC_HTML_TAG("h3") // Mozilla
- MAGIC_HTML_TAG("h2") // Mozilla
- // Plus a long tail, but we need to stop somewhere.
- //
- // We also include all the other tags that Mozilla sniffs:
- MAGIC_HTML_TAG("!--")
- MAGIC_HTML_TAG("applet")
- MAGIC_HTML_TAG("isindex")
- MAGIC_HTML_TAG("h4")
- MAGIC_HTML_TAG("h5")
- MAGIC_HTML_TAG("h6")
+ MAGIC_HTML_TAG("body") // Mozilla
+ MAGIC_HTML_TAG("br")
+ MAGIC_HTML_TAG("p") // Mozilla
};
static bool MatchMagicNumber(const char* content, size_t size,
@@ -322,7 +273,7 @@ static bool SniffForHTML(const char* content, size_t size,
if (!IsAsciiWhitespace(*pos))
break;
}
- static SnifferHistogram counter(L"mime_sniffer.kSniffableTags",
+ static SnifferHistogram counter(L"mime_sniffer.kSniffableTags2",
arraysize(kSniffableTags));
// |pos| now points to first non-whitespace character (or at end).
return CheckForMagicNumbers(pos, end - pos,
@@ -333,7 +284,7 @@ static bool SniffForHTML(const char* content, size_t size,
static bool SniffForMagicNumbers(const char* content, size_t size,
std::string* result) {
// Check our big table of Magic Numbers
- static SnifferHistogram counter(L"mime_sniffer.kMagicNumbers",
+ static SnifferHistogram counter(L"mime_sniffer.kMagicNumbers2",
arraysize(kMagicNumbers));
return CheckForMagicNumbers(content, size,
kMagicNumbers, arraysize(kMagicNumbers),
@@ -369,7 +320,7 @@ static bool SniffXML(const char* content, size_t size, std::string* result) {
// We want to skip XML processing instructions (of the form "<?xml ...")
// and stop at the first "plain" tag, then make a decision on the mime-type
// based on the name (or possibly attributes) of that tag.
- static SnifferHistogram counter(L"mime_sniffer.kMagicXML",
+ static SnifferHistogram counter(L"mime_sniffer.kMagicXML2",
arraysize(kMagicXML));
const int kMaxTagIterations = 5;
for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
@@ -411,13 +362,12 @@ static const MagicNumber kByteOrderMark[] = {
MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE
MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE
MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8
- MAGIC_NUMBER("text/plain", "\x00\x00\xFE\xFF") // UCS-4BE
};
// Whether a given byte looks like it might be part of binary content.
// Source: HTML5 spec
static char kByteLooksBinary[] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, // 0x00 - 0x0F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F
@@ -437,7 +387,7 @@ static char kByteLooksBinary[] = {
static bool LooksBinary(const char* content, size_t size) {
// First, we look for a BOM.
- static SnifferHistogram counter(L"mime_sniffer.kByteOrderMark",
+ static SnifferHistogram counter(L"mime_sniffer.kByteOrderMark2",
arraysize(kByteOrderMark));
std::string unused;
if (CheckForMagicNumbers(content, size,
@@ -460,6 +410,7 @@ static bool LooksBinary(const char* content, size_t size) {
static bool IsUnknownMimeType(const std::string& mime_type) {
// TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
+ // If we do, please be careful not to alter the semantics at all.
static const char* kUnknownMimeTypes[] = {
// Empty mime types are as unknown as they get.
"",
@@ -470,7 +421,7 @@ static bool IsUnknownMimeType(const std::string& mime_type) {
// Firefox rejects a mime type if it is exactly */*
"*/*",
};
- static SnifferHistogram counter(L"mime_sniffer.kUnknownMimeTypes",
+ static SnifferHistogram counter(L"mime_sniffer.kUnknownMimeTypes2",
arraysize(kUnknownMimeTypes) + 1);
for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
if (mime_type == kUnknownMimeTypes[i]) {
@@ -487,13 +438,17 @@ static bool IsUnknownMimeType(const std::string& mime_type) {
}
bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
+ static SnifferHistogram should_sniff_counter(
+ L"mime_sniffer.ShouldSniffMimeType2", 3);
// We are willing to sniff the mime type for HTTP, HTTPS, and FTP
bool sniffable_scheme = url.is_empty() ||
url.SchemeIs("http") ||
url.SchemeIs("https") ||
url.SchemeIs("ftp");
- if (!sniffable_scheme)
+ if (!sniffable_scheme) {
+ should_sniff_counter.Add(1);
return false;
+ }
static const char* kSniffableTypes[] = {
// Many web servers are misconfigured to send text/plain for many
@@ -508,11 +463,12 @@ bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
"text/xml",
"application/xml",
};
- static SnifferHistogram counter(L"mime_sniffer.kSniffableTypes",
+ static SnifferHistogram counter(L"mime_sniffer.kSniffableTypes2",
arraysize(kSniffableTypes) + 1);
for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
if (mime_type == kSniffableTypes[i]) {
counter.Add(i);
+ should_sniff_counter.Add(2);
return true;
}
}
@@ -520,8 +476,10 @@ bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
// The web server didn't specify a content type or specified a mime
// type that we ignore.
counter.Add(arraysize(kSniffableTypes));
+ should_sniff_counter.Add(2);
return true;
}
+ should_sniff_counter.Add(1);
return false;
}