content/common/cross_site_document_classifier.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245

// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/common/cross_site_document_classifier.h"

#include "base/basictypes.h"
#include "base/command_line.h"
#include "base/lazy_instance.h"
#include "base/logging.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "content/public/common/content_switches.h"
#include "content/public/common/resource_response_info.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/http/http_response_headers.h"

using base::StringPiece;

namespace content {

namespace {

// MIME types
const char kTextHtml[] = "text/html";
const char kTextXml[] = "text/xml";
const char kAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";
const char kTextPlain[] = "text/plain";

bool MatchesSignature(StringPiece data,
                      const StringPiece signatures[],
                      size_t arr_size) {
  size_t offset = data.find_first_not_of(" \t\r\n");
  // There is no not-whitespace character in this document.
  if (offset == base::StringPiece::npos)
    return false;

  data.remove_prefix(offset);
  for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
    if (base::StartsWith(data, signatures[sig_index],
                         base::CompareCase::INSENSITIVE_ASCII))
      return true;
  }
  return false;
}

}  // namespace

CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
    const std::string& mime_type) {
  if (base::LowerCaseEqualsASCII(mime_type, kTextHtml)) {
    return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML;
  }

  if (base::LowerCaseEqualsASCII(mime_type, kTextPlain)) {
    return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN;
  }

  if (base::LowerCaseEqualsASCII(mime_type, kAppJson) ||
      base::LowerCaseEqualsASCII(mime_type, kTextJson) ||
      base::LowerCaseEqualsASCII(mime_type, kTextXjson)) {
    return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON;
  }

  if (base::LowerCaseEqualsASCII(mime_type, kTextXml) ||
      base::LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
      base::LowerCaseEqualsASCII(mime_type, kAppXml)) {
    return CROSS_SITE_DOCUMENT_MIME_TYPE_XML;
  }

  return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS;
}

bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url) {
  // We exclude ftp:// from here. FTP doesn't provide a Content-Type
  // header which our policy depends on, so we cannot protect any
  // document from FTP servers.
  return url.SchemeIs(url::kHttpScheme) || url.SchemeIs(url::kHttpsScheme);
}

bool CrossSiteDocumentClassifier::IsSameSite(const GURL& frame_origin,
                                             const GURL& response_url) {
  if (!frame_origin.is_valid() || !response_url.is_valid())
    return false;

  if (frame_origin.scheme() != response_url.scheme())
    return false;

  // SameDomainOrHost() extracts the effective domains (public suffix plus one)
  // from the two URLs and compare them.
  return net::registry_controlled_domains::SameDomainOrHost(
      frame_origin, response_url,
      net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
}

// We don't use Webkit's existing CORS policy implementation since
// their policy works in terms of origins, not sites. For example,
// when frame is sub.a.com and it is not allowed to access a document
// with sub1.a.com. But under Site Isolation, it's allowed.
bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
    const GURL& frame_origin,
    const GURL& website_origin,
    const std::string& access_control_origin) {
  // Many websites are sending back "\"*\"" instead of "*". This is
  // non-standard practice, and not supported by Chrome. Refer to
  // CrossOriginAccessControl::passesAccessControlCheck().

  // TODO(dsjang): * is not allowed for the response from a request
  // with cookies. This allows for more than what the renderer will
  // eventually be able to receive, so we won't see illegal cross-site
  // documents allowed by this. We have to find a way to see if this
  // response is from a cookie-tagged request or not in the future.
  if (access_control_origin == "*")
    return true;

  // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
  // "*", but many websites are using just a domain for access_control_origin,
  // and this is blocked by Webkit's CORS logic here :
  // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
  // is_valid() to false when it is created from a URL containing * in the
  // domain part.

  GURL cors_origin(access_control_origin);
  return IsSameSite(frame_origin, cors_origin);
}

// This function is a slight modification of |net::SniffForHTML|.
bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data) {
  // The content sniffer used by Chrome and Firefox are using "<!--"
  // as one of the HTML signatures, but it also appears in valid
  // JavaScript, considered as well-formed JS by the browser.  Since
  // we do not want to block any JS, we exclude it from our HTML
  // signatures. This can weaken our document block policy, but we can
  // break less websites.
  // TODO(dsjang): parameterize |net::SniffForHTML| with an option
  // that decides whether to include <!-- or not, so that we can
  // remove this function.
  // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
  // process, we should do single-thread checking here for the static
  // initializer.
  static const StringPiece kHtmlSignatures[] = {
      StringPiece("<!doctype html"),  // HTML5 spec
      StringPiece("<script"),         // HTML5 spec, Mozilla
      StringPiece("<html"),           // HTML5 spec, Mozilla
      StringPiece("<head"),           // HTML5 spec, Mozilla
      StringPiece("<iframe"),         // Mozilla
      StringPiece("<h1"),             // Mozilla
      StringPiece("<div"),            // Mozilla
      StringPiece("<font"),           // Mozilla
      StringPiece("<table"),          // Mozilla
      StringPiece("<a"),              // Mozilla
      StringPiece("<style"),          // Mozilla
      StringPiece("<title"),          // Mozilla
      StringPiece("<b"),              // Mozilla
      StringPiece("<body"),           // Mozilla
      StringPiece("<br"),             // Mozilla
      StringPiece("<p")               // Mozilla
  };

  while (data.length() > 0) {
    if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
      return true;

    // If we cannot find "<!--", we fail sniffing this as HTML.
    static const StringPiece kCommentBegins[] = {StringPiece("<!--")};
    if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
      break;

    // Search for --> and do SniffForHTML after that. If we can find the
    // comment's end, we start HTML sniffing from there again.
    static const char kEndComment[] = "-->";
    size_t offset = data.find(kEndComment);
    if (offset == base::StringPiece::npos)
      break;

    // Proceed to the index next to the ending comment (-->).
    data.remove_prefix(offset + strlen(kEndComment));
  }

  return false;
}

bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data) {
  // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
  // this signature. However, XML is case-sensitive. Don't we have to
  // be more lenient only to block documents starting with the exact
  // string <?xml rather than <?XML ?
  // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
  // process, we should do single-thread checking here for the static
  // initializer.
  static const StringPiece kXmlSignatures[] = {StringPiece("<?xml")};
  return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
}

bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data) {
  // TODO(dsjang): We have to come up with a better way to sniff
  // JSON. However, even RE cannot help us that much due to the fact
  // that we don't do full parsing.  This DFA starts with state 0, and
  // finds {, "/' and : in that order. We're avoiding adding a
  // dependency on a regular expression library.
  enum {
    kStartState,
    kLeftBraceState,
    kLeftQuoteState,
    kColonState,
    kTerminalState,
  } state = kStartState;

  size_t length = data.length();
  for (size_t i = 0; i < length && state < kColonState; ++i) {
    const char c = data[i];
    if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
      continue;

    switch (state) {
      case kStartState:
        if (c == '{')
          state = kLeftBraceState;
        else
          state = kTerminalState;
        break;
      case kLeftBraceState:
        if (c == '\"' || c == '\'')
          state = kLeftQuoteState;
        else
          state = kTerminalState;
        break;
      case kLeftQuoteState:
        if (c == ':')
          state = kColonState;
        break;
      case kColonState:
      case kTerminalState:
        NOTREACHED();
        break;
    }
  }
  return state == kColonState;
}

}  // namespace content