// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/child/site_isolation_policy.h"

#include "base/basictypes.h"
#include "base/command_line.h"
#include "base/lazy_instance.h"
#include "base/logging.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "content/child/child_thread.h"
#include "content/public/common/content_switches.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/http/http_response_headers.h"
#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/platform/WebURL.h"
#include "third_party/WebKit/public/platform/WebURLRequest.h"
#include "third_party/WebKit/public/platform/WebURLResponse.h"
#include "third_party/WebKit/public/web/WebDocument.h"
#include "third_party/WebKit/public/web/WebFrame.h"
#include "third_party/WebKit/public/web/WebFrameClient.h"
#include "third_party/WebKit/public/web/WebSecurityOrigin.h"

using base::StringPiece;
using blink::WebDocument;
using blink::WebString;
using blink::WebURL;
using blink::WebURLResponse;
using blink::WebURLRequest;

namespace content {

namespace {

// Bookkeeping shared between OnReceivedResponse() and ShouldBlockResponse():
// OnReceivedResponse() stores the metadata of a response here, and
// ShouldBlockResponse() consumes (and erases) it when it examines the first
// data packet. The key is a request id maintained by ResourceDispatcher.
static base::LazyInstance<SiteIsolationPolicy::RequestIdToMetaDataMap>
    g_metadata_map = LAZY_INSTANCE_INITIALIZER;

// Caches the blocking decision per request. The decision is made when
// ShouldBlockResponse() is called for the first time for a request, and it
// remains the same for all following data of that request until
// OnRequestComplete() erases the entry. The key is a request id maintained by
// ResourceDispatcher.
static base::LazyInstance<SiteIsolationPolicy::RequestIdToResultMap>
    g_result_map = LAZY_INSTANCE_INITIALIZER;

// The cross-site document blocking/UMA data collection is deactivated by
// default, and only activated in renderer processes. The flag is written by
// SiteIsolationPolicy::SetPolicyEnabled() and checked at the top of every
// public entry point in this file.
static bool g_policy_enabled = false;

// MIME types that GetCanonicalMimeType() maps to a canonical category. All
// literals are lowercase; they are compared via LowerCaseEqualsASCII().
// NOTE(review): xAppRssXml does not follow the k-prefix naming of its
// siblings; renaming it also requires updating GetCanonicalMimeType().
const char kTextHtml[] = "text/html";
const char kTextXml[] = "text/xml";
const char xAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";
const char kTextPlain[] = "text/plain";

// TODO(dsjang): this is only needed for collecting UMA stat. Will be deleted
// when this class is used for actual blocking.
//
// Returns true if |status_code| is one of the HTTP status codes for which
// Chrome uses the content of a response as CSS/JavaScript. For images, Chrome
// just ignores the status code.
bool IsRenderableStatusCode(int status_code) {
  static const int kRenderableStatusCodes[] = {200, 201, 202, 203, 206, 300,
                                               301, 302, 303, 305, 306, 307};
  const int* const codes_end =
      kRenderableStatusCodes + arraysize(kRenderableStatusCodes);
  for (const int* code = kRenderableStatusCodes; code != codes_end; ++code) {
    if (*code == status_code)
      return true;
  }
  return false;
}

// Returns true if |data|, after its leading whitespace (space, tab, CR, LF)
// has been skipped, starts with any of the |arr_size| entries of
// |signatures|. Returns false if |data| contains nothing but whitespace.
//
// The comparison is ASCII case-insensitive on both sides, so a signature may
// be spelled in any case. It reads exactly signature.length() characters of
// each signature, so a signature does not have to be NUL-terminated.
bool MatchesSignature(StringPiece data,
                      const StringPiece signatures[],
                      size_t arr_size) {
  const size_t offset = data.find_first_not_of(" \t\r\n");
  // There is no non-whitespace character in this document.
  if (offset == base::StringPiece::npos)
    return false;

  data.remove_prefix(offset);
  const size_t length = data.length();

  for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
    const StringPiece& signature = signatures[sig_index];
    const size_t signature_length = signature.length();
    // The remaining data is too short to contain this signature.
    if (length < signature_length)
      continue;

    // Compare character by character, folding ASCII upper case to lower case
    // on both sides. Folding is done by hand (not with tolower()) so that the
    // result never depends on the current locale.
    bool matches = true;
    for (size_t i = 0; i < signature_length && matches; ++i) {
      char data_char = data[i];
      char signature_char = signature[i];
      if (data_char >= 'A' && data_char <= 'Z')
        data_char = static_cast<char>(data_char + ('a' - 'A'));
      if (signature_char >= 'A' && signature_char <= 'Z')
        signature_char = static_cast<char>(signature_char + ('a' - 'A'));
      matches = (data_char == signature_char);
    }
    if (matches)
      return true;
  }
  return false;
}

void IncrementHistogramCount(const std::string& name) {
  // The default value of min, max, bucket_count are copied from histogram.h.
  base::HistogramBase* histogram_pointer = base::Histogram::FactoryGet(
      name, 1, 100000, 50, base::HistogramBase::kUmaTargetedHistogramFlag);
  histogram_pointer->Add(1);
}

void IncrementHistogramEnum(const std::string& name,
                          uint32 sample,
                          uint32 boundary_value) {
  // The default value of min, max, bucket_count are copied from histogram.h.
  base::HistogramBase* histogram_pointer = base::LinearHistogram::FactoryGet(
      name,
      1,
      boundary_value,
      boundary_value + 1,
      base::HistogramBase::kUmaTargetedHistogramFlag);
  histogram_pointer->Add(sample);
}

// Records UMA stats for a response that is (or would be) blocked.
// |bucket_prefix| is the histogram name prefix of the sniffed document type,
// |resp_data| is the metadata recorded for the response, and |nosniff_block|
// tells whether the block is caused by the nosniff header rather than by
// content sniffing.
void HistogramCountBlockedResponse(
    const std::string& bucket_prefix,
    const SiteIsolationPolicy::ResponseMetaData& resp_data,
    bool nosniff_block) {
  const std::string blocked_bucket =
      bucket_prefix + (nosniff_block ? ".NoSniffBlocked" : ".Blocked");
  IncrementHistogramCount(blocked_bucket);

  // The content is blocked if it is sniffed as HTML/JSON/XML. Blocking a
  // response that carries a non-renderable status code (e.g., 404) is not
  // disruptive for the following reasons: 1) the blocked content is not a
  // binary object (such as an image) since it is sniffed as text; 2) such
  // blocking can only break the renderer if the content is JavaScript or
  // CSS, but the renderer doesn't use the contents of JS/CSS delivered with
  // a non-renderable status code; 3) the renderer is expected not to use the
  // cross-site document content for purposes other than JS/CSS (e.g., XHR).
  // So the resource type breakdown is recorded for renderable status codes
  // only.
  if (!IsRenderableStatusCode(resp_data.http_status_code)) {
    IncrementHistogramCount(blocked_bucket + ".NonRenderableStatusCode");
    return;
  }

  IncrementHistogramEnum(blocked_bucket + ".RenderableStatusCode",
                         resp_data.resource_type,
                         ResourceType::LAST_TYPE);
}

// Records UMA stats for a response that is not blocked. When |sniffed_as_js|
// is set, the response is additionally counted in the ".MaybeJS" bucket.
void HistogramCountNotBlockedResponse(const std::string& bucket_prefix,
                                      bool sniffed_as_js) {
  const std::string not_blocked_bucket = bucket_prefix + ".NotBlocked";
  IncrementHistogramCount(not_blocked_bucket);

  if (!sniffed_as_js)
    return;

  const std::string maybe_js_bucket = bucket_prefix + ".NotBlocked.MaybeJS";
  IncrementHistogramCount(maybe_js_bucket);
}

}  // namespace

// Default constructor. It has no initializer list and an empty body, so it
// assigns nothing itself; OnReceivedResponse() sets frame_origin,
// response_url, resource_type, canonical_mime_type, http_status_code and
// no_sniff explicitly before storing an instance.
SiteIsolationPolicy::ResponseMetaData::ResponseMetaData() {}

// Turns the cross-site document policy on or off for this process. While the
// flag is false, OnReceivedResponse() and OnRequestComplete() return
// immediately and ShouldBlockResponse() always returns false.
void SiteIsolationPolicy::SetPolicyEnabled(bool enabled) {
  g_policy_enabled = enabled;
}

void SiteIsolationPolicy::OnReceivedResponse(
    int request_id,
    const GURL& frame_origin,
    const GURL& response_url,
    ResourceType::Type resource_type,
    int origin_pid,
    const webkit_glue::ResourceResponseInfo& info) {
  if (!g_policy_enabled)
    return;

  // if |origin_pid| is non-zero, it means that this response is for a plugin
  // spawned from this renderer process. We exclude responses for plugins for
  // now, but eventually, we're going to make plugin processes directly talk to
  // the browser process so that we don't apply cross-site document blocking to
  // them.
  if (origin_pid)
    return;

  UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);

  // See if this is for navigation. If it is, don't block it, under the
  // assumption that we will put it in an appropriate process.
  if (ResourceType::IsFrame(resource_type))
    return;

  if (!IsBlockableScheme(response_url))
    return;

  if (IsSameSite(frame_origin, response_url))
    return;

  SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType canonical_mime_type =
      GetCanonicalMimeType(info.mime_type);

  if (canonical_mime_type == SiteIsolationPolicy::ResponseMetaData::Others)
    return;

  // Every CORS request should have the Access-Control-Allow-Origin header even
  // if it is preceded by a pre-flight request. Therefore, if this is a CORS
  // request, it has this header.  response.httpHeaderField() internally uses
  // case-insensitive matching for the header name.
  std::string access_control_origin;

  // We can use a case-insensitive header name for EnumerateHeader().
  info.headers->EnumerateHeader(
      NULL, "access-control-allow-origin", &access_control_origin);
  if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin))
    return;

  // Real XSD data collection starts from here.
  std::string no_sniff;
  info.headers->EnumerateHeader(NULL, "x-content-type-options", &no_sniff);

  ResponseMetaData resp_data;
  resp_data.frame_origin = frame_origin.spec();
  resp_data.response_url = response_url;
  resp_data.resource_type = resource_type;
  resp_data.canonical_mime_type = canonical_mime_type;
  resp_data.http_status_code = info.headers->response_code();
  resp_data.no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");

  (g_metadata_map.Get())[request_id] = resp_data;
}

bool SiteIsolationPolicy::ShouldBlockResponse(
    int request_id,
    const char* raw_data,
    int raw_length,
    std::string* alternative_data) {
  if (!g_policy_enabled)
    return false;

  RequestIdToMetaDataMap& metadata_map = g_metadata_map.Get();
  RequestIdToResultMap& result_map = g_result_map.Get();

  // If there's an entry for |request_id| in blocked_map, this request's first
  // data packet has already been examined. We can return the result here.
  if (result_map.count(request_id) != 0) {
    if (result_map[request_id]) {
      // Here, the blocking result has been set for the previous run of
      // ShouldBlockResponse(), so we set alternative data to an empty string so
      // that ResourceDispatcher doesn't call its peer's onReceivedData() with
      // the alternative data.
      alternative_data->erase();
      return true;
    }
    return false;
  }

  // If result_map doesn't have an entry for |request_id|, we're receiving the
  // first data packet for request_id. If request_id is not registered, this
  // request is identified as a non-target of our policy. So we return true.
  if (metadata_map.count(request_id) == 0) {
    // We set request_id to true so that we always return true for this request.
    result_map[request_id] = false;
    return false;
  }

  StringPiece data(raw_data, raw_length);

  // We now look at the first data packet received for request_id.
  ResponseMetaData resp_data = metadata_map[request_id];
  metadata_map.erase(request_id);

  // Record the length of the first received network packet to see if it's
  // enough for sniffing.
  UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", raw_length);

  // Record the number of cross-site document responses with a specific mime
  // type (text/html, text/xml, etc).
  UMA_HISTOGRAM_ENUMERATION(
      "SiteIsolation.XSD.MimeType",
      resp_data.canonical_mime_type,
      SiteIsolationPolicy::ResponseMetaData::MaxCanonicalMimeType);

  // Store the result of cross-site document blocking analysis.
  bool is_blocked = false;
  bool sniffed_as_js = SniffForJS(data);

  // Record the number of responses whose content is sniffed for what its mime
  // type claims it to be. For example, we apply a HTML sniffer for a document
  // tagged with text/html here. Whenever this check becomes true, we'll block
  // the response.
  if (resp_data.canonical_mime_type !=
          SiteIsolationPolicy::ResponseMetaData::Plain) {
    std::string bucket_prefix;
    bool sniffed_as_target_document = false;
    if (resp_data.canonical_mime_type ==
            SiteIsolationPolicy::ResponseMetaData::HTML) {
      bucket_prefix = "SiteIsolation.XSD.HTML";
      sniffed_as_target_document = SniffForHTML(data);
    } else if (resp_data.canonical_mime_type ==
                   SiteIsolationPolicy::ResponseMetaData::XML) {
      bucket_prefix = "SiteIsolation.XSD.XML";
      sniffed_as_target_document = SniffForXML(data);
    } else if (resp_data.canonical_mime_type ==
                   SiteIsolationPolicy::ResponseMetaData::JSON) {
      bucket_prefix = "SiteIsolation.XSD.JSON";
      sniffed_as_target_document = SniffForJSON(data);
    } else {
      NOTREACHED() << "Not a blockable mime type: "
                   << resp_data.canonical_mime_type;
    }

    if (sniffed_as_target_document) {
      is_blocked = true;
      HistogramCountBlockedResponse(bucket_prefix, resp_data, false);
    } else {
      if (resp_data.no_sniff) {
        is_blocked = true;
        HistogramCountBlockedResponse(bucket_prefix, resp_data, true);
      } else {
        HistogramCountNotBlockedResponse(bucket_prefix, sniffed_as_js);
      }
    }
  } else {
    // This block is for plain text documents. We apply our HTML, XML,
    // and JSON sniffer to a text document in the order, and block it
    // if any of them succeeds in sniffing.
    std::string bucket_prefix;
    if (SniffForHTML(data))
      bucket_prefix = "SiteIsolation.XSD.Plain.HTML";
    else if (SniffForXML(data))
      bucket_prefix = "SiteIsolation.XSD.Plain.XML";
    else if (SniffForJSON(data))
      bucket_prefix = "SiteIsolation.XSD.Plain.JSON";

    if (bucket_prefix.size() > 0) {
      is_blocked = true;
      HistogramCountBlockedResponse(bucket_prefix, resp_data, false);
    } else if (resp_data.no_sniff) {
      is_blocked = true;
      HistogramCountBlockedResponse("SiteIsolation.XSD.Plain", resp_data, true);
    } else {
      HistogramCountNotBlockedResponse("SiteIsolation.XSD.Plain",
                                       sniffed_as_js);
    }
  }

  if (!CommandLine::ForCurrentProcess()->HasSwitch(
           switches::kBlockCrossSiteDocuments))
    is_blocked = false;
  result_map[request_id] = is_blocked;

  if (is_blocked) {
    alternative_data->erase();
    alternative_data->insert(0, " ");
    LOG(ERROR) << resp_data.response_url
               << " is blocked as an illegal cross-site document from "
               << resp_data.frame_origin;
  }
  return is_blocked;
}

void SiteIsolationPolicy::OnRequestComplete(int request_id) {
  if (!g_policy_enabled)
    return;
  g_metadata_map.Get().erase(request_id);
  g_result_map.Get().erase(request_id);
}

// Maps |mime_type| to the canonical category used by the policy: HTML, Plain,
// JSON or XML. Any MIME type that is not one of the recognized ones maps to
// Others. The match is done with LowerCaseEqualsASCII() against the whole
// string.
SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType
SiteIsolationPolicy::GetCanonicalMimeType(const std::string& mime_type) {
  if (LowerCaseEqualsASCII(mime_type, kTextHtml))
    return ResponseMetaData::HTML;

  if (LowerCaseEqualsASCII(mime_type, kTextPlain))
    return ResponseMetaData::Plain;

  const bool is_json = LowerCaseEqualsASCII(mime_type, kAppJson) ||
                       LowerCaseEqualsASCII(mime_type, kTextJson) ||
                       LowerCaseEqualsASCII(mime_type, kTextXjson);
  if (is_json)
    return ResponseMetaData::JSON;

  const bool is_xml = LowerCaseEqualsASCII(mime_type, kTextXml) ||
                      LowerCaseEqualsASCII(mime_type, xAppRssXml) ||
                      LowerCaseEqualsASCII(mime_type, kAppXml);
  if (is_xml)
    return ResponseMetaData::XML;

  return ResponseMetaData::Others;
}

// Returns true if the scheme of |url| is http or https, the only schemes the
// policy applies to.
bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
  // We exclude ftp:// from here. FTP doesn't provide a Content-Type
  // header which our policy depends on, so we cannot protect any
  // document from FTP servers.
  if (url.SchemeIs("http"))
    return true;
  return url.SchemeIs("https");
}

// Returns true if |frame_origin| and |response_url| belong to the same site:
// both URLs are valid, they use the same scheme, and SameDomainOrHost()
// reports a match for them.
bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
                                     const GURL& response_url) {
  const bool both_valid = frame_origin.is_valid() && response_url.is_valid();
  if (!both_valid || frame_origin.scheme() != response_url.scheme())
    return false;

  // SameDomainOrHost() extracts the effective domains (public suffix plus one)
  // from the two URLs and compares them.
  // TODO(dsjang): use INCLUDE_PRIVATE_REGISTRIES when http://crbug.com/7988 is
  // fixed.
  const bool same_domain_or_host =
      net::registry_controlled_domains::SameDomainOrHost(
          frame_origin,
          response_url,
          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
  return same_domain_or_host;
}

// Returns true if |access_control_origin| (the value of the
// Access-Control-Allow-Origin header) grants the frame access to the
// response: it is either "*" or a URL on the same site as |frame_origin|.
// |website_origin| is currently not consulted.
//
// We don't use WebKit's existing CORS policy implementation since that
// policy works in terms of origins, not sites. For example, WebKit's check
// does not let a frame from sub.a.com access a document whose header names
// sub1.a.com, but under Site Isolation both are on the same site, so the
// access is allowed.
bool SiteIsolationPolicy::IsValidCorsHeaderSet(
    const GURL& frame_origin,
    const GURL& website_origin,
    const std::string& access_control_origin) {
  // Many websites are sending back "\"*\"" instead of "*". This is
  // non-standard practice, and not supported by Chrome. Refer to
  // CrossOriginAccessControl::passesAccessControlCheck().

  // TODO(dsjang): * is not allowed for the response from a request
  // with cookies. This allows for more than what the renderer will
  // eventually be able to receive, so we won't see illegal cross-site
  // documents allowed by this. We have to find a way to see if this
  // response is from a cookie-tagged request or not in the future.
  const bool allows_any_origin = (access_control_origin == "*");

  // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
  // "*", but many websites are using just a domain for access_control_origin,
  // and this is blocked by Webkit's CORS logic here :
  // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
  // is_valid() to false when it is created from a URL containing * in the
  // domain part.
  return allows_any_origin ||
         IsSameSite(frame_origin, GURL(access_control_origin));
}

// This function is a slight modification of |net::SniffForHTML|.
//
// Returns true if |data| looks like an HTML document: after skipping leading
// whitespace and any number of leading, terminated HTML comments
// (<!-- ... -->), the data starts with one of the HTML signatures below.
bool SiteIsolationPolicy::SniffForHTML(StringPiece data) {
  // The content sniffer used by Chrome and Firefox are using "<!--"
  // as one of the HTML signatures, but it also appears in valid
  // JavaScript, considered as well-formed JS by the browser.  Since
  // we do not want to block any JS, we exclude it from our HTML
  // signatures. This can weaken our document block policy, but we can
  // break less websites.
  // TODO(dsjang): parameterize |net::SniffForHTML| with an option
  // that decides whether to include <!-- or not, so that we can
  // remove this function.
  // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser
  // process, we should do single-thread checking here for the static
  // initializer.
  //
  // Keep every signature in lowercase: the sniffed data is lowercased for
  // the comparison, so an uppercase letter in a signature may never match.
  static const StringPiece kHtmlSignatures[] = {
    StringPiece("<!doctype html"),  // HTML5 spec
    StringPiece("<script"),  // HTML5 spec, Mozilla
    StringPiece("<html"),    // HTML5 spec, Mozilla
    StringPiece("<head"),    // HTML5 spec, Mozilla
    StringPiece("<iframe"),  // Mozilla
    StringPiece("<h1"),      // Mozilla
    StringPiece("<div"),     // Mozilla
    StringPiece("<font"),    // Mozilla
    StringPiece("<table"),   // Mozilla
    StringPiece("<a"),       // Mozilla
    StringPiece("<style"),   // Mozilla
    StringPiece("<title"),   // Mozilla
    StringPiece("<b"),       // Mozilla
    StringPiece("<body"),    // Mozilla
    StringPiece("<br"),      // Mozilla
    StringPiece("<p"),       // Mozilla
    StringPiece("<?xml")     // Mozilla
  };
  static const StringPiece kCommentBegins[] = { StringPiece("<!--") };
  static const char kEndComment[] = "-->";
  // Length of kEndComment without its terminating NUL.
  const size_t kEndCommentLength = arraysize(kEndComment) - 1;

  while (data.length() > 0) {
    if (MatchesSignature(
          data, kHtmlSignatures, arraysize(kHtmlSignatures)))
      return true;

    // If we cannot find "<!--", we fail sniffing this as HTML.
    if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
      break;

    // Search for --> and do SniffForHTML after that. If we can find the
    // comment's end, we start HTML sniffing from there again. An
    // unterminated comment fails the sniffing.
    const size_t offset = data.find(kEndComment);
    if (offset == base::StringPiece::npos)
      break;

    // Proceed to the index next to the ending comment (-->).
    data.remove_prefix(offset + kEndCommentLength);
  }

  return false;
}

// Returns true if |data|, ignoring leading whitespace, starts with the XML
// declaration signature "<?xml" as judged by MatchesSignature().
bool SiteIsolationPolicy::SniffForXML(base::StringPiece data) {
  // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
  // this signature. However, XML is case-sensitive. Don't we have to
  // be more lenient only to block documents starting with the exact
  // string <?xml rather than <?XML ?
  // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser
  // process, we should do single-thread checking here for the static
  // initializer.
  static const StringPiece kXmlSignatures[] = { StringPiece("<?xml") };
  const size_t signature_count = arraysize(kXmlSignatures);
  const bool looks_like_xml =
      MatchesSignature(data, kXmlSignatures, signature_count);
  return looks_like_xml;
}

// Returns true if |data| looks like a JSON object. The heuristic accepts data
// whose first non-whitespace character is '{', whose next non-whitespace
// character is a double or single quote, and which contains a ':' somewhere
// after that quote. Whitespace means space, tab, CR and LF.
bool SiteIsolationPolicy::SniffForJSON(base::StringPiece data) {
  // TODO(dsjang): We have to come up with a better way to sniff
  // JSON. However, even RE cannot help us that much due to the fact
  // that we don't do full parsing. We look for {, "/' and : in that
  // order. We're avoiding adding a dependency on a regular expression
  // library.
  static const char kWhitespace[] = " \t\r\n";

  // The first significant character has to open an object.
  const size_t brace_pos = data.find_first_not_of(kWhitespace);
  if (brace_pos == base::StringPiece::npos || data[brace_pos] != '{')
    return false;

  // The next significant character has to open a quoted key.
  const size_t quote_pos = data.find_first_not_of(kWhitespace, brace_pos + 1);
  if (quote_pos == base::StringPiece::npos)
    return false;
  const char quote = data[quote_pos];
  if (quote != '\"' && quote != '\'')
    return false;

  // Anything may follow the quote as long as a colon shows up eventually.
  return data.find(':', quote_pos + 1) != base::StringPiece::npos;
}

// Returns true if |data| contains the substring "var " anywhere, which is
// taken as a hint that the data may be JavaScript.
bool SiteIsolationPolicy::SniffForJS(StringPiece data) {
  // TODO(dsjang): This is a real hack. The only purpose of this function is to
  // try to see if there's any possibility that this data can be JavaScript
  // (superset of JS). This function will be removed once UMA stats are
  // gathered.
  const size_t var_keyword_pos = data.find("var ");
  return var_keyword_pos != base::StringPiece::npos;
}

}  // namespace content