// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/hash_tables.h"
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/string_util.h"
#include "base/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "content/public/renderer/render_view.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "third_party/WebKit/Source/Platform/chromium/public/WebString.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h"

namespace safe_browsing {

// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;

// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;

// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;

// Intermediate state used for computing features.  See features.h for
// descriptions of the DOM features that are computed.
struct PhishingDOMFeatureExtractor::PageFeatureState {
  // Link related features
  int external_links;
  base::hash_set<std::string> external_domains;
  int secure_links;
  int total_links;

  // Form related features
  int num_forms;
  int num_text_inputs;
  int num_pswd_inputs;
  int num_radio_inputs;
  int num_check_inputs;
  int action_other_domain;
  int total_actions;

  // Image related features
  int img_other_domain;
  int total_imgs;

  // How many script tags
  int num_script_tags;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  explicit PageFeatureState(base::TimeTicks start_time_ticks)
      : external_links(0),
        secure_links(0),
        total_links(0),
        num_forms(0),
        num_text_inputs(0),
        num_pswd_inputs(0),
        num_radio_inputs(0),
        num_check_inputs(0),
        action_other_domain(0),
        total_actions(0),
        img_other_domain(0),
        total_imgs(0),
        num_script_tags(0),
        start_time(start_time_ticks),
        num_iterations(0) {}

  ~PageFeatureState() {}
};

// Per-frame state
struct PhishingDOMFeatureExtractor::FrameData {
  // This is our reference to document.all, which is an iterator over all
  // of the elements in the document.  It keeps track of our current position.
  WebKit::WebNodeCollection elements;
  // The domain of the document URL, stored here so that we don't need to
  // recompute it every time it's needed.
  std::string domain;
};

PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
    content::RenderView* render_view,
    FeatureExtractorClock* clock)
    : render_view_(render_view),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(weak_factory_(this)) {
  Clear();
}

PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}

void PhishingDOMFeatureExtractor::ExtractFeatures(
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  features_ = features;
  done_callback_ = done_callback;

  page_feature_state_.reset(new PageFeatureState(clock_->Now()));
  WebKit::WebView* web_view = render_view_->GetWebView();
  if (web_view && web_view->mainFrame()) {
    cur_document_ = web_view->mainFrame()->document();
  }

  MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}

void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}

void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(page_feature_state_.get());
  ++page_feature_state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (cur_document_.isNull()) {
    // This will only happen if we weren't able to get the document for the
    // main frame.  We'll treat this as an extraction failure.
    RunCallback(false);
    return;
  }

  int num_elements = 0;
  for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
    WebKit::WebNode cur_node;
    if (cur_frame_data_.get()) {
      // We're resuming traversal of a frame, so just advance to the next node.
      cur_node = cur_frame_data_->elements.nextItem();
      // When we resume the traversal, the first call to nextItem() potentially
      // has to walk through the document again from the beginning, if it was
      // modified between our chunks of work.  Log how long this takes, so we
      // can tell if it's too slow.
      UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
                          clock_->Now() - current_chunk_start_time);
    } else {
      // We just moved to a new frame, so update our frame state
      // and advance to the first element.
      ResetFrameData();
      cur_node = cur_frame_data_->elements.firstItem();
    }

    for (; !cur_node.isNull();
         cur_node = cur_frame_data_->elements.nextItem()) {
      if (!cur_node.isElementNode()) {
        continue;
      }
      WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
      if (element.hasTagName("a")) {
        HandleLink(element);
      } else if (element.hasTagName("form")) {
        HandleForm(element);
      } else if (element.hasTagName("img")) {
        HandleImage(element);
      } else if (element.hasTagName("input")) {
        HandleInput(element);
      } else if (element.hasTagName("script")) {
        HandleScript(element);
      }

      if (++num_elements >= kClockCheckGranularity) {
        num_elements = 0;
        base::TimeTicks now = clock_->Now();
        if (now - page_feature_state_->start_time >=
            base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
          DLOG(ERROR) << "Feature extraction took too long, giving up";
          // We expect this to happen infrequently, so record when it does.
          UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
          RunCallback(false);
          return;
        }
        base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
        if (chunk_elapsed >=
            base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
          // The time limit for the current chunk is up, so post a task to
          // continue extraction.
          //
          // Record how much time we actually spent on the chunk. If this is
          // much higher than kMaxTimePerChunkMs, we may need to adjust the
          // clock granularity.
          UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
                              chunk_elapsed);
          MessageLoop::current()->PostTask(
              FROM_HERE,
              base::Bind(
                  &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
                  weak_factory_.GetWeakPtr()));
          return;
        }
        // Otherwise, continue.
      }
    }

    // We're done with this frame, recalculate the FrameData when we
    // advance to the next frame.
    cur_frame_data_.reset();
  }

  InsertFeatures();
  RunCallback(true);
}

void PhishingDOMFeatureExtractor::HandleLink(
    const WebKit::WebElement& element) {
  // Count the number of times we link to a different host.
  if (!element.hasAttribute("href")) {
    DVLOG(1) << "Skipping anchor tag with no href";
    return;
  }

  // Retrieve the link and resolve the link in case it's relative.
  WebKit::WebURL full_url = element.document().completeURL(
      element.getAttribute("href"));

  std::string domain;
  bool is_external = IsExternalDomain(full_url, &domain);
  if (domain.empty()) {
    DVLOG(1) << "Could not extract domain from link: " << full_url;
    return;
  }

  if (is_external) {
    ++page_feature_state_->external_links;

    // Record each unique domain that we link to.
    page_feature_state_->external_domains.insert(domain);
  }

  // Check how many are https links.
  if (GURL(full_url).SchemeIs("https")) {
    ++page_feature_state_->secure_links;
  }

  ++page_feature_state_->total_links;
}

void PhishingDOMFeatureExtractor::HandleForm(
    const WebKit::WebElement& element) {
  // Increment the number of forms on this page.
  ++page_feature_state_->num_forms;

  // Record whether the action points to a different domain.
  if (!element.hasAttribute("action")) {
    return;
  }

  WebKit::WebURL full_url = element.document().completeURL(
      element.getAttribute("action"));

  std::string domain;
  bool is_external = IsExternalDomain(full_url, &domain);
  if (domain.empty()) {
    DVLOG(1) << "Could not extract domain from form action: " << full_url;
    return;
  }

  if (is_external) {
    ++page_feature_state_->action_other_domain;
  }
  ++page_feature_state_->total_actions;
}

void PhishingDOMFeatureExtractor::HandleImage(
    const WebKit::WebElement& element) {
  if (!element.hasAttribute("src")) {
    DVLOG(1) << "Skipping img tag with no src";
  }

  // Record whether the image points to a different domain.
  WebKit::WebURL full_url = element.document().completeURL(
      element.getAttribute("src"));
  std::string domain;
  bool is_external = IsExternalDomain(full_url, &domain);
  if (domain.empty()) {
    DVLOG(1) << "Could not extract domain from image src: " << full_url;
    return;
  }

  if (is_external) {
    ++page_feature_state_->img_other_domain;
  }
  ++page_feature_state_->total_imgs;
}

void PhishingDOMFeatureExtractor::HandleInput(
    const WebKit::WebElement& element) {
  // The HTML spec says that if the type is unspecified, it defaults to text.
  // In addition, any unrecognized type will be treated as a text input.
  //
  // Note that we use the attribute value rather than
  // WebFormControlElement::formControlType() for consistency with the
  // way the phishing classification model is created.
  std::string type = element.getAttribute("type").utf8();
  StringToLowerASCII(&type);
  if (type == "password") {
    ++page_feature_state_->num_pswd_inputs;
  } else if (type == "radio") {
    ++page_feature_state_->num_radio_inputs;
  } else if (type == "checkbox") {
    ++page_feature_state_->num_check_inputs;
  } else if (type != "submit" && type != "reset" && type != "file" &&
             type != "hidden" && type != "image" && type != "button") {
    // Note that there are a number of new input types in HTML5 that are not
    // handled above.  For now, we will consider these as text inputs since
    // they could be used to capture user input.
    ++page_feature_state_->num_text_inputs;
  }
}

void PhishingDOMFeatureExtractor::HandleScript(
    const WebKit::WebElement& element) {
  ++page_feature_state_->num_script_tags;
}

void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!cur_frame_data_.get());
  DCHECK(cur_document_.isNull());
  if (!done_callback_.is_null() || cur_frame_data_.get() ||
      !cur_document_.isNull()) {
    LOG(ERROR) << "Extraction in progress, missing call to "
               << "CancelPendingExtraction";
  }
}

void PhishingDOMFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance.  These include both successful and failed extractions.
  DCHECK(page_feature_state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
                       page_feature_state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
                      clock_->Now() - page_feature_state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  Clear();
}

void PhishingDOMFeatureExtractor::Clear() {
  features_ = NULL;
  done_callback_.Reset();
  cur_frame_data_.reset(NULL);
  cur_document_.reset();
}

void PhishingDOMFeatureExtractor::ResetFrameData() {
  DCHECK(!cur_document_.isNull());
  DCHECK(!cur_frame_data_.get());

  cur_frame_data_.reset(new FrameData());
  cur_frame_data_->elements = cur_document_.all();
  cur_frame_data_->domain =
      net::RegistryControlledDomainService::GetDomainAndRegistry(
          cur_document_.url());
}

WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
  DCHECK(!cur_document_.isNull());
  WebKit::WebFrame* frame = cur_document_.frame();
  // Advance to the next frame that contains a document, with no wrapping.
  if (frame) {
    while ((frame = frame->traverseNext(false))) {
      if (!frame->document().isNull()) {
        return frame->document();
      }
    }
  } else {
    // Keep track of how often frame traversal got "stuck" due to the
    // current subdocument getting removed from the frame tree.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
  }
  return WebKit::WebDocument();
}

bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
                                                   std::string* domain) const {
  DCHECK(domain);
  DCHECK(cur_frame_data_.get());

  if (cur_frame_data_->domain.empty()) {
    return false;
  }

  // TODO(bryner): Ensure that the url encoding is consistent with the features
  // in the model.
  if (url.HostIsIPAddress()) {
    domain->assign(url.host());
  } else {
    domain->assign(net::RegistryControlledDomainService::GetDomainAndRegistry(
        url));
  }

  return !domain->empty() && *domain != cur_frame_data_->domain;
}

void PhishingDOMFeatureExtractor::InsertFeatures() {
  DCHECK(page_feature_state_.get());

  if (page_feature_state_->total_links > 0) {
    // Add a feature for the fraction of times the page links to an external
    // domain vs. an internal domain.
    double link_freq = static_cast<double>(
        page_feature_state_->external_links) /
        page_feature_state_->total_links;
    features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);

    // Add a feature for each unique domain that we're linking to
    for (base::hash_set<std::string>::iterator it =
             page_feature_state_->external_domains.begin();
         it != page_feature_state_->external_domains.end(); ++it) {
      features_->AddBooleanFeature(features::kPageLinkDomain + *it);
    }

    // Fraction of links that use https.
    double secure_freq = static_cast<double>(
        page_feature_state_->secure_links) / page_feature_state_->total_links;
    features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
  }

  // Record whether forms appear and whether various form elements appear.
  if (page_feature_state_->num_forms > 0) {
    features_->AddBooleanFeature(features::kPageHasForms);
  }
  if (page_feature_state_->num_text_inputs > 0) {
    features_->AddBooleanFeature(features::kPageHasTextInputs);
  }
  if (page_feature_state_->num_pswd_inputs > 0) {
    features_->AddBooleanFeature(features::kPageHasPswdInputs);
  }
  if (page_feature_state_->num_radio_inputs > 0) {
    features_->AddBooleanFeature(features::kPageHasRadioInputs);
  }
  if (page_feature_state_->num_check_inputs > 0) {
    features_->AddBooleanFeature(features::kPageHasCheckInputs);
  }

  // Record fraction of form actions that point to a different domain.
  if (page_feature_state_->total_actions > 0) {
    double action_freq = static_cast<double>(
        page_feature_state_->action_other_domain) /
        page_feature_state_->total_actions;
    features_->AddRealFeature(features::kPageActionOtherDomainFreq,
                              action_freq);
  }

  // Record how many image src attributes point to a different domain.
  if (page_feature_state_->total_imgs > 0) {
    double img_freq = static_cast<double>(
        page_feature_state_->img_other_domain) /
        page_feature_state_->total_imgs;
    features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
  }

  // Record number of script tags (discretized for numerical stability.)
  if (page_feature_state_->num_script_tags > 1) {
    features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
    if (page_feature_state_->num_script_tags > 6) {
      features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
    }
  }
}

}  // namespace safe_browsing