summaryrefslogtreecommitdiffstats
path: root/chrome/renderer
diff options
context:
space:
mode:
authorbryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-07-29 00:37:45 +0000
committerbryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-07-29 00:37:45 +0000
commitf4dafe029aab967bbc6ec5ad28c9f928280367f3 (patch)
tree21233b5642296b8ba402c6a1b771cce64d9d37d1 /chrome/renderer
parent92608249eb322c9075f147c2dea302fb1c65acb1 (diff)
downloadchromium_src-f4dafe029aab967bbc6ec5ad28c9f928280367f3.zip
chromium_src-f4dafe029aab967bbc6ec5ad28c9f928280367f3.tar.gz
chromium_src-f4dafe029aab967bbc6ec5ad28c9f928280367f3.tar.bz2
Add an extractor for DOM features to be used for client side phishing detection.
PhishingDOMFeatureExtractor iterates over the page elements and computes a number of features. To avoid blocking the renderer for too long, the extractor may run in several chunks of work, posting a task to continue processing if necessary. This CL only includes the feature extraction itself. I will add the logic to cap the time per iteration in a follow-up CL. BUG=none TEST=PhishingDOMFeatureExtractorTest Review URL: http://codereview.chromium.org/2878046 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54082 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer')
-rw-r--r--chrome/renderer/safe_browsing/features.cc35
-rw-r--r--chrome/renderer/safe_browsing/features.h55
-rw-r--r--chrome/renderer/safe_browsing/features_unittest.cc17
-rw-r--r--chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc416
-rw-r--r--chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h128
-rw-r--r--chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc410
6 files changed, 1060 insertions, 1 deletions
diff --git a/chrome/renderer/safe_browsing/features.cc b/chrome/renderer/safe_browsing/features.cc
index 47a093c..4d67cf3 100644
--- a/chrome/renderer/safe_browsing/features.cc
+++ b/chrome/renderer/safe_browsing/features.cc
@@ -15,6 +15,10 @@ FeatureMap::FeatureMap() {}
FeatureMap::~FeatureMap() {}
bool FeatureMap::AddBooleanFeature(const std::string& name) {
+ return AddRealFeature(name, 1.0);  // A boolean feature is stored as the real value 1.0.
+}
+
+bool FeatureMap::AddRealFeature(const std::string& name, double value) {
if (features_.size() >= kMaxFeatureMapSize) {
// If we hit this case, it indicates that either kMaxFeatureMapSize is
// too small, or there is a bug causing too many features to be added.
@@ -25,7 +29,16 @@ bool FeatureMap::AddBooleanFeature(const std::string& name) {
UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1);
return false;
}
- features_[name] = 1.0;
+ // We only expect features in the range [0.0, 1.0], so fail if the feature is
+ // outside this range.
+ if (value < 0.0 || value > 1.0) {
+ LOG(ERROR) << "Not adding feature: " << name << " because the value "
+ << value << " is not in the range [0.0, 1.0].";
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.IllegalFeatureValue", 1);
+ return false;
+ }
+
+ features_[name] = value;
return true;
}
@@ -47,5 +60,25 @@ const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3";
// URL path features
const char kUrlPathToken[] = "UrlPathToken=";
+// DOM HTML form features
+const char kPageHasForms[] = "PageHasForms";
+const char kPageActionOtherDomainFreq[] = "PageActionOtherDomainFreq";
+const char kPageHasTextInputs[] = "PageHasTextInputs";
+const char kPageHasPswdInputs[] = "PageHasPswdInputs";
+const char kPageHasRadioInputs[] = "PageHasRadioInputs";
+const char kPageHasCheckInputs[] = "PageHasCheckInputs";
+
+// DOM HTML link features
+const char kPageExternalLinksFreq[] = "PageExternalLinksFreq";
+const char kPageLinkDomain[] = "PageLinkDomain=";
+const char kPageSecureLinksFreq[] = "PageSecureLinksFreq";
+
+// DOM HTML script features
+const char kPageNumScriptTagsGTOne[] = "PageNumScriptTags>1";
+const char kPageNumScriptTagsGTSix[] = "PageNumScriptTags>6";
+
+// Other DOM HTML features
+const char kPageImgOtherDomainFreq[] = "PageImgOtherDomainFreq";
+
} // namespace features
} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h
index 1a82c61..f3c8348 100644
--- a/chrome/renderer/safe_browsing/features.h
+++ b/chrome/renderer/safe_browsing/features.h
@@ -44,6 +44,12 @@ class FeatureMap {
// kMaxFeatureMapSize.
bool AddBooleanFeature(const std::string& name);
+ // Adds a real-valued feature to a FeatureMap with the given value.
+ // Values must always be in the range [0.0, 1.0]. Returns true on
+ // success, or false if the feature map exceeds kMaxFeatureMapSize
+ // or the value is outside of the allowed range.
+ bool AddRealFeature(const std::string& name, double value);
+
// Provides read-only access to the current set of features.
const base::hash_map<std::string, double>& features() const {
return features_;
@@ -103,6 +109,55 @@ extern const char kUrlNumOtherHostTokensGTThree[];
// token features, "abc" and "efg". Query parameters are not included.
extern const char kUrlPathToken[];
+////////////////////////////////////////////////////
+// DOM HTML form features
+////////////////////////////////////////////////////
+
+// Set if the page has any <form> elements.
+extern const char kPageHasForms[];
+// The fraction of form elements whose |action| attribute points to a
+// URL on a different domain from the document URL.
+extern const char kPageActionOtherDomainFreq[];
+
+// Set if the page has any <input type="text"> elements
+// (includes inputs with missing or unknown types).
+extern const char kPageHasTextInputs[];
+// Set if the page has any <input type="password"> elements.
+extern const char kPageHasPswdInputs[];
+// Set if the page has any <input type="radio"> elements.
+extern const char kPageHasRadioInputs[];
+// Set if the page has any <input type="checkbox"> elements.
+extern const char kPageHasCheckInputs[];
+
+////////////////////////////////////////////////////
+// DOM HTML link features
+////////////////////////////////////////////////////
+
+// The fraction of links in the page which point to a domain other than the
+// domain of the document. See "URL host features" above for a discussion
+// of how the domain is computed.
+extern const char kPageExternalLinksFreq[];
+// Token feature containing each external domain that is linked to.
+extern const char kPageLinkDomain[];
+// Fraction of links in the page that use https.
+extern const char kPageSecureLinksFreq[];
+
+////////////////////////////////////////////////////
+// DOM HTML script features
+////////////////////////////////////////////////////
+
+// Set if the number of <script> elements in the page is greater than 1.
+extern const char kPageNumScriptTagsGTOne[];
+// Set if the number of <script> elements in the page is greater than 6.
+extern const char kPageNumScriptTagsGTSix[];
+
+////////////////////////////////////////////////////
+// Other DOM HTML features
+////////////////////////////////////////////////////
+
+// The fraction of images whose src attribute points to an external domain.
+extern const char kPageImgOtherDomainFreq[];
+
} // namespace features
} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/features_unittest.cc b/chrome/renderer/safe_browsing/features_unittest.cc
index ad07ba2..ac5cb55 100644
--- a/chrome/renderer/safe_browsing/features_unittest.cc
+++ b/chrome/renderer/safe_browsing/features_unittest.cc
@@ -6,6 +6,7 @@
#include "base/format_macros.h"
#include "base/string_util.h"
+#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace safe_browsing {
@@ -24,4 +25,20 @@ TEST(PhishingFeaturesTest, TooManyFeatures) {
EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
}
+TEST(PhishingFeaturesTest, IllegalFeatureValue) {
+ FeatureMap features;
+ EXPECT_FALSE(features.AddRealFeature("toosmall", -0.1));  // Below 0.0: rejected.
+ EXPECT_TRUE(features.AddRealFeature("zero", 0.0));  // Lower bound is inclusive.
+ EXPECT_TRUE(features.AddRealFeature("pointfive", 0.5));
+ EXPECT_TRUE(features.AddRealFeature("one", 1.0));  // Upper bound is inclusive.
+ EXPECT_FALSE(features.AddRealFeature("toolarge", 1.1));  // Above 1.0: rejected.
+
+ FeatureMap expected_features;  // Only the three accepted features should remain.
+ expected_features.AddRealFeature("zero", 0.0);
+ expected_features.AddRealFeature("pointfive", 0.5);
+ expected_features.AddRealFeature("one", 1.0);
+ EXPECT_THAT(features.features(),
+ ::testing::ContainerEq(expected_features.features()));
+}
+
} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
new file mode 100644
index 0000000..c8f4bd0
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
@@ -0,0 +1,416 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
+
+#include "base/compiler_specific.h"
+#include "base/hash_tables.h"
+#include "base/histogram.h"
+#include "base/logging.h"
+#include "chrome/renderer/render_view.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "net/base/registry_controlled_domain.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebElement.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebString.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebView.h"
+
+namespace safe_browsing {
+
+// Running totals accumulated by the Handle*() methods while elements are
+// visited; InsertFeatures() converts them into the features in features.h.
+struct PhishingDOMFeatureExtractor::PageFeatureState {
+ // Link (<a href>) totals, updated by HandleLink().
+ int external_links;  // Links whose domain differs from the frame's domain.
+ base::hash_set<std::string> external_domains;  // Distinct external domains.
+ int secure_links;  // Links whose resolved URL uses the https scheme.
+ int total_links;  // Links for which a domain could be extracted.
+
+ // Form and input totals, updated by HandleForm() and HandleInput().
+ int num_forms;  // Every <form>, with or without an action attribute.
+ int num_text_inputs;  // Inputs not recognized as one of the other types.
+ int num_pswd_inputs;
+ int num_radio_inputs;
+ int num_check_inputs;
+ int action_other_domain;  // Form actions pointing at another domain.
+ int total_actions;  // Form actions for which a domain could be extracted.
+
+ // Image (<img>) totals, updated by HandleImage().
+ int img_other_domain;
+ int total_imgs;
+
+ // Number of <script> elements seen, updated by HandleScript().
+ int num_script_tags;
+
+ PageFeatureState()
+ : external_links(0),
+ secure_links(0),
+ total_links(0),
+ num_forms(0),
+ num_text_inputs(0),
+ num_pswd_inputs(0),
+ num_radio_inputs(0),
+ num_check_inputs(0),
+ action_other_domain(0),
+ total_actions(0),
+ img_other_domain(0),
+ total_imgs(0),
+ num_script_tags(0) {}
+
+ ~PageFeatureState() {}
+};
+
+// Per-frame state, created by ResetFrameData() for the frame being visited.
+struct PhishingDOMFeatureExtractor::FrameData {
+ // This is our reference to document.all, which is an iterator over all
+ // of the elements in the document. It keeps track of our current position.
+ WebKit::WebNodeCollection elements;
+ // The registry-controlled domain of the frame's URL, cached here so that
+ // IsExternalDomain() does not have to recompute it for every element.
+ std::string domain;
+};
+
+PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
+ RenderView* render_view)
+ : render_view_(render_view),
+ ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
+ Clear();  // Start with no extraction in progress (all state pointers NULL).
+}
+
+PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
+ // The RenderView should have called CancelPendingExtraction() before we
+ // are destroyed; this asserts (debug) or logs an error if it did not.
+ CheckNoPendingExtraction();
+}
+
+void PhishingDOMFeatureExtractor::ExtractFeatures(
+    FeatureMap* features,
+    DoneCallback* done_callback) {
+  // Callers are expected to cancel any in-flight extraction before starting
+  // a new one.  Debug builds assert on that; other builds recover by throwing
+  // away the stale state so that we begin from a known-clean slate.
+  CheckNoPendingExtraction();
+  CancelPendingExtraction();
+
+  // Remember where the results go and whom to notify.  The callback is owned
+  // by this object from here on; the feature map stays owned by the caller.
+  features_ = features;
+  done_callback_.reset(done_callback);
+
+  // The real work happens asynchronously on the current message loop.
+  Task* extraction_task = method_factory_.NewRunnableMethod(
+      &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout);
+  MessageLoop::current()->PostTask(FROM_HERE, extraction_task);
+}
+
+void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
+ // Revoke any posted-but-not-yet-run extraction tasks, then drop our state.
+ method_factory_.RevokeAll();
+ Clear();  // Also deletes the done callback without running it.
+}
+
+void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
+ if (!cur_frame_) {  // No frame yet: this is the start of a new extraction.
+ WebKit::WebView* web_view = render_view_->webview();
+ if (!web_view) {
+ // When the WebView is going away, the render view should have called
+ // CancelPendingExtraction() which should have stopped any pending work,
+ // so this case should not happen. If it does, report failure.
+ NOTREACHED();
+ RunCallback(false);
+ return;
+ }
+ cur_frame_ = web_view->mainFrame();
+ page_feature_state_.reset(new PageFeatureState);  // Fresh, zeroed totals.
+ }
+
+ for (; cur_frame_;
+ cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
+ WebKit::WebNode cur_node;
+ if (cur_frame_data_.get()) {
+ // Frame data already exists, so we are resuming this frame: step past
+ // the node we stopped at.
+ cur_node = cur_frame_data_->elements.nextItem();
+ } else {
+ // We just moved to a new frame, so update our frame state
+ // and advance to the first element.
+ if (!ResetFrameData()) {
+ // The frame has no document; move on to the next frame.
+ LOG(WARNING) << "No content in frame, skipping";
+ continue;
+ }
+ cur_node = cur_frame_data_->elements.firstItem();
+ }
+
+ for (; !cur_node.isNull();
+ cur_node = cur_frame_data_->elements.nextItem()) {
+ if (!cur_node.isElementNode()) {
+ continue;
+ }
+ WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
+ if (element.hasTagName("a")) {  // Dispatch on tag; other tags are ignored.
+ HandleLink(element);
+ } else if (element.hasTagName("form")) {
+ HandleForm(element);
+ } else if (element.hasTagName("img")) {
+ HandleImage(element);
+ } else if (element.hasTagName("input")) {
+ HandleInput(element);
+ } else if (element.hasTagName("script")) {
+ HandleScript(element);
+ }
+
+ // TODO(bryner): stop if too much time has elapsed, and add histograms
+ // for the time spent processing.
+ }
+
+ // We're done with this frame; dropping the FrameData makes the next
+ // iteration call ResetFrameData() for the frame we advance to.
+ cur_frame_data_.reset();
+ }
+
+ InsertFeatures();  // All frames visited: turn the totals into features.
+ RunCallback(true);
+}
+
+void PhishingDOMFeatureExtractor::HandleLink(
+    const WebKit::WebElement& element) {
+  // Anchors without an href contribute nothing to the link statistics.
+  if (!element.hasAttribute("href")) {
+    DLOG(INFO) << "Skipping anchor tag with no href";
+    return;
+  }
+
+  // Resolve the href against the document so relative links become absolute.
+  WebKit::WebURL link_url =
+      element.document().completeURL(element.getAttribute("href"));
+
+  std::string link_domain;
+  const bool points_elsewhere = IsExternalDomain(link_url, &link_domain);
+  if (link_domain.empty()) {
+    LOG(ERROR) << "Could not extract domain from link: " << link_url;
+    return;
+  }
+
+  // Only links with a usable domain are counted in the totals below.
+  ++page_feature_state_->total_links;
+
+  // Track how many of those links use https.
+  if (GURL(link_url).SchemeIs("https")) {
+    ++page_feature_state_->secure_links;
+  }
+
+  // Track links leaving the frame's domain, and which domains they go to.
+  if (points_elsewhere) {
+    ++page_feature_state_->external_links;
+    page_feature_state_->external_domains.insert(link_domain);
+  }
+}
+
+void PhishingDOMFeatureExtractor::HandleForm(
+    const WebKit::WebElement& element) {
+  // Every form counts toward num_forms, whether or not it has an action.
+  ++page_feature_state_->num_forms;
+
+  // The action statistics only cover forms that specify an action.
+  if (element.hasAttribute("action")) {
+    WebKit::WebURL action_url =
+        element.document().completeURL(element.getAttribute("action"));
+
+    std::string action_domain;
+    const bool posts_elsewhere = IsExternalDomain(action_url, &action_domain);
+    if (action_domain.empty()) {
+      LOG(ERROR) << "Could not extract domain from form action: "
+                 << action_url;
+      return;
+    }
+
+    ++page_feature_state_->total_actions;
+    if (posts_elsewhere) {
+      ++page_feature_state_->action_other_domain;
+    }
+  }
+}
+
+void PhishingDOMFeatureExtractor::HandleImage(
+    const WebKit::WebElement& element) {
+  // Images without a src attribute are skipped entirely, mirroring how
+  // HandleLink() treats anchors without an href.  Previously the missing
+  // return let such images fall through and be counted as if their src were
+  // the document's own URL, inflating total_imgs.
+  if (!element.hasAttribute("src")) {
+    DLOG(INFO) << "Skipping img tag with no src";
+    return;
+  }
+
+  // Resolve the src against the document and record whether the image is
+  // served from a different domain than the current frame.
+  WebKit::WebURL full_url = element.document().completeURL(
+      element.getAttribute("src"));
+  std::string domain;
+  bool is_external = IsExternalDomain(full_url, &domain);
+  if (domain.empty()) {
+    LOG(ERROR) << "Could not extract domain from image src: " << full_url;
+    return;
+  }
+
+  if (is_external) {
+    ++page_feature_state_->img_other_domain;
+  }
+  ++page_feature_state_->total_imgs;
+}
+
+void PhishingDOMFeatureExtractor::HandleInput(
+    const WebKit::WebElement& element) {
+  // Classification is based on the raw |type| attribute (lower-cased) rather
+  // than WebFormControlElement::formControlType(), for consistency with the
+  // way the phishing classification model is created.
+  std::string input_type = element.getAttribute("type").utf8();
+  StringToLowerASCII(&input_type);
+
+  if (input_type == "password") {
+    ++page_feature_state_->num_pswd_inputs;
+    return;
+  }
+  if (input_type == "radio") {
+    ++page_feature_state_->num_radio_inputs;
+    return;
+  }
+  if (input_type == "checkbox") {
+    ++page_feature_state_->num_check_inputs;
+    return;
+  }
+
+  // These control types are known not to capture free-form user input.
+  const bool is_non_text_control =
+      input_type == "submit" || input_type == "reset" ||
+      input_type == "file" || input_type == "hidden" ||
+      input_type == "image" || input_type == "button";
+
+  // Everything else is treated as a text input: per the HTML spec a missing
+  // type defaults to text, and unrecognized types (including the newer HTML5
+  // ones not listed above) could still be used to capture user input.
+  if (!is_non_text_control) {
+    ++page_feature_state_->num_text_inputs;
+  }
+}
+
+void PhishingDOMFeatureExtractor::HandleScript(
+ const WebKit::WebElement& element) {
+ ++page_feature_state_->num_script_tags;  // Only the count matters; |element| is not inspected.
+}
+
+void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
+  // Debug builds assert on each piece of state separately so that a failure
+  // pinpoints exactly what was left behind.
+  DCHECK(!done_callback_.get());
+  DCHECK(!cur_frame_data_.get());
+  DCHECK(!cur_frame_);
+
+  const bool extraction_pending =
+      done_callback_.get() || cur_frame_data_.get() || cur_frame_;
+  if (!extraction_pending) {
+    return;
+  }
+
+  // Builds without DCHECKs get here; leave a trace of the misuse in the log.
+  LOG(ERROR) << "Extraction in progress, missing call to "
+             << "CancelPendingExtraction";
+}
+
+void PhishingDOMFeatureExtractor::RunCallback(bool success) {
+ DCHECK(done_callback_.get());
+ done_callback_->Run(success);  // Report the outcome of the extraction.
+ Clear();  // Deletes the callback and resets all other extraction state.
+}
+
+void PhishingDOMFeatureExtractor::Clear() {
+ features_ = NULL;  // Non-owned pointer; just forget it.
+ done_callback_.reset(NULL);  // Deletes the owned callback, if any, without running it.
+ cur_frame_data_.reset(NULL);
+ cur_frame_ = NULL;  // A NULL frame is how a new extraction knows to start at the main frame.
+}
+
+bool PhishingDOMFeatureExtractor::ResetFrameData() {
+  // Must only be called with a current frame and no leftover frame data.
+  DCHECK(cur_frame_);
+  DCHECK(!cur_frame_data_.get());
+
+  // A frame without a document has nothing to iterate over.
+  WebKit::WebDocument document = cur_frame_->document();
+  if (document.isNull()) {
+    return false;
+  }
+
+  // Hand ownership to the scoped_ptr first, then fill the data in through a
+  // plain alias.
+  FrameData* frame_data = new FrameData();
+  cur_frame_data_.reset(frame_data);
+
+  // document.all is the element iterator for this frame.
+  frame_data->elements = document.all();
+  // Cache the frame's own domain for the IsExternalDomain() comparisons.
+  frame_data->domain =
+      net::RegistryControlledDomainService::GetDomainAndRegistry(
+          cur_frame_->url());
+  return true;
+}
+
+bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
+                                                   std::string* domain) const {
+  DCHECK(domain);
+  DCHECK(cur_frame_data_.get());
+
+  // Without a domain for the current frame there is nothing to compare
+  // against; |domain| is left untouched in that case.
+  const std::string& frame_domain = cur_frame_data_->domain;
+  if (frame_domain.empty()) {
+    return false;
+  }
+
+  // TODO(bryner): Ensure that the url encoding is consistent with the features
+  // in the model.
+  // IP-address hosts are used verbatim; for everything else the
+  // registry-controlled domain is used.
+  *domain = url.HostIsIPAddress()
+      ? url.host()
+      : net::RegistryControlledDomainService::GetDomainAndRegistry(url);
+
+  if (domain->empty()) {
+    return false;
+  }
+  return *domain != frame_domain;
+}
+
+void PhishingDOMFeatureExtractor::InsertFeatures() {
+  DCHECK(page_feature_state_.get());
+  features_->Clear();
+
+  const PageFeatureState& totals = *page_feature_state_;
+
+  // Link features are only emitted when at least one link was counted.
+  if (totals.total_links > 0) {
+    const double link_count = static_cast<double>(totals.total_links);
+
+    // Fraction of links that point to an external domain.
+    features_->AddRealFeature(features::kPageExternalLinksFreq,
+                              totals.external_links / link_count);
+
+    // One token feature per distinct external domain that is linked to.
+    typedef base::hash_set<std::string>::const_iterator DomainIterator;
+    const DomainIterator domains_end = totals.external_domains.end();
+    for (DomainIterator domain_it = totals.external_domains.begin();
+         domain_it != domains_end; ++domain_it) {
+      features_->AddBooleanFeature(features::kPageLinkDomain + *domain_it);
+    }
+
+    // Fraction of links that use https.
+    features_->AddRealFeature(features::kPageSecureLinksFreq,
+                              totals.secure_links / link_count);
+  }
+
+  // Presence flags for forms and for each kind of input element.
+  if (totals.num_forms > 0)
+    features_->AddBooleanFeature(features::kPageHasForms);
+  if (totals.num_text_inputs > 0)
+    features_->AddBooleanFeature(features::kPageHasTextInputs);
+  if (totals.num_pswd_inputs > 0)
+    features_->AddBooleanFeature(features::kPageHasPswdInputs);
+  if (totals.num_radio_inputs > 0)
+    features_->AddBooleanFeature(features::kPageHasRadioInputs);
+  if (totals.num_check_inputs > 0)
+    features_->AddBooleanFeature(features::kPageHasCheckInputs);
+
+  // Fraction of form actions that point to a different domain.
+  if (totals.total_actions > 0) {
+    const double action_count = static_cast<double>(totals.total_actions);
+    features_->AddRealFeature(features::kPageActionOtherDomainFreq,
+                              totals.action_other_domain / action_count);
+  }
+
+  // Fraction of image src attributes that point to a different domain.
+  if (totals.total_imgs > 0) {
+    const double image_count = static_cast<double>(totals.total_imgs);
+    features_->AddRealFeature(features::kPageImgOtherDomainFreq,
+                              totals.img_other_domain / image_count);
+  }
+
+  // The script tag count is discretized into two thresholds (for numerical
+  // stability); a count above 6 sets both features.
+  if (totals.num_script_tags > 1)
+    features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+  if (totals.num_script_tags > 6)
+    features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
+}
+
+} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
new file mode 100644
index 0000000..bc9d599
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// PhishingDOMFeatureExtractor handles computing DOM-based features for the
+// client-side phishing detection model. These include the presence of various
+// types of elements, ratios of external and secure links, and tokens for
+// external domains linked to.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
+#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/callback.h"
+#include "base/scoped_ptr.h"
+#include "base/task.h"
+
+class GURL;
+class RenderView;
+
+namespace WebKit {
+class WebElement;
+class WebFrame;
+}
+
+namespace safe_browsing {
+class FeatureMap;
+
+class PhishingDOMFeatureExtractor {
+ public:
+ // Callback to be run when feature extraction finishes. The callback
+ // argument is true if extraction was successful, false otherwise.
+ typedef Callback1<bool>::Type DoneCallback;
+
+ // Creates a PhishingDOMFeatureExtractor for the specified RenderView.
+ // The PhishingDOMFeatureExtractor should be destroyed prior to destroying
+ // the RenderView.
+ explicit PhishingDOMFeatureExtractor(RenderView* render_view);
+ ~PhishingDOMFeatureExtractor();
+
+ // Begins extracting features into the given FeatureMap for the page
+ // currently loaded in this object's RenderView. To avoid blocking the
+ // render thread for too long, the feature extractor may run in several
+ // chunks of work, posting a task to the current MessageLoop to continue
+ // processing. Once feature extraction is complete, |done_callback|
+ // is run. PhishingDOMFeatureExtractor takes ownership of the callback.
+ void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback);
+
+ // Cancels any pending feature extraction. The DoneCallback will not be run.
+ // Must be called if there is a feature extraction in progress when the page
+ // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
+ void CancelPendingExtraction();
+
+ private:
+ struct FrameData;
+ struct PageFeatureState;
+
+ // Does the actual work of ExtractFeatures. Intended to run until a maximum
+ // amount of time has elapsed and then post a task to continue; the time cap
+ // is still a TODO, so currently every frame is processed in a single call.
+ // When extraction finishes, calls InsertFeatures() and RunCallback(true).
+ void ExtractFeaturesWithTimeout();
+
+ // Handlers for the various HTML elements that we compute features for.
+ // Since some of the features (such as ratios) cannot be computed until
+ // feature extraction is finished, these handlers do not add to the feature
+ // map directly. Instead, they update the values in the PageFeatureState.
+ void HandleLink(const WebKit::WebElement& element);
+ void HandleForm(const WebKit::WebElement& element);
+ void HandleImage(const WebKit::WebElement& element);
+ void HandleInput(const WebKit::WebElement& element);
+ void HandleScript(const WebKit::WebElement& element);
+
+ // Helper to verify that there is no pending feature extraction. Dies in
+ // debug builds if the state is not as expected. In release builds it only
+ // logs an error and leaves the state untouched.
+ void CheckNoPendingExtraction();
+
+ // Runs |done_callback_| and then clears all internal state.
+ void RunCallback(bool success);
+
+ // Clears all internal feature extraction state.
+ void Clear();
+
+ // Called after advancing |cur_frame_| to update the state in
+ // |cur_frame_data_|. Returns true if the state was updated successfully.
+ bool ResetFrameData();
+
+ // Given a URL, stores its domain in |domain| (internal or external) unless
+ // the current frame's own domain is unknown. Returns true only if that
+ // domain is non-empty and differs from the domain of the current frame.
+ bool IsExternalDomain(const GURL& url, std::string* domain) const;
+
+ // Called once all frames have been processed to compute features from the
+ // PageFeatureState and add them to |features_|. See features.h for a
+ // description of which features are computed.
+ void InsertFeatures();
+
+ // Non-owned pointer to the view that we will extract features from.
+ RenderView* render_view_;
+
+ // The output parameters from the most recent call to ExtractFeatures().
+ FeatureMap* features_; // The caller keeps ownership of this.
+ scoped_ptr<DoneCallback> done_callback_;
+
+ // Non-owned pointer to the current frame that we are processing.
+ WebKit::WebFrame* cur_frame_;
+
+ // Stores extra state for |cur_frame_| that will be persisted until we
+ // advance to the next frame.
+ scoped_ptr<FrameData> cur_frame_data_;
+
+ // Stores the intermediate data used to create features. This data is
+ // accumulated across all frames in the RenderView.
+ scoped_ptr<PageFeatureState> page_feature_state_;
+
+ // Used to create ExtractFeaturesWithTimeout tasks.
+ // These tasks are revoked if extraction is cancelled.
+ ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_;
+
+ DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
+};
+
+} // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc
new file mode 100644
index 0000000..637b2bd
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc
@@ -0,0 +1,410 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
+
+#include <string.h> // for memcpy()
+#include <map>
+#include <string>
+
+#include "base/callback.h"
+#include "base/command_line.h"
+#include "base/message_loop.h"
+#include "base/process.h"
+#include "base/string_util.h"
+#include "chrome/common/main_function_params.h"
+#include "chrome/common/render_messages.h"
+#include "chrome/common/sandbox_init_wrapper.h"
+#include "chrome/renderer/mock_render_process.h"
+#include "chrome/renderer/render_thread.h"
+#include "chrome/renderer/render_view.h"
+#include "chrome/renderer/render_view_visitor.h"
+#include "chrome/renderer/renderer_main_platform_delegate.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "googleurl/src/gurl.h"
+#include "ipc/ipc_channel.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebURLRequest.h"
+#include "third_party/WebKit/WebKit/chromium/public/WebView.h"
+#include "webkit/glue/webkit_glue.h"
+
+using ::testing::ContainerEq;
+
+namespace safe_browsing {
+
+// Test fixture that brings up a renderer (a MockRenderProcess paired with a
+// real RenderThread) and plays the browser end of the IPC channel itself.
+// Resource requests coming from the renderer are answered with the canned
+// content stored in |responses_|, and the loaded page is then run through a
+// PhishingDOMFeatureExtractor.
+class PhishingDOMFeatureExtractorTest : public ::testing::Test,
+                                        public IPC::Channel::Listener,
+                                        public RenderViewVisitor {
+ public:
+  // IPC::Channel::Listener implementation. Dispatches the renderer messages
+  // this test cares about to the handlers below.
+  virtual void OnMessageReceived(const IPC::Message& message) {
+    IPC_BEGIN_MESSAGE_MAP(PhishingDOMFeatureExtractorTest, message)
+      IPC_MESSAGE_HANDLER(ViewHostMsg_RenderViewReady, OnRenderViewReady)
+      IPC_MESSAGE_HANDLER(ViewHostMsg_DidStopLoading, OnDidStopLoading)
+      IPC_MESSAGE_HANDLER(ViewHostMsg_RequestResource, OnRequestResource)
+    IPC_END_MESSAGE_MAP()
+  }
+
+  // RenderViewVisitor implementation. Records the first view visited in
+  // |view_| and returns false.
+  virtual bool Visit(RenderView* render_view) {
+    view_ = render_view;
+    return false;
+  }
+
+ protected:
+  virtual void SetUp() {
+    // Set up the renderer.  This code is largely adapted from
+    // render_view_test.cc and renderer_main.cc.  Note that we use a
+    // MockRenderProcess (because we don't need to use IPC for painting),
+    // but we use a real RenderThread so that we can use the ResourceDispatcher
+    // to fetch network resources.  These are then served canned content
+    // in OnRequestResource().
+    sandbox_init_wrapper_.reset(new SandboxInitWrapper);
+    command_line_.reset(new CommandLine(CommandLine::ARGUMENTS_ONLY));
+    params_.reset(new MainFunctionParams(*command_line_,
+                                         *sandbox_init_wrapper_, NULL));
+    platform_.reset(new RendererMainPlatformDelegate(*params_));
+    platform_->PlatformInitialize();
+
+    // We use a new IPC channel name for each test that runs.
+    // This is necessary because the renderer-side IPC channel is not
+    // shut down when the RenderThread goes away, so attempting to reuse
+    // the channel name gives an error (see ChildThread::~ChildThread()).
+    std::string thread_name = StringPrintf(
+        "phishing_dom_feature_Extractor_unittest.%d",
+        next_thread_id_++);
+    channel_.reset(new IPC::Channel(thread_name,
+                                    IPC::Channel::MODE_SERVER, this));
+    ASSERT_TRUE(channel_->Connect());
+
+    webkit_glue::SetJavaScriptFlags(L"--expose-gc");
+    mock_process_.reset(new MockRenderProcess);
+    render_thread_ = new RenderThread(thread_name);
+    mock_process_->set_main_thread(render_thread_);
+
+    // Tell the renderer to create a view, then wait until it's ready.
+    // We can't call View::Create() directly here or else we won't get
+    // RenderProcess's lazy initialization of WebKit.
+    view_ = NULL;
+    ViewMsg_New_Params params;
+    params.parent_window = 0;
+    params.view_id = kViewId;
+    params.session_storage_namespace_id = kInvalidSessionStorageNamespaceId;
+    ASSERT_TRUE(channel_->Send(new ViewMsg_New(params)));
+    msg_loop_.Run();
+
+    extractor_.reset(new PhishingDOMFeatureExtractor(view_));
+  }
+
+  virtual void TearDown() {
+    // Try very hard to collect garbage before shutting down.
+    GetMainFrame()->collectGarbage();
+    GetMainFrame()->collectGarbage();
+
+    // Ask the view to close, then keep pumping the message loop until
+    // RenderView::ForEach() no longer reports a view back through Visit().
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Close(kViewId)));
+    do {
+      msg_loop_.RunAllPending();
+      view_ = NULL;
+      RenderView::ForEach(this);
+    } while (view_);
+
+    mock_process_.reset();
+    msg_loop_.RunAllPending();
+    platform_->PlatformUninitialize();
+    platform_.reset();
+    command_line_.reset();
+    sandbox_init_wrapper_.reset();
+  }
+
+  // Returns the main WebFrame for our RenderView.
+  WebKit::WebFrame* GetMainFrame() {
+    return view_->webview()->mainFrame();
+  }
+
+  // Loads |url| into the RenderView, waiting for the load to finish.
+  void LoadURL(const std::string& url) {
+    GetMainFrame()->loadRequest(WebKit::WebURLRequest(GURL(url)));
+    msg_loop_.Run();
+  }
+
+  // Runs the DOMFeatureExtractor on the RenderView, waiting for the
+  // completion callback.  Returns the success boolean from the callback.
+  bool ExtractFeatures(FeatureMap* features) {
+    success_ = false;
+    extractor_->ExtractFeatures(
+        features,
+        NewCallback(this, &PhishingDOMFeatureExtractorTest::ExtractionDone));
+    msg_loop_.Run();
+    return success_;
+  }
+
+  // Completion callback for feature extraction.
+  void ExtractionDone(bool success) {
+    success_ = success;
+    msg_loop_.Quit();
+  }
+
+  // IPC message handlers below
+
+  // Notification that page load has finished.  Exit the message loop
+  // so that the test can continue.
+  void OnDidStopLoading() {
+    msg_loop_.Quit();
+  }
+
+  // Notification that the renderer wants to load a resource.
+  // If the requested url is in responses_, we send the renderer a 200
+  // and the supplied content, otherwise we send it a 404 error.
+  void OnRequestResource(const IPC::Message& message,
+                         int request_id,
+                         const ViewHostMsg_Resource_Request& request_data) {
+    // The header blocks are written with '\0' between lines and two '\0's
+    // at the end.  Because of those embedded NULs they have to be copied
+    // with an explicit length: assigning the literal as a plain C string
+    // stops at the first '\0' and silently drops the Content-Type line.
+    static const char kNotFoundHeaders[] =
+        "HTTP/1.1 404 Not Found\0Content-Type:text/html\0\0";
+    static const char kOkHeaders[] =
+        "HTTP/1.1 200 OK\0Content-Type:text/html\0\0";
+
+    std::string headers, body;
+    std::map<std::string, std::string>::const_iterator it =
+        responses_.find(request_data.url.spec());
+    if (it == responses_.end()) {
+      // sizeof() - 1 keeps the explicit NULs but not the implicit terminator.
+      headers.assign(kNotFoundHeaders, sizeof(kNotFoundHeaders) - 1);
+      body = "content not found";
+    } else {
+      headers.assign(kOkHeaders, sizeof(kOkHeaders) - 1);
+      body = it->second;
+    }
+
+    ResourceResponseHead response_head;
+    response_head.headers = new net::HttpResponseHeaders(headers);
+    response_head.mime_type = "text/html";
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_ReceivedResponse(
+        message.routing_id(), request_id, response_head)));
+
+    // Copy the body into a shared memory segment and send its handle along
+    // with the data-received message.
+    base::SharedMemory shared_memory;
+    ASSERT_TRUE(shared_memory.Create(std::wstring(), false,
+                                     false, body.size()));
+    ASSERT_TRUE(shared_memory.Map(body.size()));
+    memcpy(shared_memory.memory(), body.data(), body.size());
+
+    base::SharedMemoryHandle handle;
+    ASSERT_TRUE(shared_memory.GiveToProcess(base::Process::Current().handle(),
+                                            &handle));
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_DataReceived(
+        message.routing_id(), request_id, handle, body.size())));
+
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_RequestComplete(
+        message.routing_id(),
+        request_id,
+        URLRequestStatus(),
+        std::string())));
+  }
+
+  // Notification that the render view we've created is ready to use.
+  void OnRenderViewReady() {
+    // Grab a pointer to the new view using RenderViewVisitor.
+    ASSERT_TRUE(!view_);
+    RenderView::ForEach(this);
+    ASSERT_TRUE(view_);
+    msg_loop_.Quit();
+  }
+
+  static int next_thread_id_;  // incrementing counter for thread ids
+  static const int32 kViewId = 5;  // arbitrary id for our testing view
+
+  MessageLoopForIO msg_loop_;
+  // channel that the renderer uses to talk to the browser.
+  // For this test, we will handle the browser end of the channel.
+  scoped_ptr<IPC::Channel> channel_;
+  RenderThread* render_thread_;  // owned by mock_process_
+  scoped_ptr<MockRenderProcess> mock_process_;
+  RenderView* view_;  // not owned, deletes itself on close
+  scoped_ptr<RendererMainPlatformDelegate> platform_;
+  scoped_ptr<MainFunctionParams> params_;
+  scoped_ptr<CommandLine> command_line_;
+  scoped_ptr<SandboxInitWrapper> sandbox_init_wrapper_;
+
+  scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
+  // Map of URL -> response body for network requests from the renderer.
+  // Any URLs not in this map are served a 404 error.
+  std::map<std::string, std::string> responses_;
+  bool success_;  // holds the success value from ExtractFeatures
+};
+
+int PhishingDOMFeatureExtractorTest::next_thread_id_ = 0;
+
+// Checks the form- and input-related features over four successive loads of
+// the same URL, each served different canned content.
+TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
+  const std::string kPageUrl("http://host.com/");
+  FeatureMap expected;
+  FeatureMap extracted;
+
+  // Load 1: five forms (two relative actions, one on cgi.host.com, one on
+  // other.com, one with no action) plus a text input and a checkbox.
+  expected.AddBooleanFeature(features::kPageHasForms);
+  expected.AddRealFeature(features::kPageActionOtherDomainFreq, 0.25);
+  expected.AddBooleanFeature(features::kPageHasTextInputs);
+  expected.AddBooleanFeature(features::kPageHasCheckInputs);
+  responses_[kPageUrl] =
+      "<html><head><body>"
+      "<form action=\"query\"><input type=text><input type=checkbox></form>"
+      "<form action=\"http://cgi.host.com/submit\"></form>"
+      "<form action=\"http://other.com/\"></form>"
+      "<form action=\"query\"></form>"
+      "<form></form></body></html>";
+  LoadURL(kPageUrl);
+  ASSERT_TRUE(ExtractFeatures(&extracted));
+  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
+
+  // Load 2: a radio button and a password field, with no enclosing form.
+  expected.Clear();
+  expected.AddBooleanFeature(features::kPageHasRadioInputs);
+  expected.AddBooleanFeature(features::kPageHasPswdInputs);
+  responses_[kPageUrl] =
+      "<html><head><body>"
+      "<input type=\"radio\"><input type=password></body></html>";
+  extracted.Clear();
+  LoadURL(kPageUrl);
+  ASSERT_TRUE(ExtractFeatures(&extracted));
+  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
+
+  // Load 3: an <input> with no type attribute is expected to be reported as
+  // a text input.
+  expected.Clear();
+  expected.AddBooleanFeature(features::kPageHasTextInputs);
+  responses_[kPageUrl] =
+      "<html><head><body><input></body></html>";
+  extracted.Clear();
+  LoadURL(kPageUrl);
+  ASSERT_TRUE(ExtractFeatures(&extracted));
+  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
+
+  // Load 4: an <input> with an unrecognized type is likewise expected to be
+  // reported as a text input.
+  expected.Clear();
+  expected.AddBooleanFeature(features::kPageHasTextInputs);
+  responses_[kPageUrl] =
+      "<html><head><body><input type=\"invalid\"></body></html>";
+  extracted.Clear();
+  LoadURL(kPageUrl);
+  ASSERT_TRUE(ExtractFeatures(&extracted));
+  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
+}
+
+// Checks the link-related features (external link frequency, secure link
+// frequency and the per-domain link features) for a page served over http
+// and then for a page served over https.
+TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
+  // http page with three anchors: one to www2.host.com, one named anchor
+  // without an href, and one to chromium.org.
+  responses_["http://www.host.com/"] =
+      "<html><head><body>"
+      "<a href=\"http://www2.host.com/abc\">link</a>"
+      "<a name=page_anchor></a>"
+      "<a href=\"http://www.chromium.org/\">chromium</a>"
+      "</body></html>";
+
+  FeatureMap expected_features;
+  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.5);
+  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.0);
+  expected_features.AddBooleanFeature(features::kPageLinkDomain +
+                                      std::string("chromium.org"));
+
+  FeatureMap features;
+  LoadURL("http://www.host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  // https page with four anchors: a relative link, an http link to host.com,
+  // an https link to www2.host.com and an http link to chromium.org.  The
+  // old http entry is dropped so only the https URL is served.
+  responses_.clear();
+  responses_["https://www.host.com/"] =
+      "<html><head><body>"
+      "<a href=\"login\">this is secure</a>"
+      "<a href=\"http://host.com\">not secure</a>"
+      "<a href=\"https://www2.host.com/login\">also secure</a>"
+      "<a href=\"http://chromium.org/\">also not secure</a>"
+      "</body></html>";
+
+  expected_features.Clear();
+  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
+  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.5);
+  expected_features.AddBooleanFeature(features::kPageLinkDomain +
+                                      std::string("chromium.org"));
+
+  features.Clear();
+  LoadURL("https://www.host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+// Exercises the script-count and image-domain features over two loads of the
+// same URL.
+TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
+  const std::string kPageUrl("http://host.com/");
+  FeatureMap expected;
+  FeatureMap extracted;
+
+  // Load 1: two <script> tags and nothing else.
+  expected.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+  responses_[kPageUrl] =
+      "<html><head><script></script><script></script></head></html>";
+  LoadURL(kPageUrl);
+  ASSERT_TRUE(ExtractFeatures(&extracted));
+  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
+
+  // Load 2: seven <script> tags plus two images, one with a relative src
+  // and one served from host2.com.
+  expected.Clear();
+  expected.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+  expected.AddBooleanFeature(features::kPageNumScriptTagsGTSix);
+  expected.AddRealFeature(features::kPageImgOtherDomainFreq, 0.5);
+  responses_[kPageUrl] =
+      "<html><head><script></script><script></script><script></script>"
+      "<script></script><script></script><script></script><script></script>"
+      "</head><body><img src=\"blah.gif\">"
+      "<img src=\"http://host2.com/blah.gif\"></body></html>";
+  extracted.Clear();
+  LoadURL(kPageUrl);
+  ASSERT_TRUE(ExtractFeatures(&extracted));
+  EXPECT_THAT(extracted.features(), ContainerEq(expected.features()));
+}
+
+// Loads a top-level page with two iframes (one of which embeds a further
+// nested iframe) and checks the features extracted for the whole view.
+TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
+  // Test that features are aggregated across all frames.
+
+  // Top-level page: a text input, a relative link and two cross-domain
+  // iframes.
+  responses_["http://host.com/"] =
+      "<html><body><input type=text><a href=\"info.html\">link</a>"
+      "<iframe src=\"http://host2.com/\"></iframe>"
+      "<iframe src=\"http://host3.com/\"></iframe>"
+      "</body></html>";
+
+  // First iframe: one script, two forms, one link and a nested iframe.
+  responses_["http://host2.com/"] =
+      "<html><head><script></script><body>"
+      "<form action=\"http://host4.com/\"><input type=checkbox></form>"
+      "<form action=\"http://host2.com/submit\"></form>"
+      "<a href=\"http://www.host2.com/home\">link</a>"
+      "<iframe src=\"nested.html\"></iframe>"
+      "</body></html>";
+
+  // Nested iframe inside host2.com: a password input and two links, one of
+  // them an https link to host4.com.
+  responses_["http://host2.com/nested.html"] =
+      "<html><body><input type=password>"
+      "<a href=\"https://host4.com/\">link</a>"
+      "<a href=\"relative\">another</a>"
+      "</body></html>";
+
+  // Second iframe: one script and an image served from host.com.
+  responses_["http://host3.com/"] =
+      "<html><head><script></script><body>"
+      "<img src=\"http://host.com/123.png\">"
+      "</body></html>";
+
+  FeatureMap expected_features;
+  expected_features.AddBooleanFeature(features::kPageHasForms);
+  // Form action domains are compared to the URL of the document they're in,
+  // not the URL of the toplevel page.  So http://host2.com/ has two form
+  // actions, one of which is external.
+  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
+  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
+  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
+  expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
+  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
+  expected_features.AddBooleanFeature(features::kPageLinkDomain +
+                                      std::string("host4.com"));
+  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.25);
+  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+  expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 1.0);
+
+  FeatureMap features;
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+// TODO(bryner): Test extraction with multiple passes, including the case where
+// the node we stopped on is removed from the document.
+
+} // namespace safe_browsing