diff options
author | bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-07-29 00:37:45 +0000 |
---|---|---|
committer | bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-07-29 00:37:45 +0000 |
commit | f4dafe029aab967bbc6ec5ad28c9f928280367f3 (patch) | |
tree | 21233b5642296b8ba402c6a1b771cce64d9d37d1 /chrome/renderer | |
parent | 92608249eb322c9075f147c2dea302fb1c65acb1 (diff) | |
download | chromium_src-f4dafe029aab967bbc6ec5ad28c9f928280367f3.zip chromium_src-f4dafe029aab967bbc6ec5ad28c9f928280367f3.tar.gz chromium_src-f4dafe029aab967bbc6ec5ad28c9f928280367f3.tar.bz2 |
Add an extractor for DOM features to be used for client side phishing detection.
PhishingDOMFeatureExtractor iterates over the page elements and computes a
number of features. To avoid blocking the renderer for too long, the extractor
may run in several chunks of work, posting a task to continue processing if
necessary.
This CL only includes the feature extraction itself. I will add the logic to
cap the time per iteration in a follow-up CL.
BUG=none
TEST=PhishingDOMFeatureExtractorTest
Review URL: http://codereview.chromium.org/2878046
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54082 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer')
6 files changed, 1060 insertions, 1 deletions
diff --git a/chrome/renderer/safe_browsing/features.cc b/chrome/renderer/safe_browsing/features.cc index 47a093c..4d67cf3 100644 --- a/chrome/renderer/safe_browsing/features.cc +++ b/chrome/renderer/safe_browsing/features.cc @@ -15,6 +15,10 @@ FeatureMap::FeatureMap() {} FeatureMap::~FeatureMap() {} bool FeatureMap::AddBooleanFeature(const std::string& name) { + return AddRealFeature(name, 1.0); +} + +bool FeatureMap::AddRealFeature(const std::string& name, double value) { if (features_.size() >= kMaxFeatureMapSize) { // If we hit this case, it indicates that either kMaxFeatureMapSize is // too small, or there is a bug causing too many features to be added. @@ -25,7 +29,16 @@ bool FeatureMap::AddBooleanFeature(const std::string& name) { UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1); return false; } - features_[name] = 1.0; + // We only expect features in the range [0.0, 1.0], so fail if the feature is + // outside this range. + if (value < 0.0 || value > 1.0) { + LOG(ERROR) << "Not adding feature: " << name << " because the value " + << value << " is not in the range [0.0, 1.0]."; + UMA_HISTOGRAM_COUNTS("SBClientPhishing.IllegalFeatureValue", 1); + return false; + } + + features_[name] = value; return true; } @@ -47,5 +60,25 @@ const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3"; // URL path features const char kUrlPathToken[] = "UrlPathToken="; +// DOM HTML form features +const char kPageHasForms[] = "PageHasForms"; +const char kPageActionOtherDomainFreq[] = "PageActionOtherDomainFreq"; +const char kPageHasTextInputs[] = "PageHasTextInputs"; +const char kPageHasPswdInputs[] = "PageHasPswdInputs"; +const char kPageHasRadioInputs[] = "PageHasRadioInputs"; +const char kPageHasCheckInputs[] = "PageHasCheckInputs"; + +// DOM HTML link features +const char kPageExternalLinksFreq[] = "PageExternalLinksFreq"; +const char kPageLinkDomain[] = "PageLinkDomain="; +const char kPageSecureLinksFreq[] = "PageSecureLinksFreq"; + +// DOM HTML script 
features +const char kPageNumScriptTagsGTOne[] = "PageNumScriptTags>1"; +const char kPageNumScriptTagsGTSix[] = "PageNumScriptTags>6"; + +// Other DOM HTML features +const char kPageImgOtherDomainFreq[] = "PageImgOtherDomainFreq"; + } // namespace features } // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h index 1a82c61..f3c8348 100644 --- a/chrome/renderer/safe_browsing/features.h +++ b/chrome/renderer/safe_browsing/features.h @@ -44,6 +44,12 @@ class FeatureMap { // kMaxFeatureMapSize. bool AddBooleanFeature(const std::string& name); + // Adds a real-valued feature to a FeatureMap with the given value. + // Values must always be in the range [0.0, 1.0]. Returns true on + // success, or false if the feature map exceeds kMaxFeatureMapSize + // or the value is outside of the allowed range. + bool AddRealFeature(const std::string& name, double value); + // Provides read-only access to the current set of features. const base::hash_map<std::string, double>& features() const { return features_; @@ -103,6 +109,55 @@ extern const char kUrlNumOtherHostTokensGTThree[]; // token features, "abc" and "efg". Query parameters are not included. extern const char kUrlPathToken[]; +//////////////////////////////////////////////////// +// DOM HTML form features +//////////////////////////////////////////////////// + +// Set if the page has any <form> elements. +extern const char kPageHasForms[]; +// The fraction of form elements whose |action| attribute points to a +// URL on a different domain from the document URL. +extern const char kPageActionOtherDomainFreq[]; + +// Set if the page has any <input type="text"> elements +// (includes inputs with missing or unknown types). +extern const char kPageHasTextInputs[]; +// Set if the page has any <input type="password"> elements. +extern const char kPageHasPswdInputs[]; +// Set if the page has any <input type="radio"> elements. 
+extern const char kPageHasRadioInputs[]; +// Set if the page has any <input type="checkbox"> elements. +extern const char kPageHasCheckInputs[]; + +//////////////////////////////////////////////////// +// DOM HTML link features +//////////////////////////////////////////////////// + +// The fraction of links in the page which point to a domain other than the +// domain of the document. See "URL host features" above for a discussion +// of how the domain is computed. +extern const char kPageExternalLinksFreq[]; +// Token feature containing each external domain that is linked to. +extern const char kPageLinkDomain[]; +// Fraction of links in the page that use https. +extern const char kPageSecureLinksFreq[]; + +//////////////////////////////////////////////////// +// DOM HTML script features +//////////////////////////////////////////////////// + +// Set if the number of <script> elements in the page is greater than 1. +extern const char kPageNumScriptTagsGTOne[]; +// Set if the number of <script> elements in the page is greater than 6. +extern const char kPageNumScriptTagsGTSix[]; + +//////////////////////////////////////////////////// +// Other DOM HTML features +//////////////////////////////////////////////////// + +// The fraction of images whose src attribute points to an external domain. 
+extern const char kPageImgOtherDomainFreq[]; + } // namespace features } // namepsace safe_browsing diff --git a/chrome/renderer/safe_browsing/features_unittest.cc b/chrome/renderer/safe_browsing/features_unittest.cc index ad07ba2..ac5cb55 100644 --- a/chrome/renderer/safe_browsing/features_unittest.cc +++ b/chrome/renderer/safe_browsing/features_unittest.cc @@ -6,6 +6,7 @@ #include "base/format_macros.h" #include "base/string_util.h" +#include "testing/gmock/include/gmock/gmock.h" #include "testing/gtest/include/gtest/gtest.h" namespace safe_browsing { @@ -24,4 +25,20 @@ TEST(PhishingFeaturesTest, TooManyFeatures) { EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size()); } +TEST(PhishingFeaturesTest, IllegalFeatureValue) { + FeatureMap features; + EXPECT_FALSE(features.AddRealFeature("toosmall", -0.1)); + EXPECT_TRUE(features.AddRealFeature("zero", 0.0)); + EXPECT_TRUE(features.AddRealFeature("pointfive", 0.5)); + EXPECT_TRUE(features.AddRealFeature("one", 1.0)); + EXPECT_FALSE(features.AddRealFeature("toolarge", 1.1)); + + FeatureMap expected_features; + expected_features.AddRealFeature("zero", 0.0); + expected_features.AddRealFeature("pointfive", 0.5); + expected_features.AddRealFeature("one", 1.0); + EXPECT_THAT(features.features(), + ::testing::ContainerEq(expected_features.features())); +} + } // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc new file mode 100644 index 0000000..c8f4bd0 --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc @@ -0,0 +1,416 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" + +#include "base/compiler_specific.h" +#include "base/hash_tables.h" +#include "base/histogram.h" +#include "base/logging.h" +#include "chrome/renderer/render_view.h" +#include "chrome/renderer/safe_browsing/features.h" +#include "net/base/registry_controlled_domain.h" +#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" +#include "third_party/WebKit/WebKit/chromium/public/WebElement.h" +#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" +#include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h" +#include "third_party/WebKit/WebKit/chromium/public/WebString.h" +#include "third_party/WebKit/WebKit/chromium/public/WebView.h" + +namespace safe_browsing { + +// Intermediate state used for computing features. See features.h for +// descriptions of the DOM features that are computed. +struct PhishingDOMFeatureExtractor::PageFeatureState { + // Link related features + int external_links; + base::hash_set<std::string> external_domains; + int secure_links; + int total_links; + + // Form related features + int num_forms; + int num_text_inputs; + int num_pswd_inputs; + int num_radio_inputs; + int num_check_inputs; + int action_other_domain; + int total_actions; + + // Image related features + int img_other_domain; + int total_imgs; + + // How many script tags + int num_script_tags; + + PageFeatureState() + : external_links(0), + secure_links(0), + total_links(0), + num_forms(0), + num_text_inputs(0), + num_pswd_inputs(0), + num_radio_inputs(0), + num_check_inputs(0), + action_other_domain(0), + total_actions(0), + img_other_domain(0), + total_imgs(0), + num_script_tags(0) {} + + ~PageFeatureState() {} +}; + +// Per-frame state +struct PhishingDOMFeatureExtractor::FrameData { + // This is our reference to document.all, which is an iterator over all + // of the elements in the document. It keeps track of our current position. 
+ WebKit::WebNodeCollection elements; + // The domain of the document URL, stored here so that we don't need to + // recompute it every time it's needed. + std::string domain; +}; + +PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( + RenderView* render_view) + : render_view_(render_view), + ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { + Clear(); +} + +PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() { + // The RenderView should have called CancelPendingExtraction() before + // we are destroyed. + CheckNoPendingExtraction(); +} + +void PhishingDOMFeatureExtractor::ExtractFeatures( + FeatureMap* features, + DoneCallback* done_callback) { + // The RenderView should have called CancelPendingExtraction() before + // starting a new extraction, so DCHECK this. + CheckNoPendingExtraction(); + // However, in an opt build, we will go ahead and clean up the pending + // extraction so that we can start in a known state. + CancelPendingExtraction(); + + features_ = features; + done_callback_.reset(done_callback); + MessageLoop::current()->PostTask( + FROM_HERE, + method_factory_.NewRunnableMethod( + &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); +} + +void PhishingDOMFeatureExtractor::CancelPendingExtraction() { + // Cancel any pending callbacks, and clear our state. + method_factory_.RevokeAll(); + Clear(); +} + +void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { + if (!cur_frame_) { + WebKit::WebView* web_view = render_view_->webview(); + if (!web_view) { + // When the WebView is going away, the render view should have called + // CancelPendingExtraction() which should have stopped any pending work, + // so this case should not happen. 
+ NOTREACHED(); + RunCallback(false); + return; + } + cur_frame_ = web_view->mainFrame(); + page_feature_state_.reset(new PageFeatureState); + } + + for (; cur_frame_; + cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { + WebKit::WebNode cur_node; + if (cur_frame_data_.get()) { + // We're resuming traversal of a frame, so just advance to the next node. + cur_node = cur_frame_data_->elements.nextItem(); + } else { + // We just moved to a new frame, so update our frame state + // and advance to the first element. + if (!ResetFrameData()) { + // Nothing in this frame, move on to the next one. + LOG(WARNING) << "No content in frame, skipping"; + continue; + } + cur_node = cur_frame_data_->elements.firstItem(); + } + + for (; !cur_node.isNull(); + cur_node = cur_frame_data_->elements.nextItem()) { + if (!cur_node.isElementNode()) { + continue; + } + WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); + if (element.hasTagName("a")) { + HandleLink(element); + } else if (element.hasTagName("form")) { + HandleForm(element); + } else if (element.hasTagName("img")) { + HandleImage(element); + } else if (element.hasTagName("input")) { + HandleInput(element); + } else if (element.hasTagName("script")) { + HandleScript(element); + } + + // TODO(bryner): stop if too much time has elapsed, and add histograms + // for the time spent processing. + } + + // We're done with this frame, recalculate the FrameData when we + // advance to the next frame. + cur_frame_data_.reset(); + } + + InsertFeatures(); + RunCallback(true); +} + +void PhishingDOMFeatureExtractor::HandleLink( + const WebKit::WebElement& element) { + // Count the number of times we link to a different host. + if (!element.hasAttribute("href")) { + DLOG(INFO) << "Skipping anchor tag with no href"; + return; + } + + // Retrieve the link and resolve the link in case it's relative. 
+ WebKit::WebURL full_url = element.document().completeURL( + element.getAttribute("href")); + + std::string domain; + bool is_external = IsExternalDomain(full_url, &domain); + if (domain.empty()) { + LOG(ERROR) << "Could not extract domain from link: " << full_url; + return; + } + + if (is_external) { + ++page_feature_state_->external_links; + + // Record each unique domain that we link to. + page_feature_state_->external_domains.insert(domain); + } + + // Check how many are https links. + if (GURL(full_url).SchemeIs("https")) { + ++page_feature_state_->secure_links; + } + + ++page_feature_state_->total_links; +} + +void PhishingDOMFeatureExtractor::HandleForm( + const WebKit::WebElement& element) { + // Increment the number of forms on this page. + ++page_feature_state_->num_forms; + + // Record whether the action points to a different domain. + if (!element.hasAttribute("action")) { + return; + } + + WebKit::WebURL full_url = element.document().completeURL( + element.getAttribute("action")); + + std::string domain; + bool is_external = IsExternalDomain(full_url, &domain); + if (domain.empty()) { + LOG(ERROR) << "Could not extract domain from form action: " << full_url; + return; + } + + if (is_external) { + ++page_feature_state_->action_other_domain; + } + ++page_feature_state_->total_actions; +} + +void PhishingDOMFeatureExtractor::HandleImage( + const WebKit::WebElement& element) { + if (!element.hasAttribute("src")) { + DLOG(INFO) << "Skipping img tag with no src"; + } + + // Record whether the image points to a different domain. 
+ WebKit::WebURL full_url = element.document().completeURL( + element.getAttribute("src")); + std::string domain; + bool is_external = IsExternalDomain(full_url, &domain); + if (domain.empty()) { + LOG(ERROR) << "Could not extract domain from image src: " << full_url; + return; + } + + if (is_external) { + ++page_feature_state_->img_other_domain; + } + ++page_feature_state_->total_imgs; +} + +void PhishingDOMFeatureExtractor::HandleInput( + const WebKit::WebElement& element) { + // The HTML spec says that if the type is unspecified, it defaults to text. + // In addition, any unrecognized type will be treated as a text input. + // + // Note that we use the attribute value rather than + // WebFormControlElement::formControlType() for consistency with the + // way the phishing classification model is created. + std::string type = element.getAttribute("type").utf8(); + StringToLowerASCII(&type); + if (type == "password") { + ++page_feature_state_->num_pswd_inputs; + } else if (type == "radio") { + ++page_feature_state_->num_radio_inputs; + } else if (type == "checkbox") { + ++page_feature_state_->num_check_inputs; + } else if (type != "submit" && type != "reset" && type != "file" && + type != "hidden" && type != "image" && type != "button") { + // Note that there are a number of new input types in HTML5 that are not + // handled above. For now, we will consider these as text inputs since + // they could be used to capture user input. 
+ ++page_feature_state_->num_text_inputs; + } +} + +void PhishingDOMFeatureExtractor::HandleScript( + const WebKit::WebElement& element) { + ++page_feature_state_->num_script_tags; +} + +void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { + DCHECK(!done_callback_.get()); + DCHECK(!cur_frame_data_.get()); + DCHECK(!cur_frame_); + if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { + LOG(ERROR) << "Extraction in progress, missing call to " + << "CancelPendingExtraction"; + } +} + +void PhishingDOMFeatureExtractor::RunCallback(bool success) { + DCHECK(done_callback_.get()); + done_callback_->Run(success); + Clear(); +} + +void PhishingDOMFeatureExtractor::Clear() { + features_ = NULL; + done_callback_.reset(NULL); + cur_frame_data_.reset(NULL); + cur_frame_ = NULL; +} + +bool PhishingDOMFeatureExtractor::ResetFrameData() { + DCHECK(cur_frame_); + DCHECK(!cur_frame_data_.get()); + + WebKit::WebDocument doc = cur_frame_->document(); + if (doc.isNull()) { + return false; + } + cur_frame_data_.reset(new FrameData()); + cur_frame_data_->elements = doc.all(); + cur_frame_data_->domain = + net::RegistryControlledDomainService::GetDomainAndRegistry( + cur_frame_->url()); + return true; +} + +bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, + std::string* domain) const { + DCHECK(domain); + DCHECK(cur_frame_data_.get()); + + if (cur_frame_data_->domain.empty()) { + return false; + } + + // TODO(bryner): Ensure that the url encoding is consistent with the features + // in the model. 
+ if (url.HostIsIPAddress()) { + domain->assign(url.host()); + } else { + domain->assign(net::RegistryControlledDomainService::GetDomainAndRegistry( + url)); + } + + return !domain->empty() && *domain != cur_frame_data_->domain; +} + +void PhishingDOMFeatureExtractor::InsertFeatures() { + DCHECK(page_feature_state_.get()); + features_->Clear(); + + if (page_feature_state_->total_links > 0) { + // Add a feature for the fraction of times the page links to an external + // domain vs. an internal domain. + double link_freq = static_cast<double>( + page_feature_state_->external_links) / + page_feature_state_->total_links; + features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq); + + // Add a feature for each unique domain that we're linking to + for (base::hash_set<std::string>::iterator it = + page_feature_state_->external_domains.begin(); + it != page_feature_state_->external_domains.end(); ++it) { + features_->AddBooleanFeature(features::kPageLinkDomain + *it); + } + + // Fraction of links that use https. + double secure_freq = static_cast<double>( + page_feature_state_->secure_links) / page_feature_state_->total_links; + features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq); + } + + // Record whether forms appear and whether various form elements appear. + if (page_feature_state_->num_forms > 0) { + features_->AddBooleanFeature(features::kPageHasForms); + } + if (page_feature_state_->num_text_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasTextInputs); + } + if (page_feature_state_->num_pswd_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasPswdInputs); + } + if (page_feature_state_->num_radio_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasRadioInputs); + } + if (page_feature_state_->num_check_inputs > 0) { + features_->AddBooleanFeature(features::kPageHasCheckInputs); + } + + // Record fraction of form actions that point to a different domain. 
+ if (page_feature_state_->total_actions > 0) { + double action_freq = static_cast<double>( + page_feature_state_->action_other_domain) / + page_feature_state_->total_actions; + features_->AddRealFeature(features::kPageActionOtherDomainFreq, + action_freq); + } + + // Record how many image src attributes point to a different domain. + if (page_feature_state_->total_imgs > 0) { + double img_freq = static_cast<double>( + page_feature_state_->img_other_domain) / + page_feature_state_->total_imgs; + features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq); + } + + // Record number of script tags (discretized for numerical stability.) + if (page_feature_state_->num_script_tags > 1) { + features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); + if (page_feature_state_->num_script_tags > 6) { + features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); + } + } +} + +} // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h new file mode 100644 index 0000000..bc9d599 --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h @@ -0,0 +1,128 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// PhishingDOMFeatureExtractor handles computing DOM-based features for the +// client-side phishing detection model. These include the presence of various +// types of elements, ratios of external and secure links, and tokens for +// external domains linked to. 
+ +#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ +#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ + +#include <string> + +#include "base/basictypes.h" +#include "base/callback.h" +#include "base/scoped_ptr.h" +#include "base/task.h" + +class GURL; +class RenderView; + +namespace WebKit { +class WebElement; +class WebFrame; +} + +namespace safe_browsing { +class FeatureMap; + +class PhishingDOMFeatureExtractor { + public: + // Callback to be run when feature extraction finishes. The callback + // argument is true if extraction was successful, false otherwise. + typedef Callback1<bool>::Type DoneCallback; + + // Creates a PhishingDOMFeatureExtractor for the specified RenderView. + // The PhishingDOMFeatureExtractor should be destroyed prior to destroying + // the RenderView. + explicit PhishingDOMFeatureExtractor(RenderView* render_view); + ~PhishingDOMFeatureExtractor(); + + // Begins extracting features into the given FeatureMap for the page + // currently loaded in this object's RenderView. To avoid blocking the + // render thread for too long, the feature extractor may run in several + // chunks of work, posting a task to the current MessageLoop to continue + // processing. Once feature extraction is complete, |done_callback| + // is run. PhishingDOMFeatureExtractor takes ownership of the callback. + void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback); + + // Cancels any pending feature extraction. The DoneCallback will not be run. + // Must be called if there is a feature extraction in progress when the page + // is unloaded or the PhishingDOMFeatureExtractor is destroyed. + void CancelPendingExtraction(); + + private: + struct FrameData; + struct PageFeatureState; + + // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs + // until a predefined maximum amount of time has elapsed, then posts a task + // to the current MessageLoop to continue extraction. 
When extraction + // finishes, calls RunCallback(). + void ExtractFeaturesWithTimeout(); + + // Handlers for the various HTML elements that we compute features for. + // Since some of the features (such as ratios) cannot be computed until + // feature extraction is finished, these handlers do not add to the feature + // map directly. Instead, they update the values in the PageFeatureState. + void HandleLink(const WebKit::WebElement& element); + void HandleForm(const WebKit::WebElement& element); + void HandleImage(const WebKit::WebElement& element); + void HandleInput(const WebKit::WebElement& element); + void HandleScript(const WebKit::WebElement& element); + + // Helper to verify that there is no pending feature extraction. Dies in + // debug builds if the state is not as expected. In release builds, where + // the DCHECKs do not fire, it logs an error instead. + void CheckNoPendingExtraction(); + + // Runs |done_callback_| and then clears all internal state. + void RunCallback(bool success); + + // Clears all internal feature extraction state. + void Clear(); + + // Called after advancing |cur_frame_| to update the state in + // |cur_frame_data_|. Returns true if the state was updated successfully. + bool ResetFrameData(); + + // Given a URL, checks whether its domain is different from the domain of + // the current frame's URL and returns true if so. The URL's domain (or its + // host, for IP addresses) is stored in |domain| whether or not it is + // external; if the current frame has no domain, |domain| is left untouched + // and false is returned. + bool IsExternalDomain(const GURL& url, std::string* domain) const; + + // Called once all frames have been processed to compute features from the + // PageFeatureState and add them to |features_|. See features.h for a + // description of which features are computed. + void InsertFeatures(); + + // Non-owned pointer to the view that we will extract features from. + RenderView* render_view_; + + // The output parameters from the most recent call to ExtractFeatures(). + FeatureMap* features_; // The caller keeps ownership of this. 
+ scoped_ptr<DoneCallback> done_callback_; + + // Non-owned pointer to the current frame that we are processing. + WebKit::WebFrame* cur_frame_; + + // Stores extra state for |cur_frame_| that will be persisted until we + // advance to the next frame. + scoped_ptr<FrameData> cur_frame_data_; + + // Stores the intermediate data used to create features. This data is + // accumulated across all frames in the RenderView. + scoped_ptr<PageFeatureState> page_feature_state_; + + // Used to create ExtractFeaturesWithTimeout tasks. + // These tasks are revoked if extraction is cancelled. + ScopedRunnableMethodFactory<PhishingDOMFeatureExtractor> method_factory_; + + DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor); +}; + +} // namespace safe_browsing + +#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_ diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc new file mode 100644 index 0000000..637b2bd --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_unittest.cc @@ -0,0 +1,410 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" + +#include <string.h> // for memcpy() +#include <map> +#include <string> + +#include "base/callback.h" +#include "base/command_line.h" +#include "base/message_loop.h" +#include "base/process.h" +#include "base/string_util.h" +#include "chrome/common/main_function_params.h" +#include "chrome/common/render_messages.h" +#include "chrome/common/sandbox_init_wrapper.h" +#include "chrome/renderer/mock_render_process.h" +#include "chrome/renderer/render_thread.h" +#include "chrome/renderer/render_view.h" +#include "chrome/renderer/render_view_visitor.h" +#include "chrome/renderer/renderer_main_platform_delegate.h" +#include "chrome/renderer/safe_browsing/features.h" +#include "googleurl/src/gurl.h" +#include "ipc/ipc_channel.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "third_party/WebKit/WebKit/chromium/public/WebFrame.h" +#include "third_party/WebKit/WebKit/chromium/public/WebURLRequest.h" +#include "third_party/WebKit/WebKit/chromium/public/WebView.h" +#include "webkit/glue/webkit_glue.h" + +using ::testing::ContainerEq; + +namespace safe_browsing { + +class PhishingDOMFeatureExtractorTest : public ::testing::Test, + public IPC::Channel::Listener, + public RenderViewVisitor { + public: + // IPC::Channel::Listener implementation. + virtual void OnMessageReceived(const IPC::Message& message) { + IPC_BEGIN_MESSAGE_MAP(PhishingDOMFeatureExtractorTest, message) + IPC_MESSAGE_HANDLER(ViewHostMsg_RenderViewReady, OnRenderViewReady) + IPC_MESSAGE_HANDLER(ViewHostMsg_DidStopLoading, OnDidStopLoading) + IPC_MESSAGE_HANDLER(ViewHostMsg_RequestResource, OnRequestResource) + IPC_END_MESSAGE_MAP() + } + + // RenderViewVisitor implementation. + virtual bool Visit(RenderView* render_view) { + view_ = render_view; + return false; + } + + protected: + virtual void SetUp() { + // Set up the renderer. 
This code is largely adapted from + // render_view_test.cc and renderer_main.cc. Note that we use a + // MockRenderProcess (because we don't need to use IPC for painting), + // but we use a real RenderThread so that we can use the ResourceDispatcher + // to fetch network resources. These are then served canned content + // in OnRequestResource(). + sandbox_init_wrapper_.reset(new SandboxInitWrapper); + command_line_.reset(new CommandLine(CommandLine::ARGUMENTS_ONLY)); + params_.reset(new MainFunctionParams(*command_line_, + *sandbox_init_wrapper_, NULL)); + platform_.reset(new RendererMainPlatformDelegate(*params_)); + platform_->PlatformInitialize(); + + // We use a new IPC channel name for each test that runs. + // This is necessary because the renderer-side IPC channel is not + // shut down when the RenderThread goes away, so attempting to reuse + // the channel name gives an error (see ChildThread::~ChildThread()). + std::string thread_name = StringPrintf( + "phishing_dom_feature_Extractor_unittest.%d", + next_thread_id_++); + channel_.reset(new IPC::Channel(thread_name, + IPC::Channel::MODE_SERVER, this)); + ASSERT_TRUE(channel_->Connect()); + + webkit_glue::SetJavaScriptFlags(L"--expose-gc"); + mock_process_.reset(new MockRenderProcess); + render_thread_ = new RenderThread(thread_name); + mock_process_->set_main_thread(render_thread_); + + // Tell the renderer to create a view, then wait until it's ready. + // We can't call View::Create() directly here or else we won't get + // RenderProcess's lazy initialization of WebKit. + view_ = NULL; + ViewMsg_New_Params params; + params.parent_window = 0; + params.view_id = kViewId; + params.session_storage_namespace_id = kInvalidSessionStorageNamespaceId; + ASSERT_TRUE(channel_->Send(new ViewMsg_New(params))); + msg_loop_.Run(); + + extractor_.reset(new PhishingDOMFeatureExtractor(view_)); + } + + virtual void TearDown() { + // Try very hard to collect garbage before shutting down. 
+ GetMainFrame()->collectGarbage(); + GetMainFrame()->collectGarbage(); + + ASSERT_TRUE(channel_->Send(new ViewMsg_Close(kViewId))); + do { + msg_loop_.RunAllPending(); + view_ = NULL; + RenderView::ForEach(this); + } while (view_); + + mock_process_.reset(); + msg_loop_.RunAllPending(); + platform_->PlatformUninitialize(); + platform_.reset(); + command_line_.reset(); + sandbox_init_wrapper_.reset(); + } + + // Returns the main WebFrame for our RenderView. + WebKit::WebFrame* GetMainFrame() { + return view_->webview()->mainFrame(); + } + + // Loads |url| into the RenderView, waiting for the load to finish. + void LoadURL(const std::string& url) { + GetMainFrame()->loadRequest(WebKit::WebURLRequest(GURL(url))); + msg_loop_.Run(); + } + + // Runs the DOMFeatureExtractor on the RenderView, waiting for the + // completion callback. Returns the success boolean from the callback. + bool ExtractFeatures(FeatureMap* features) { + success_ = false; + extractor_->ExtractFeatures( + features, + NewCallback(this, &PhishingDOMFeatureExtractorTest::ExtractionDone)); + msg_loop_.Run(); + return success_; + } + + // Completion callback for feature extraction. + void ExtractionDone(bool success) { + success_ = success; + msg_loop_.Quit(); + } + + // IPC message handlers below + + // Notification that page load has finished. Exit the message loop + // so that the test can continue. + void OnDidStopLoading() { + msg_loop_.Quit(); + } + + // Notification that the renderer wants to load a resource. + // If the requested url is in responses_, we send the renderer a 200 + // and the supplied content, otherwise we send it a 404 error. 
+  // Replies are addressed to |message|'s routing id and tagged with
+  // |request_id|; |request_data.url| is the key looked up in responses_.
+  void OnRequestResource(const IPC::Message& message,
+                         int request_id,
+                         const ViewHostMsg_Resource_Request& request_data) {
+    // The raw header blocks use '\0' as the line separator. They are kept as
+    // arrays and copied with an explicit length, because assigning such a
+    // literal to a std::string through a const char* stops at the first
+    // embedded '\0' and would silently drop every header after the status
+    // line.
+    static const char kNotFoundHeaders[] =
+        "HTTP/1.1 404 Not Found\0Content-Type:text/html\0\0";
+    static const char kOkHeaders[] =
+        "HTTP/1.1 200 OK\0Content-Type:text/html\0\0";
+
+    std::string headers, body;
+    std::map<std::string, std::string>::const_iterator it =
+        responses_.find(request_data.url.spec());
+    if (it == responses_.end()) {
+      // sizeof() - 1 drops only the literal's implicit terminator and keeps
+      // the explicit trailing "\0\0".
+      headers.assign(kNotFoundHeaders, sizeof(kNotFoundHeaders) - 1);
+      body = "content not found";
+    } else {
+      headers.assign(kOkHeaders, sizeof(kOkHeaders) - 1);
+      body = it->second;
+    }
+
+    // Step 1: announce the response (status line, headers and mime type).
+    ResourceResponseHead response_head;
+    response_head.headers = new net::HttpResponseHeaders(headers);
+    response_head.mime_type = "text/html";
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_ReceivedResponse(
+        message.routing_id(), request_id, response_head)));
+
+    // Step 2: copy the body into a shared memory segment and pass a handle
+    // for it to the renderer, which lives in this same process.
+    base::SharedMemory shared_memory;
+    ASSERT_TRUE(shared_memory.Create(std::wstring(), false,
+                                     false, body.size()));
+    ASSERT_TRUE(shared_memory.Map(body.size()));
+    memcpy(shared_memory.memory(), body.data(), body.size());
+
+    base::SharedMemoryHandle handle;
+    ASSERT_TRUE(shared_memory.GiveToProcess(base::Process::Current().handle(),
+                                            &handle));
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_DataReceived(
+        message.routing_id(), request_id, handle, body.size())));
+
+    // Step 3: tell the renderer that the request is complete.
+    ASSERT_TRUE(channel_->Send(new ViewMsg_Resource_RequestComplete(
+        message.routing_id(),
+        request_id,
+        URLRequestStatus(),
+        std::string())));
+  }
+
+  // Notification that the render view we've created is ready to use.
+  // Quits the message loop so the code waiting on the loop can continue.
+  void OnRenderViewReady() {
+    // Grab a pointer to the new view using RenderViewVisitor.
+    ASSERT_TRUE(!view_);
+    RenderView::ForEach(this);
+    ASSERT_TRUE(view_);
+    msg_loop_.Quit();
+  }
+
+  static int next_thread_id_;  // incrementing counter for thread ids
+  static const int32 kViewId = 5;  // arbitrary id for our testing view
+
+  MessageLoopForIO msg_loop_;
+  // channel that the renderer uses to talk to the browser.
+  // For this test, we will handle the browser end of the channel.
+  scoped_ptr<IPC::Channel> channel_;
+  RenderThread* render_thread_;  // owned by mock_process_
+  scoped_ptr<MockRenderProcess> mock_process_;
+  RenderView* view_;  // not owned, deletes itself on close
+  scoped_ptr<RendererMainPlatformDelegate> platform_;
+  scoped_ptr<MainFunctionParams> params_;
+  scoped_ptr<CommandLine> command_line_;
+  scoped_ptr<SandboxInitWrapper> sandbox_init_wrapper_;
+
+  scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
+  // Map of URL -> response body for network requests from the renderer.
+  // Any URLs not in this map are served a 404 error.
+  std::map<std::string, std::string> responses_;
+  bool success_;  // holds the success value from ExtractFeatures
+};
+
+int PhishingDOMFeatureExtractorTest::next_thread_id_ = 0;
+
+// Checks the form-related features on four successive versions of
+// http://host.com/. Each round replaces the canned response, reloads the
+// page and compares the extracted map against the expected one.
+TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
+  // Five forms: two with the relative action "query", one targeting
+  // cgi.host.com, one targeting other.com and one without an action. The
+  // first form also holds a text input and a checkbox.
+  responses_["http://host.com/"] =
+      "<html><head><body>"
+      "<form action=\"query\"><input type=text><input type=checkbox></form>"
+      "<form action=\"http://cgi.host.com/submit\"></form>"
+      "<form action=\"http://other.com/\"></form>"
+      "<form action=\"query\"></form>"
+      "<form></form></body></html>";
+
+  FeatureMap expected_features;
+  expected_features.AddBooleanFeature(features::kPageHasForms);
+  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.25);
+  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
+  expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
+
+  FeatureMap features;
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  // Inputs that are not inside any form: a radio button and a password
+  // field.
+  responses_["http://host.com/"] =
+      "<html><head><body>"
+      "<input type=\"radio\"><input type=password></body></html>";
+
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kPageHasRadioInputs);
+  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
+
+  features.Clear();
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  // An <input> without a type attribute is expected to be reported as a
+  // text input.
+  responses_["http://host.com/"] =
+      "<html><head><body><input></body></html>";
+
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
+
+  features.Clear();
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  // An unrecognized type value is likewise expected to be reported as a
+  // text input.
+  responses_["http://host.com/"] =
+      "<html><head><body><input type=\"invalid\"></body></html>";
+
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
+
+  features.Clear();
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+// Checks the link-related features, first on an http page and then on an
+// https page.
+TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
+  // Two links with an href (www2.host.com and www.chromium.org) and a named
+  // anchor that has no href.
+  responses_["http://www.host.com/"] =
+      "<html><head><body>"
+      "<a href=\"http://www2.host.com/abc\">link</a>"
+      "<a name=page_anchor></a>"
+      "<a href=\"http://www.chromium.org/\">chromium</a>"
+      "</body></html>";
+
+  FeatureMap expected_features;
+  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.5);
+  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.0);
+  expected_features.AddBooleanFeature(features::kPageLinkDomain +
+                                      std::string("chromium.org"));
+
+  FeatureMap features;
+  LoadURL("http://www.host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  // Same host served over https: a relative link, an http link to host.com,
+  // an https link to www2.host.com and an http link to chromium.org.
+  responses_.clear();
+  responses_["https://www.host.com/"] =
+      "<html><head><body>"
+      "<a href=\"login\">this is secure</a>"
+      "<a href=\"http://host.com\">not secure</a>"
+      "<a href=\"https://www2.host.com/login\">also secure</a>"
+      "<a href=\"http://chromium.org/\">also not secure</a>"
+      "</body></html>";
+
+  expected_features.Clear();
+  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
+  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.5);
+  expected_features.AddBooleanFeature(features::kPageLinkDomain +
+                                      std::string("chromium.org"));
+
+  features.Clear();
+  LoadURL("https://www.host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+// Checks the script-count thresholds and the image-domain frequency.
+TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
+  // Two <script> tags and nothing else.
+  responses_["http://host.com/"] =
+      "<html><head><script></script><script></script></head></html>";
+
+  FeatureMap expected_features;
+  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+
+  FeatureMap features;
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  // Seven <script> tags, plus one image with a relative src and one image
+  // served from host2.com.
+  responses_["http://host.com/"] =
+      "<html><head><script></script><script></script><script></script>"
+      "<script></script><script></script><script></script><script></script>"
+      "</head><body><img src=\"blah.gif\">"
+      "<img src=\"http://host2.com/blah.gif\"></body></html>";
+
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTSix);
+  expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 0.5);
+
+  features.Clear();
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
+  // Test that features are aggregated across all frames.
+  // The top-level page embeds host2.com and host3.com; host2.com in turn
+  // embeds its own nested.html.
+  responses_["http://host.com/"] =
+      "<html><body><input type=text><a href=\"info.html\">link</a>"
+      "<iframe src=\"http://host2.com/\"></iframe>"
+      "<iframe src=\"http://host3.com/\"></iframe>"
+      "</body></html>";
+
+  responses_["http://host2.com/"] =
+      "<html><head><script></script><body>"
+      "<form action=\"http://host4.com/\"><input type=checkbox></form>"
+      "<form action=\"http://host2.com/submit\"></form>"
+      "<a href=\"http://www.host2.com/home\">link</a>"
+      "<iframe src=\"nested.html\"></iframe>"
+      "</body></html>";
+
+  responses_["http://host2.com/nested.html"] =
+      "<html><body><input type=password>"
+      "<a href=\"https://host4.com/\">link</a>"
+      "<a href=\"relative\">another</a>"
+      "</body></html>";
+
+  responses_["http://host3.com/"] =
+      "<html><head><script></script><body>"
+      "<img src=\"http://host.com/123.png\">"
+      "</body></html>";
+
+  FeatureMap expected_features;
+  expected_features.AddBooleanFeature(features::kPageHasForms);
+  // Form action domains are compared to the URL of the document they're in,
+  // not the URL of the toplevel page. So http://host2.com/ has two form
+  // actions, one of which is external.
+  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
+  expected_features.AddBooleanFeature(features::kPageHasTextInputs);
+  expected_features.AddBooleanFeature(features::kPageHasPswdInputs);
+  expected_features.AddBooleanFeature(features::kPageHasCheckInputs);
+  expected_features.AddRealFeature(features::kPageExternalLinksFreq, 0.25);
+  expected_features.AddBooleanFeature(features::kPageLinkDomain +
+                                      std::string("host4.com"));
+  expected_features.AddRealFeature(features::kPageSecureLinksFreq, 0.25);
+  expected_features.AddBooleanFeature(features::kPageNumScriptTagsGTOne);
+  expected_features.AddRealFeature(features::kPageImgOtherDomainFreq, 1.0);
+
+  FeatureMap features;
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+}
+
+// TODO(bryner): Test extraction with multiple passes, including the case where
+// the node we stopped on is removed from the document.
+
+}  // namespace safe_browsing |