diff options
5 files changed, 234 insertions, 11 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi index f101c1e..b47adea 100644 --- a/chrome/chrome_renderer.gypi +++ b/chrome/chrome_renderer.gypi @@ -181,6 +181,7 @@ 'renderer/renderer_webstoragearea_impl.h', 'renderer/renderer_webstoragenamespace_impl.cc', 'renderer/renderer_webstoragenamespace_impl.h', + 'renderer/safe_browsing/feature_extractor_clock.h', 'renderer/safe_browsing/features.cc', 'renderer/safe_browsing/features.h', 'renderer/safe_browsing/phishing_dom_feature_extractor.cc', diff --git a/chrome/renderer/safe_browsing/feature_extractor_clock.h b/chrome/renderer/safe_browsing/feature_extractor_clock.h new file mode 100644 index 0000000..0f5cd37 --- /dev/null +++ b/chrome/renderer/safe_browsing/feature_extractor_clock.h @@ -0,0 +1,29 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// A simple abstraction for getting the current time during feature extraction. +// This can be mocked out for testing. + +#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_ +#define CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_ + +#include "base/time.h" + +namespace safe_browsing { + +class FeatureExtractorClock { + public: + FeatureExtractorClock() {} + virtual ~FeatureExtractorClock() {} + + // Returns the current time. May be mocked for testing. + virtual base::TimeTicks Now() { return base::TimeTicks::Now(); } + + private: + DISALLOW_COPY_AND_ASSIGN(FeatureExtractorClock); +}; + +} // namespace safe_browsing + +#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_ diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc index 7dbf765..db17e1d 100644 --- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc @@ -8,7 +8,10 @@ #include "base/hash_tables.h" #include "base/histogram.h" #include "base/logging.h" +#include "base/message_loop.h" +#include "base/time.h" #include "chrome/renderer/render_view.h" +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" #include "chrome/renderer/safe_browsing/features.h" #include "net/base/registry_controlled_domain.h" #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" @@ -20,6 +23,19 @@ namespace safe_browsing { +// This time should be short enough that it doesn't noticeably disrupt the +// user's interaction with the page. +const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50; + +// Experimenting shows that we get a reasonable gain in performance by +// increasing this up to around 10, but there's not much benefit in +// increasing it past that. +const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; + +// This should be longer than we expect feature extraction to take on any +// actual phishing page. +const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; + // Intermediate state used for computing features. See features.h for // descriptions of the DOM features that are computed. struct PhishingDOMFeatureExtractor::PageFeatureState { @@ -45,7 +61,13 @@ struct PhishingDOMFeatureExtractor::PageFeatureState { // How many script tags int num_script_tags; - PageFeatureState() + // The time at which we started feature extraction for the current page. + base::TimeTicks start_time; + + // The number of iterations we've done for the current extraction. + int num_iterations; + + explicit PageFeatureState(base::TimeTicks start_time_ticks) : external_links(0), secure_links(0), total_links(0), @@ -58,7 +80,9 @@ struct PhishingDOMFeatureExtractor::PageFeatureState { total_actions(0), img_other_domain(0), total_imgs(0), - num_script_tags(0) {} + num_script_tags(0), + start_time(start_time_ticks), + num_iterations(0) {} ~PageFeatureState() {} }; @@ -74,8 +98,10 @@ struct PhishingDOMFeatureExtractor::FrameData { }; PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( - RenderView* render_view) + RenderView* render_view, + FeatureExtractorClock* clock) : render_view_(render_view), + clock_(clock), ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { Clear(); } @@ -98,6 +124,8 @@ void PhishingDOMFeatureExtractor::ExtractFeatures( features_ = features; done_callback_.reset(done_callback); + + page_feature_state_.reset(new PageFeatureState(clock_->Now())); MessageLoop::current()->PostTask( FROM_HERE, method_factory_.NewRunnableMethod( @@ -111,6 +139,10 @@ void PhishingDOMFeatureExtractor::CancelPendingExtraction() { } void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { + DCHECK(page_feature_state_.get()); + ++page_feature_state_->num_iterations; + base::TimeTicks current_chunk_start_time = clock_->Now(); + if (!cur_frame_) { WebKit::WebView* web_view = render_view_->webview(); if (!web_view) { @@ -122,15 +154,21 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { return; } cur_frame_ = web_view->mainFrame(); - page_feature_state_.reset(new PageFeatureState); } + int num_elements = 0; for (; cur_frame_; cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { WebKit::WebNode cur_node; if (cur_frame_data_.get()) { // We're resuming traversal of a frame, so just advance to the next node. cur_node = cur_frame_data_->elements.nextItem(); + // When we resume the traversal, the first call to nextItem() potentially + // has to walk through the document again from the beginning, if it was + // modified between our chunks of work. Log how long this takes, so we + // can tell if it's too slow. + UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", + clock_->Now() - current_chunk_start_time); } else { // We just moved to a new frame, so update our frame state // and advance to the first element. @@ -160,8 +198,36 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { HandleScript(element); } - // TODO(bryner): stop if too much time has elapsed, and add histograms - // for the time spent processing. + if (++num_elements >= kClockCheckGranularity) { + num_elements = 0; + base::TimeTicks now = clock_->Now(); + if (now - page_feature_state_->start_time >= + base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { + DLOG(ERROR) << "Feature extraction took too long, giving up"; + // We expect this to happen infrequently, so record when it does. + UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); + RunCallback(false); + return; + } + base::TimeDelta chunk_elapsed = now - current_chunk_start_time; + if (chunk_elapsed >= + base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { + // The time limit for the current chunk is up, so post a task to + // continue extraction. + // + // Record how much time we actually spent on the chunk. If this is + // much higher than kMaxTimePerChunkMs, we may need to adjust the + // clock granularity. + UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", + chunk_elapsed); + MessageLoop::current()->PostTask( + FROM_HERE, + method_factory_.NewRunnableMethod( + &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); + return; + } + // Otherwise, continue. + } } // We're done with this frame, recalculate the FrameData when we @@ -296,6 +362,14 @@ void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { } void PhishingDOMFeatureExtractor::RunCallback(bool success) { + // Record some timing stats that we can use to evaluate feature extraction + // performance. These include both successful and failed extractions. + DCHECK(page_feature_state_.get()); + UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", + page_feature_state_->num_iterations); + UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", + clock_->Now() - page_feature_state_->start_time); + DCHECK(done_callback_.get()); done_callback_->Run(success); Clear(); diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h index bc9d599..ef0e42e 100644 --- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h @@ -26,6 +26,7 @@ class WebFrame; } namespace safe_browsing { +class FeatureExtractorClock; class FeatureMap; class PhishingDOMFeatureExtractor { @@ -36,8 +37,11 @@ class PhishingDOMFeatureExtractor { // Creates a PhishingDOMFeatureExtractor for the specified RenderView. // The PhishingDOMFeatureExtrator should be destroyed prior to destroying - // the RenderView. - explicit PhishingDOMFeatureExtractor(RenderView* render_view); + // the RenderView. |clock| is used for timing feature extractor operations, + // and may be mocked for testing. PhishingDOMFeatureExtractor takes + // ownership of the clock. + PhishingDOMFeatureExtractor(RenderView* render_view, + FeatureExtractorClock* clock); ~PhishingDOMFeatureExtractor(); // Begins extracting features into the given FeatureMap for the page @@ -57,6 +61,19 @@ class PhishingDOMFeatureExtractor { struct FrameData; struct PageFeatureState; + // The maximum amount of time that we will spend on a single extraction + // iteration before pausing to let other MessageLoop tasks run. + static const int kMaxTimePerChunkMs; + + // The number of elements that we will process before checking to see whether + // kMaxTimePerChunkMs has elapsed. Since checking the current time can be + // slow, we don't do this on every element processed. + static const int kClockCheckGranularity; + + // The maximum total amount of time that the feature extractor will run + // before giving up on the current page. + static const int kMaxTotalTimeMs; + // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs // until a predefined maximum amount of time has elapsed, then posts a task // to the current MessageLoop to continue extraction. When extraction @@ -101,6 +118,9 @@ class PhishingDOMFeatureExtractor { // Non-owned pointer to the view that we will extract features from. RenderView* render_view_; + // Owned pointer to our clock. + scoped_ptr<FeatureExtractorClock> clock_; + // The output parameters from the most recent call to ExtractFeatures(). FeatureMap* features_; // The caller keeps ownership of this. scoped_ptr<DoneCallback> done_callback_; diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc index 9eae2ef..995ba01 100644 --- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc @@ -18,6 +18,7 @@ #include "base/message_loop.h" #include "base/process.h" #include "base/string_util.h" +#include "base/time.h" #include "chrome/common/main_function_params.h" #include "chrome/common/render_messages.h" #include "chrome/common/render_messages_params.h" @@ -27,6 +28,7 @@ #include "chrome/renderer/render_view.h" #include "chrome/renderer/render_view_visitor.h" #include "chrome/renderer/renderer_main_platform_delegate.h" +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" #include "chrome/renderer/safe_browsing/features.h" #include "googleurl/src/gurl.h" #include "ipc/ipc_channel.h" @@ -40,6 +42,7 @@ #include "webkit/glue/webkit_glue.h" using ::testing::ContainerEq; +using ::testing::Return; namespace safe_browsing { @@ -63,6 +66,11 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test, } protected: + class MockClock : public FeatureExtractorClock { + public: + MOCK_METHOD0(Now, base::TimeTicks()); + }; + virtual void SetUp() { // Set up the renderer. This code is largely adapted from // render_view_test.cc and renderer_main.cc. Note that we use a @@ -98,7 +106,8 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test, ASSERT_TRUE(channel_->Send(new ViewMsg_New(params))); msg_loop_.Run(); - extractor_.reset(new PhishingDOMFeatureExtractor(view_)); + clock_ = new MockClock(); + extractor_.reset(new PhishingDOMFeatureExtractor(view_, clock_)); } virtual void TearDown() { @@ -234,6 +243,7 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test, scoped_ptr<SandboxInitWrapper> sandbox_init_wrapper_; scoped_ptr<PhishingDOMFeatureExtractor> extractor_; + MockClock* clock_; // owned by extractor_ // Map of URL -> response body for network requests from the renderer. // Any URLs not in this map are served a 404 error. std::map<std::string, std::string> responses_; @@ -243,6 +253,8 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test, int PhishingDOMFeatureExtractorTest::next_thread_id_ = 0; TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) { + // This test doesn't exercise the extraction timing. + EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); responses_["http://host.com/"] = "<html><head><body>" "<form action=\"query\"><input type=text><input type=checkbox></form>" @@ -299,6 +311,8 @@ TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) { } TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) { + // This test doesn't exercise the extraction timing. + EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); responses_["http://www.host.com/"] = "<html><head><body>" "<a href=\"http://www2.host.com/abc\">link</a>" @@ -339,6 +353,8 @@ TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) { } TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) { + // This test doesn't exercise the extraction timing. + EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); responses_["http://host.com/"] = "<html><head><script></script><script></script></head></html>"; @@ -368,6 +384,9 @@ TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) { } TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) { + // This test doesn't exercise the extraction timing. + EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now())); + // Test that features are aggregated across all frames. responses_["http://host.com/"] = "<html><body><input type=text><a href=\"info.html\">link</a>" @@ -416,7 +435,87 @@ TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) { EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); } -// TODO(bryner): Test extraction with multiple passes, including the case where -// the node we stopped on is removed from the document. +TEST_F(PhishingDOMFeatureExtractorTest, Continuation) { + // For this test, we'll cause the feature extraction to run multiple + // iterations by incrementing the clock. + + // This page has a total of 50 elements. For the external forms feature to + // be computed correctly, the extractor has to examine the whole document. + // Note: the empty HEAD is important -- WebKit will synthesize a HEAD if + // there isn't one present, which can be confusing for the element counts. + std::string response = "<html><head></head><body>" + "<form action=\"ondomain\"></form>"; + for (int i = 0; i < 45; ++i) { + response.append("<p>"); + } + response.append("<form action=\"http://host2.com/\"></form></body></html>"); + responses_["http://host.com/"] = response; + + // Advance the clock 30 ms every 10 elements processed, 10 ms between chunks. + // Note that this assumes kClockCheckGranularity = 10 and + // kMaxTimePerChunkMs = 50. + base::TimeTicks now = base::TimeTicks::Now(); + EXPECT_CALL(*clock_, Now()) + // Time check at the start of extraction. + .WillOnce(Return(now)) + // Time check at the start of the first chunk of work. + .WillOnce(Return(now)) + // Time check after the first 10 elements. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))) + // Time check after the next 10 elements. This is over the chunk + // time limit, so a continuation task will be posted. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60))) + // Time check at the start of the second chunk of work. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70))) + // Time check after resuming iteration for the second chunk. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(72))) + // Time check after the next 10 elements. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100))) + // Time check after the next 10 elements. This will trigger another + // continuation task. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(130))) + // Time check at the start of the third chunk of work. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(140))) + // Time check after resuming iteration for the third chunk. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(142))) + // Time check after the last 10 elements. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(170))) + // A final time check for the histograms. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(180))); + + FeatureMap expected_features; + expected_features.AddBooleanFeature(features::kPageHasForms); + expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5); + + FeatureMap features; + LoadURL("http://host.com/"); + ASSERT_TRUE(ExtractFeatures(&features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + // Make sure none of the mock expectations carry over to the next test. + ::testing::Mock::VerifyAndClearExpectations(clock_); + + // Now repeat the test with the same page, but advance the clock faster so + // that the extraction time exceeds the maximum total time for the feature + // extractor. Extraction should fail. Note that this assumes + // kMaxTotalTimeMs = 500. + EXPECT_CALL(*clock_, Now()) + // Time check at the start of extraction. + .WillOnce(Return(now)) + // Time check at the start of the first chunk of work. + .WillOnce(Return(now)) + // Time check after the first 10 elements. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) + // Time check at the start of the second chunk of work. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) + // Time check after resuming iteration for the second chunk. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(360))) + // Time check after the next 10 elements. This is over the limit. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) + // A final time check for the histograms. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); + + features.Clear(); + EXPECT_FALSE(ExtractFeatures(&features)); +} } // namespace safe_browsing |