summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--chrome/chrome_renderer.gypi1
-rw-r--r--chrome/renderer/safe_browsing/feature_extractor_clock.h29
-rw-r--r--chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc86
-rw-r--r--chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h24
-rw-r--r--chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc105
5 files changed, 234 insertions, 11 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index f101c1e..b47adea 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -181,6 +181,7 @@
'renderer/renderer_webstoragearea_impl.h',
'renderer/renderer_webstoragenamespace_impl.cc',
'renderer/renderer_webstoragenamespace_impl.h',
+ 'renderer/safe_browsing/feature_extractor_clock.h',
'renderer/safe_browsing/features.cc',
'renderer/safe_browsing/features.h',
'renderer/safe_browsing/phishing_dom_feature_extractor.cc',
diff --git a/chrome/renderer/safe_browsing/feature_extractor_clock.h b/chrome/renderer/safe_browsing/feature_extractor_clock.h
new file mode 100644
index 0000000..0f5cd37
--- /dev/null
+++ b/chrome/renderer/safe_browsing/feature_extractor_clock.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// A simple abstraction for getting the current time during feature extraction.
+// This can be mocked out for testing.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_
+#define CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_
+
+#include "base/time.h"
+
+namespace safe_browsing {
+
+class FeatureExtractorClock {
+ public:
+ FeatureExtractorClock() {}
+ virtual ~FeatureExtractorClock() {}
+
+ // Returns the current time. May be mocked for testing.
+ virtual base::TimeTicks Now() { return base::TimeTicks::Now(); }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(FeatureExtractorClock);
+};
+
+} // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
index 7dbf765..db17e1d 100644
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
@@ -8,7 +8,10 @@
#include "base/hash_tables.h"
#include "base/histogram.h"
#include "base/logging.h"
+#include "base/message_loop.h"
+#include "base/time.h"
#include "chrome/renderer/render_view.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "net/base/registry_controlled_domain.h"
#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
@@ -20,6 +23,19 @@
namespace safe_browsing {
+// This time should be short enough that it doesn't noticeably disrupt the
+// user's interaction with the page.
+const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50;
+
+// Experimenting shows that we get a reasonable gain in performance by
+// increasing this up to around 10, but there's not much benefit in
+// increasing it past that.
+const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
+
+// This should be longer than we expect feature extraction to take on any
+// actual phishing page.
+const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
+
// Intermediate state used for computing features. See features.h for
// descriptions of the DOM features that are computed.
struct PhishingDOMFeatureExtractor::PageFeatureState {
@@ -45,7 +61,13 @@ struct PhishingDOMFeatureExtractor::PageFeatureState {
// How many script tags
int num_script_tags;
- PageFeatureState()
+ // The time at which we started feature extraction for the current page.
+ base::TimeTicks start_time;
+
+ // The number of iterations we've done for the current extraction.
+ int num_iterations;
+
+ explicit PageFeatureState(base::TimeTicks start_time_ticks)
: external_links(0),
secure_links(0),
total_links(0),
@@ -58,7 +80,9 @@ struct PhishingDOMFeatureExtractor::PageFeatureState {
total_actions(0),
img_other_domain(0),
total_imgs(0),
- num_script_tags(0) {}
+ num_script_tags(0),
+ start_time(start_time_ticks),
+ num_iterations(0) {}
~PageFeatureState() {}
};
@@ -74,8 +98,10 @@ struct PhishingDOMFeatureExtractor::FrameData {
};
PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
- RenderView* render_view)
+ RenderView* render_view,
+ FeatureExtractorClock* clock)
: render_view_(render_view),
+ clock_(clock),
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
Clear();
}
@@ -98,6 +124,8 @@ void PhishingDOMFeatureExtractor::ExtractFeatures(
features_ = features;
done_callback_.reset(done_callback);
+
+ page_feature_state_.reset(new PageFeatureState(clock_->Now()));
MessageLoop::current()->PostTask(
FROM_HERE,
method_factory_.NewRunnableMethod(
@@ -111,6 +139,10 @@ void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
}
void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
+ DCHECK(page_feature_state_.get());
+ ++page_feature_state_->num_iterations;
+ base::TimeTicks current_chunk_start_time = clock_->Now();
+
if (!cur_frame_) {
WebKit::WebView* web_view = render_view_->webview();
if (!web_view) {
@@ -122,15 +154,21 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
return;
}
cur_frame_ = web_view->mainFrame();
- page_feature_state_.reset(new PageFeatureState);
}
+ int num_elements = 0;
for (; cur_frame_;
cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
WebKit::WebNode cur_node;
if (cur_frame_data_.get()) {
// We're resuming traversal of a frame, so just advance to the next node.
cur_node = cur_frame_data_->elements.nextItem();
+ // When we resume the traversal, the first call to nextItem() potentially
+ // has to walk through the document again from the beginning, if it was
+ // modified between our chunks of work. Log how long this takes, so we
+ // can tell if it's too slow.
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
+ clock_->Now() - current_chunk_start_time);
} else {
// We just moved to a new frame, so update our frame state
// and advance to the first element.
@@ -160,8 +198,36 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
HandleScript(element);
}
- // TODO(bryner): stop if too much time has elapsed, and add histograms
- // for the time spent processing.
+ if (++num_elements >= kClockCheckGranularity) {
+ num_elements = 0;
+ base::TimeTicks now = clock_->Now();
+ if (now - page_feature_state_->start_time >=
+ base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
+ DLOG(ERROR) << "Feature extraction took too long, giving up";
+ // We expect this to happen infrequently, so record when it does.
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
+ RunCallback(false);
+ return;
+ }
+ base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
+ if (chunk_elapsed >=
+ base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
+ // The time limit for the current chunk is up, so post a task to
+ // continue extraction.
+ //
+ // Record how much time we actually spent on the chunk. If this is
+ // much higher than kMaxTimePerChunkMs, we may need to adjust the
+ // clock granularity.
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
+ chunk_elapsed);
+ MessageLoop::current()->PostTask(
+ FROM_HERE,
+ method_factory_.NewRunnableMethod(
+ &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
+ return;
+ }
+ // Otherwise, continue.
+ }
}
// We're done with this frame, recalculate the FrameData when we
@@ -296,6 +362,14 @@ void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
}
void PhishingDOMFeatureExtractor::RunCallback(bool success) {
+ // Record some timing stats that we can use to evaluate feature extraction
+ // performance. These include both successful and failed extractions.
+ DCHECK(page_feature_state_.get());
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
+ page_feature_state_->num_iterations);
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
+ clock_->Now() - page_feature_state_->start_time);
+
DCHECK(done_callback_.get());
done_callback_->Run(success);
Clear();
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
index bc9d599..ef0e42e 100644
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
@@ -26,6 +26,7 @@ class WebFrame;
}
namespace safe_browsing {
+class FeatureExtractorClock;
class FeatureMap;
class PhishingDOMFeatureExtractor {
@@ -36,8 +37,11 @@ class PhishingDOMFeatureExtractor {
// Creates a PhishingDOMFeatureExtractor for the specified RenderView.
// The PhishingDOMFeatureExtrator should be destroyed prior to destroying
- // the RenderView.
- explicit PhishingDOMFeatureExtractor(RenderView* render_view);
+ // the RenderView. |clock| is used for timing feature extractor operations,
+ // and may be mocked for testing. PhishingDOMFeatureExtractor takes
+ // ownership of the clock.
+ PhishingDOMFeatureExtractor(RenderView* render_view,
+ FeatureExtractorClock* clock);
~PhishingDOMFeatureExtractor();
// Begins extracting features into the given FeatureMap for the page
@@ -57,6 +61,19 @@ class PhishingDOMFeatureExtractor {
struct FrameData;
struct PageFeatureState;
+ // The maximum amount of time that we will spend on a single extraction
+ // iteration before pausing to let other MessageLoop tasks run.
+ static const int kMaxTimePerChunkMs;
+
+ // The number of elements that we will process before checking to see whether
+ // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
+ // slow, we don't do this on every element processed.
+ static const int kClockCheckGranularity;
+
+ // The maximum total amount of time that the feature extractor will run
+ // before giving up on the current page.
+ static const int kMaxTotalTimeMs;
+
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
// until a predefined maximum amount of time has elapsed, then posts a task
// to the current MessageLoop to continue extraction. When extraction
@@ -101,6 +118,9 @@ class PhishingDOMFeatureExtractor {
// Non-owned pointer to the view that we will extract features from.
RenderView* render_view_;
+ // Owned pointer to our clock.
+ scoped_ptr<FeatureExtractorClock> clock_;
+
// The output parameters from the most recent call to ExtractFeatures().
FeatureMap* features_; // The caller keeps ownership of this.
scoped_ptr<DoneCallback> done_callback_;
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc
index 9eae2ef..995ba01 100644
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc
@@ -18,6 +18,7 @@
#include "base/message_loop.h"
#include "base/process.h"
#include "base/string_util.h"
+#include "base/time.h"
#include "chrome/common/main_function_params.h"
#include "chrome/common/render_messages.h"
#include "chrome/common/render_messages_params.h"
@@ -27,6 +28,7 @@
#include "chrome/renderer/render_view.h"
#include "chrome/renderer/render_view_visitor.h"
#include "chrome/renderer/renderer_main_platform_delegate.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "googleurl/src/gurl.h"
#include "ipc/ipc_channel.h"
@@ -40,6 +42,7 @@
#include "webkit/glue/webkit_glue.h"
using ::testing::ContainerEq;
+using ::testing::Return;
namespace safe_browsing {
@@ -63,6 +66,11 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
}
protected:
+ class MockClock : public FeatureExtractorClock {
+ public:
+ MOCK_METHOD0(Now, base::TimeTicks());
+ };
+
virtual void SetUp() {
// Set up the renderer. This code is largely adapted from
// render_view_test.cc and renderer_main.cc. Note that we use a
@@ -98,7 +106,8 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
ASSERT_TRUE(channel_->Send(new ViewMsg_New(params)));
msg_loop_.Run();
- extractor_.reset(new PhishingDOMFeatureExtractor(view_));
+ clock_ = new MockClock();
+ extractor_.reset(new PhishingDOMFeatureExtractor(view_, clock_));
}
virtual void TearDown() {
@@ -234,6 +243,7 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
scoped_ptr<SandboxInitWrapper> sandbox_init_wrapper_;
scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
+ MockClock* clock_; // owned by extractor_
// Map of URL -> response body for network requests from the renderer.
// Any URLs not in this map are served a 404 error.
std::map<std::string, std::string> responses_;
@@ -243,6 +253,8 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
int PhishingDOMFeatureExtractorTest::next_thread_id_ = 0;
TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
+ // This test doesn't exercise the extraction timing.
+ EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
responses_["http://host.com/"] =
"<html><head><body>"
"<form action=\"query\"><input type=text><input type=checkbox></form>"
@@ -299,6 +311,8 @@ TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
}
TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
+ // This test doesn't exercise the extraction timing.
+ EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
responses_["http://www.host.com/"] =
"<html><head><body>"
"<a href=\"http://www2.host.com/abc\">link</a>"
@@ -339,6 +353,8 @@ TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
}
TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
+ // This test doesn't exercise the extraction timing.
+ EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
responses_["http://host.com/"] =
"<html><head><script></script><script></script></head></html>";
@@ -368,6 +384,9 @@ TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
}
TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
+ // This test doesn't exercise the extraction timing.
+ EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
+
// Test that features are aggregated across all frames.
responses_["http://host.com/"] =
"<html><body><input type=text><a href=\"info.html\">link</a>"
@@ -416,7 +435,87 @@ TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
}
-// TODO(bryner): Test extraction with multiple passes, including the case where
-// the node we stopped on is removed from the document.
+TEST_F(PhishingDOMFeatureExtractorTest, Continuation) {
+ // For this test, we'll cause the feature extraction to run multiple
+ // iterations by incrementing the clock.
+
+ // This page has a total of 50 elements. For the external forms feature to
+ // be computed correctly, the extractor has to examine the whole document.
+ // Note: the empty HEAD is important -- WebKit will synthesize a HEAD if
+ // there isn't one present, which can be confusing for the element counts.
+ std::string response = "<html><head></head><body>"
+ "<form action=\"ondomain\"></form>";
+ for (int i = 0; i < 45; ++i) {
+ response.append("<p>");
+ }
+ response.append("<form action=\"http://host2.com/\"></form></body></html>");
+ responses_["http://host.com/"] = response;
+
+ // Advance the clock 30 ms every 10 elements processed, 10 ms between chunks.
+ // Note that this assumes kClockCheckGranularity = 10 and
+ // kMaxTimePerChunkMs = 50.
+ base::TimeTicks now = base::TimeTicks::Now();
+ EXPECT_CALL(*clock_, Now())
+ // Time check at the start of extraction.
+ .WillOnce(Return(now))
+ // Time check at the start of the first chunk of work.
+ .WillOnce(Return(now))
+ // Time check after the first 10 elements.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
+ // Time check after the next 10 elements. This is over the chunk
+ // time limit, so a continuation task will be posted.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))
+ // Time check at the start of the second chunk of work.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))
+ // Time check after resuming iteration for the second chunk.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(72)))
+ // Time check after the next 10 elements.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))
+ // Time check after the next 10 elements. This will trigger another
+ // continuation task.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(130)))
+ // Time check at the start of the third chunk of work.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(140)))
+ // Time check after resuming iteration for the third chunk.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(142)))
+ // Time check after the last 10 elements.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(170)))
+ // A final time check for the histograms.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(180)));
+
+ FeatureMap expected_features;
+ expected_features.AddBooleanFeature(features::kPageHasForms);
+ expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
+
+ FeatureMap features;
+ LoadURL("http://host.com/");
+ ASSERT_TRUE(ExtractFeatures(&features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+ // Make sure none of the mock expectations carry over to the next test.
+ ::testing::Mock::VerifyAndClearExpectations(clock_);
+
+ // Now repeat the test with the same page, but advance the clock faster so
+ // that the extraction time exceeds the maximum total time for the feature
+ // extractor. Extraction should fail. Note that this assumes
+ // kMaxTotalTimeMs = 500.
+ EXPECT_CALL(*clock_, Now())
+ // Time check at the start of extraction.
+ .WillOnce(Return(now))
+ // Time check at the start of the first chunk of work.
+ .WillOnce(Return(now))
+ // Time check after the first 10 elements.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
+ // Time check at the start of the second chunk of work.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
+ // Time check after resuming iteration for the second chunk.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(360)))
+ // Time check after the next 10 elements. This is over the limit.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
+ // A final time check for the histograms.
+ .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
+
+ features.Clear();
+ EXPECT_FALSE(ExtractFeatures(&features));
+}
} // namespace safe_browsing