5 files changed, 234 insertions, 11 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index f101c1e..b47adea 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -181,6 +181,7 @@
         'renderer/renderer_webstoragearea_impl.h',
         'renderer/renderer_webstoragenamespace_impl.cc',
         'renderer/renderer_webstoragenamespace_impl.h',
+        'renderer/safe_browsing/feature_extractor_clock.h',
         'renderer/safe_browsing/features.cc',
         'renderer/safe_browsing/features.h',
         'renderer/safe_browsing/phishing_dom_feature_extractor.cc',
diff --git a/chrome/renderer/safe_browsing/feature_extractor_clock.h b/chrome/renderer/safe_browsing/feature_extractor_clock.h
new file mode 100644
index 0000000..0f5cd37
--- /dev/null
+++ b/chrome/renderer/safe_browsing/feature_extractor_clock.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// A simple abstraction for getting the current time during feature extraction.
+// This can be mocked out for testing.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_
+#define CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_
+
+#include "base/time.h"
+
+namespace safe_browsing {
+
+class FeatureExtractorClock {
+ public:
+  FeatureExtractorClock() {}
+  virtual ~FeatureExtractorClock() {}
+
+  // Returns the current time.  May be mocked for testing.
+  virtual base::TimeTicks Now() { return base::TimeTicks::Now(); }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(FeatureExtractorClock);
+};
+
+}  // namespace safe_browsing
+
+#endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURE_EXTRACTOR_CLOCK_H_
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
index 7dbf765..db17e1d 100644
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc
@@ -8,7 +8,10 @@
 #include "base/hash_tables.h"
 #include "base/histogram.h"
 #include "base/logging.h"
+#include "base/message_loop.h"
+#include "base/time.h"
 #include "chrome/renderer/render_view.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "net/base/registry_controlled_domain.h"
 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
@@ -20,6 +23,19 @@
 
 namespace safe_browsing {
 
+// This time should be short enough that it doesn't noticeably disrupt the
+// user's interaction with the page.
+const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50;
+
+// Experimenting shows that we get a reasonable gain in performance by
+// increasing this up to around 10, but there's not much benefit in
+// increasing it past that.
+const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
+
+// This should be longer than we expect feature extraction to take on any
+// actual phishing page.
+const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
+
 // Intermediate state used for computing features.  See features.h for
 // descriptions of the DOM features that are computed.
 struct PhishingDOMFeatureExtractor::PageFeatureState {
@@ -45,7 +61,13 @@ struct PhishingDOMFeatureExtractor::PageFeatureState {
   // How many script tags
   int num_script_tags;
 
-  PageFeatureState()
+  // The time at which we started feature extraction for the current page.
+  base::TimeTicks start_time;
+
+  // The number of iterations we've done for the current extraction.
+  int num_iterations;
+
+  explicit PageFeatureState(base::TimeTicks start_time_ticks)
       : external_links(0),
         secure_links(0),
         total_links(0),
@@ -58,7 +80,9 @@ struct PhishingDOMFeatureExtractor::PageFeatureState {
         total_actions(0),
         img_other_domain(0),
         total_imgs(0),
-        num_script_tags(0) {}
+        num_script_tags(0),
+        start_time(start_time_ticks),
+        num_iterations(0) {}
 
   ~PageFeatureState() {}
 };
@@ -74,8 +98,10 @@ struct PhishingDOMFeatureExtractor::FrameData {
 };
 
 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
-    RenderView* render_view)
+    RenderView* render_view,
+    FeatureExtractorClock* clock)
     : render_view_(render_view),
+      clock_(clock),
       ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
   Clear();
 }
@@ -98,6 +124,8 @@ void PhishingDOMFeatureExtractor::ExtractFeatures(
 
   features_ = features;
   done_callback_.reset(done_callback);
+
+  page_feature_state_.reset(new PageFeatureState(clock_->Now()));
   MessageLoop::current()->PostTask(
       FROM_HERE,
       method_factory_.NewRunnableMethod(
@@ -111,6 +139,10 @@ void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
 }
 
 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
+  DCHECK(page_feature_state_.get());
+  ++page_feature_state_->num_iterations;
+  base::TimeTicks current_chunk_start_time = clock_->Now();
+
   if (!cur_frame_) {
     WebKit::WebView* web_view = render_view_->webview();
     if (!web_view) {
@@ -122,15 +154,21 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
       return;
     }
     cur_frame_ = web_view->mainFrame();
-    page_feature_state_.reset(new PageFeatureState);
   }
 
+  int num_elements = 0;
   for (; cur_frame_;
        cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
     WebKit::WebNode cur_node;
     if (cur_frame_data_.get()) {
       // We're resuming traversal of a frame, so just advance to the next node.
       cur_node = cur_frame_data_->elements.nextItem();
+      // When we resume the traversal, the first call to nextItem() potentially
+      // has to walk through the document again from the beginning, if it was
+      // modified between our chunks of work.  Log how long this takes, so we
+      // can tell if it's too slow.
+      UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
+                          clock_->Now() - current_chunk_start_time);
     } else {
       // We just moved to a new frame, so update our frame state
       // and advance to the first element.
@@ -160,8 +198,36 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
         HandleScript(element);
       }
 
-      // TODO(bryner): stop if too much time has elapsed, and add histograms
-      // for the time spent processing.
+      if (++num_elements >= kClockCheckGranularity) {
+        num_elements = 0;
+        base::TimeTicks now = clock_->Now();
+        if (now - page_feature_state_->start_time >=
+            base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
+          DLOG(ERROR) << "Feature extraction took too long, giving up";
+          // We expect this to happen infrequently, so record when it does.
+          UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
+          RunCallback(false);
+          return;
+        }
+        base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
+        if (chunk_elapsed >=
+            base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
+          // The time limit for the current chunk is up, so post a task to
+          // continue extraction.
+          //
+          // Record how much time we actually spent on the chunk. If this is
+          // much higher than kMaxTimePerChunkMs, we may need to adjust the
+          // clock granularity.
+          UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
+                              chunk_elapsed);
+          MessageLoop::current()->PostTask(
+              FROM_HERE,
+              method_factory_.NewRunnableMethod(
+                  &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
+          return;
+        }
+        // Otherwise, continue.
+      }
     }
 
     // We're done with this frame, recalculate the FrameData when we
@@ -296,6 +362,14 @@ void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
 }
 
 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
+  // Record some timing stats that we can use to evaluate feature extraction
+  // performance.  These include both successful and failed extractions.
+  DCHECK(page_feature_state_.get());
+  UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
+                       page_feature_state_->num_iterations);
+  UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
+                      clock_->Now() - page_feature_state_->start_time);
+
   DCHECK(done_callback_.get());
   done_callback_->Run(success);
   Clear();
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
index bc9d599..ef0e42e 100644
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h
@@ -26,6 +26,7 @@ class WebFrame;
 }
 
 namespace safe_browsing {
+class FeatureExtractorClock;
 class FeatureMap;
 
 class PhishingDOMFeatureExtractor {
@@ -36,8 +37,11 @@ class PhishingDOMFeatureExtractor {
 
   // Creates a PhishingDOMFeatureExtractor for the specified RenderView.
   // The PhishingDOMFeatureExtrator should be destroyed prior to destroying
-  // the RenderView.
-  explicit PhishingDOMFeatureExtractor(RenderView* render_view);
+  // the RenderView.  |clock| is used for timing feature extractor operations,
+  // and may be mocked for testing.  PhishingDOMFeatureExtractor takes
+  // ownership of the clock.
+  PhishingDOMFeatureExtractor(RenderView* render_view,
+                              FeatureExtractorClock* clock);
   ~PhishingDOMFeatureExtractor();
 
   // Begins extracting features into the given FeatureMap for the page
@@ -57,6 +61,19 @@ class PhishingDOMFeatureExtractor {
   struct FrameData;
   struct PageFeatureState;
 
+  // The maximum amount of time that we will spend on a single extraction
+  // iteration before pausing to let other MessageLoop tasks run.
+  static const int kMaxTimePerChunkMs;
+
+  // The number of elements that we will process before checking to see whether
+  // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
+  // slow, we don't do this on every element processed.
+  static const int kClockCheckGranularity;
+
+  // The maximum total amount of time that the feature extractor will run
+  // before giving up on the current page.
+  static const int kMaxTotalTimeMs;
+
   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
   // until a predefined maximum amount of time has elapsed, then posts a task
   // to the current MessageLoop to continue extraction.  When extraction
@@ -101,6 +118,9 @@ class PhishingDOMFeatureExtractor {
   // Non-owned pointer to the view that we will extract features from.
   RenderView* render_view_;
 
+  // Owned pointer to our clock.
+  scoped_ptr<FeatureExtractorClock> clock_;
+
   // The output parameters from the most recent call to ExtractFeatures().
   FeatureMap* features_;  // The caller keeps ownership of this.
   scoped_ptr<DoneCallback> done_callback_;
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc
index 9eae2ef..995ba01 100644
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor_browsertest.cc
@@ -18,6 +18,7 @@
 #include "base/message_loop.h"
 #include "base/process.h"
 #include "base/string_util.h"
+#include "base/time.h"
 #include "chrome/common/main_function_params.h"
 #include "chrome/common/render_messages.h"
 #include "chrome/common/render_messages_params.h"
@@ -27,6 +28,7 @@
 #include "chrome/renderer/render_view.h"
 #include "chrome/renderer/render_view_visitor.h"
 #include "chrome/renderer/renderer_main_platform_delegate.h"
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "googleurl/src/gurl.h"
 #include "ipc/ipc_channel.h"
@@ -40,6 +42,7 @@
 #include "webkit/glue/webkit_glue.h"
 
 using ::testing::ContainerEq;
+using ::testing::Return;
 
 namespace safe_browsing {
 
@@ -63,6 +66,11 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
   }
 
  protected:
+  class MockClock : public FeatureExtractorClock {
+   public:
+    MOCK_METHOD0(Now, base::TimeTicks());
+  };
+
   virtual void SetUp() {
     // Set up the renderer.  This code is largely adapted from
     // render_view_test.cc and renderer_main.cc.  Note that we use a
@@ -98,7 +106,8 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
     ASSERT_TRUE(channel_->Send(new ViewMsg_New(params)));
     msg_loop_.Run();
 
-    extractor_.reset(new PhishingDOMFeatureExtractor(view_));
+    clock_ = new MockClock();
+    extractor_.reset(new PhishingDOMFeatureExtractor(view_, clock_));
   }
 
   virtual void TearDown() {
@@ -234,6 +243,7 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
   scoped_ptr<SandboxInitWrapper> sandbox_init_wrapper_;
 
   scoped_ptr<PhishingDOMFeatureExtractor> extractor_;
+  MockClock* clock_;  // owned by extractor_
   // Map of URL -> response body for network requests from the renderer.
   // Any URLs not in this map are served a 404 error.
   std::map<std::string, std::string> responses_;
@@ -243,6 +253,8 @@ class PhishingDOMFeatureExtractorTest : public ::testing::Test,
 int PhishingDOMFeatureExtractorTest::next_thread_id_ = 0;
 
 TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
+  // This test doesn't exercise the extraction timing.
+  EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
   responses_["http://host.com/"] =
       "<html><head><body>"
       "<form action=\"query\"><input type=text><input type=checkbox></form>"
@@ -299,6 +311,8 @@ TEST_F(PhishingDOMFeatureExtractorTest, FormFeatures) {
 }
 
 TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
+  // This test doesn't exercise the extraction timing.
+  EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
   responses_["http://www.host.com/"] =
       "<html><head><body>"
       "<a href=\"http://www2.host.com/abc\">link</a>"
@@ -339,6 +353,8 @@ TEST_F(PhishingDOMFeatureExtractorTest, LinkFeatures) {
 }
 
 TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
+  // This test doesn't exercise the extraction timing.
+  EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
   responses_["http://host.com/"] =
       "<html><head><script></script><script></script></head></html>";
 
@@ -368,6 +384,9 @@ TEST_F(PhishingDOMFeatureExtractorTest, ScriptAndImageFeatures) {
 }
 
 TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
+  // This test doesn't exercise the extraction timing.
+  EXPECT_CALL(*clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
+
   // Test that features are aggregated across all frames.
   responses_["http://host.com/"] =
       "<html><body><input type=text><a href=\"info.html\">link</a>"
@@ -416,7 +435,87 @@ TEST_F(PhishingDOMFeatureExtractorTest, SubFrames) {
   EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 }
 
-// TODO(bryner): Test extraction with multiple passes, including the case where
-// the node we stopped on is removed from the document.
+TEST_F(PhishingDOMFeatureExtractorTest, Continuation) {
+  // For this test, we'll cause the feature extraction to run multiple
+  // iterations by incrementing the clock.
+
+  // This page has a total of 50 elements.  For the external forms feature to
+  // be computed correctly, the extractor has to examine the whole document.
+  // Note: the empty HEAD is important -- WebKit will synthesize a HEAD if
+  // there isn't one present, which can be confusing for the element counts.
+  std::string response = "<html><head></head><body>"
+      "<form action=\"ondomain\"></form>";
+  for (int i = 0; i < 45; ++i) {
+    response.append("<p>");
+  }
+  response.append("<form action=\"http://host2.com/\"></form></body></html>");
+  responses_["http://host.com/"] = response;
+
+  // Advance the clock 30 ms every 10 elements processed, 10 ms between chunks.
+  // Note that this assumes kClockCheckGranularity = 10 and
+  // kMaxTimePerChunkMs = 50.
+  base::TimeTicks now = base::TimeTicks::Now();
+  EXPECT_CALL(*clock_, Now())
+      // Time check at the start of extraction.
+      .WillOnce(Return(now))
+      // Time check at the start of the first chunk of work.
+      .WillOnce(Return(now))
+      // Time check after the first 10 elements.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)))
+      // Time check after the next 10 elements.  This is over the chunk
+      // time limit, so a continuation task will be posted.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60)))
+      // Time check at the start of the second chunk of work.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70)))
+      // Time check after resuming iteration for the second chunk.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(72)))
+      // Time check after the next 10 elements.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100)))
+      // Time check after the next 10 elements.  This will trigger another
+      // continuation task.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(130)))
+      // Time check at the start of the third chunk of work.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(140)))
+      // Time check after resuming iteration for the third chunk.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(142)))
+      // Time check after the last 10 elements.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(170)))
+      // A final time check for the histograms.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(180)));
+
+  FeatureMap expected_features;
+  expected_features.AddBooleanFeature(features::kPageHasForms);
+  expected_features.AddRealFeature(features::kPageActionOtherDomainFreq, 0.5);
+
+  FeatureMap features;
+  LoadURL("http://host.com/");
+  ASSERT_TRUE(ExtractFeatures(&features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+  // Make sure none of the mock expectations carry over to the next test.
+  ::testing::Mock::VerifyAndClearExpectations(clock_);
+
+  // Now repeat the test with the same page, but advance the clock faster so
+  // that the extraction time exceeds the maximum total time for the feature
+  // extractor.  Extraction should fail.  Note that this assumes
+  // kMaxTotalTimeMs = 500.
+  EXPECT_CALL(*clock_, Now())
+      // Time check at the start of extraction.
+      .WillOnce(Return(now))
+      // Time check at the start of the first chunk of work.
+      .WillOnce(Return(now))
+      // Time check after the first 10 elements.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
+      // Time check at the start of the second chunk of work.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
+      // Time check after resuming iteration for the second chunk.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(360)))
+      // Time check after the next 10 elements.  This is over the limit.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
+      // A final time check for the histograms.
+      .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
+
+  features.Clear();
+  EXPECT_FALSE(ExtractFeatures(&features));
+}
 
 }  // namespace safe_browsing