Send the ref as part of the client-side phishing detection pingback.

This will help determine whether differing scores on the same URL are related to data that is passed through the ref (as in Google Instant queries). Also add a test for the url feature extractor to verify that the ref is not used to form any token features. BUG=none TEST=PhishingClassifierDelegateTest,PhishingUrlFeatureExtractorTest Review URL: http://codereview.chromium.org/6812009 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@80869 0039d316-1c4b-4281-b951-d872f2087c98
author: bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-07 23:46:43 +0000
committer: bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-07 23:46:43 +0000
commit: c90aeec875b69a034e64240b3d2892302a4fdc59 (patch)
tree: 5476f7a7cc230df41b32a2b5bd5a61324f6e0598
parent: 70e982ad39c6aae31a71d02bf5c6e848d538c576 (diff)
download: chromium_src-c90aeec875b69a034e64240b3d2892302a4fdc59.zip
chromium_src-c90aeec875b69a034e64240b3d2892302a4fdc59.tar.gz
chromium_src-c90aeec875b69a034e64240b3d2892302a4fdc59.tar.bz2
4 files changed, 28 insertions, 13 deletions
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc b/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc
index 39fa50c..d9c3174 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc
@@ -144,7 +144,7 @@ void PhishingClassifierDelegate::PageCaptured(const string16& page_text,
     return;
   }
   last_finished_load_id_ = render_view()->page_id();
-  last_finished_load_url_ = StripToplevelUrl();
+  last_finished_load_url_ = GetToplevelUrl();
   classifier_page_text_ = page_text;
   MaybeStartClassification();
 }
@@ -182,8 +182,8 @@ void PhishingClassifierDelegate::ClassificationDone(bool is_phishy,
       phishy_score));
 }
 
-GURL PhishingClassifierDelegate::StripToplevelUrl() {
-  return StripRef(render_view()->webview()->mainFrame()->url());
+GURL PhishingClassifierDelegate::GetToplevelUrl() {
+  return render_view()->webview()->mainFrame()->url();
 }
 
 void PhishingClassifierDelegate::MaybeStartClassification() {
@@ -220,9 +220,10 @@ void PhishingClassifierDelegate::MaybeStartClassification() {
     return;
   }
   // If the page id is unchanged, the toplevel URL should also be unchanged.
-  DCHECK_EQ(StripToplevelUrl(), last_finished_load_url_);
+  GURL stripped_last_load_url(StripRef(last_finished_load_url_));
+  DCHECK_EQ(StripRef(GetToplevelUrl()), stripped_last_load_url);
 
-  if (last_finished_load_url_ == last_url_sent_to_classifier_) {
+  if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
     // We've already classified this toplevel URL, so this was likely an
     // in-page navigation or a subframe navigation.  The browser should not
     // send a StartPhishingDetection IPC in this case.
@@ -231,9 +232,11 @@ void PhishingClassifierDelegate::MaybeStartClassification() {
     return;
   }
 
-  if (last_url_received_from_browser_ != last_finished_load_url_) {
+  if (last_url_received_from_browser_ != stripped_last_load_url) {
     // The browser has not yet confirmed that this URL should be classified,
-    // so defer classification for now.
+    // so defer classification for now.  Note: the ref does not affect
+    // any of the browser's preclassification checks, so we don't require it
+    // to match.
     VLOG(2) << "Not starting classification, last url from browser is "
             << last_url_received_from_browser_ << ", last finished load is "
             << last_finished_load_url_;
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_delegate.h b/chrome/renderer/safe_browsing/phishing_classifier_delegate.h
index 8907bee..04309ba 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_delegate.h
+++ b/chrome/renderer/safe_browsing/phishing_classifier_delegate.h
@@ -70,8 +70,8 @@ class PhishingClassifierDelegate : public RenderViewObserver {
   // Called when classification for the current page finishes.
   void ClassificationDone(bool is_phishy, double phishy_score);
 
-  // Returns the RenderView's toplevel URL, with the ref stripped.
-  GURL StripToplevelUrl();
+  // Returns the RenderView's toplevel URL.
+  GURL GetToplevelUrl();
 
   // Shared code to begin classification if all conditions are met.
   void MaybeStartClassification();
@@ -80,7 +80,8 @@ class PhishingClassifierDelegate : public RenderViewObserver {
   // a scorer is made available via SetPhishingScorer().
   scoped_ptr<PhishingClassifier> classifier_;
 
-  // The last URL that the browser instructed us to classify.
+  // The last URL that the browser instructed us to classify,
+  // with the ref stripped.
   GURL last_url_received_from_browser_;
 
   // The last URL and page id that have finished loading in the RenderView.
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc b/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc
index 5af67bd..305e907 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc
@@ -321,19 +321,19 @@ TEST_F(PhishingClassifierDelegateTest, DetectedPhishingSite) {
   // Start by loading a page to populate the delegate's state.
   responses_["http://host.com/"] = "<html><body>phish</body></html>";
   EXPECT_CALL(*classifier, CancelPendingClassification());
-  LoadURL("http://host.com/");
+  LoadURL("http://host.com/#a");
   Mock::VerifyAndClearExpectations(classifier);
   string16 page_text = ASCIIToUTF16("phish");
   EXPECT_CALL(*classifier, BeginClassification(Pointee(page_text), _)).
       WillOnce(DeleteArg<1>());
-  OnStartPhishingDetection(delegate, GURL("http://host.com/"));
+  OnStartPhishingDetection(delegate, GURL("http://host.com/#a"));
   delegate->PageCaptured(page_text, false);
   Mock::VerifyAndClearExpectations(classifier);
 
   // Now run the callback to simulate the classifier finishing.
   RunClassificationDone(delegate, true, 0.8);
   EXPECT_TRUE(detected_phishing_site_);
-  EXPECT_EQ(GURL("http://host.com/"), detected_url_);
+  EXPECT_EQ(GURL("http://host.com/#a"), detected_url_);
   EXPECT_EQ(0.8, detected_score_);
 
   // The delegate will cancel pending classification on destruction.
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
index 48e324f..a07f3ca 100644
--- a/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
@@ -73,6 +73,17 @@ TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
   ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
   EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
 
+  url = "http://witharef.com/#abc";
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kUrlTldToken +
+                                      std::string("com"));
+  expected_features.AddBooleanFeature(features::kUrlDomainToken +
+                                      std::string("witharef"));
+
+  features.Clear();
+  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
   url = "http://...www..lotsodots....com./";
   expected_features.Clear();
   expected_features.AddBooleanFeature(features::kUrlTldToken +
author	bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-04-07 23:46:43 +0000
committer	bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-04-07 23:46:43 +0000
commit	c90aeec875b69a034e64240b3d2892302a4fdc59 (patch)
tree	5476f7a7cc230df41b32a2b5bd5a61324f6e0598
parent	70e982ad39c6aae31a71d02bf5c6e848d538c576 (diff)
download	chromium_src-c90aeec875b69a034e64240b3d2892302a4fdc59.zip chromium_src-c90aeec875b69a034e64240b3d2892302a4fdc59.tar.gz chromium_src-c90aeec875b69a034e64240b3d2892302a4fdc59.tar.bz2