Change the client-side phishing detection request format. Instead of

just sending the URL with the computed score we also send back all the extracted features. BUG=NONE TEST=ClientSideDetectionHostTest,ClientSideDetectionServiceTest,PhishingClassifierTest,PhishingClassifierDelegateTest Review URL: http://codereview.chromium.org/6810020 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@81657 0039d316-1c4b-4281-b951-d872f2087c98
author: noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-14 22:14:13 +0000
committer: noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2011-04-14 22:14:13 +0000
commit: 1331397732bfd1a2ed9d63d92bbc2a61185bd666 (patch)
tree: 8f1f3367488e6749911a4db78f8ee5e67be91b6e /chrome/renderer/safe_browsing
parent: 28b791a577a84f316d78b37d3bea97a119215a0f (diff)
download: chromium_src-1331397732bfd1a2ed9d63d92bbc2a61185bd666.zip
chromium_src-1331397732bfd1a2ed9d63d92bbc2a61185bd666.tar.gz
chromium_src-1331397732bfd1a2ed9d63d92bbc2a61185bd666.tar.bz2
8 files changed, 128 insertions, 69 deletions
diff --git a/chrome/renderer/safe_browsing/malware_dom_details.cc b/chrome/renderer/safe_browsing/malware_dom_details.cc
index 6a03295..80be708 100644
--- a/chrome/renderer/safe_browsing/malware_dom_details.cc
+++ b/chrome/renderer/safe_browsing/malware_dom_details.cc
@@ -6,7 +6,7 @@
 
 #include "base/compiler_specific.h"
 #include "chrome/common/chrome_constants.h"
-#include "chrome/common/safebrowsing_messages.h"
+#include "chrome/common/safe_browsing/safebrowsing_messages.h"
 #include "content/renderer/render_view.h"
 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
diff --git a/chrome/renderer/safe_browsing/malware_dom_details_browsertest.cc b/chrome/renderer/safe_browsing/malware_dom_details_browsertest.cc
index dc7d6ef..01e93f3 100644
--- a/chrome/renderer/safe_browsing/malware_dom_details_browsertest.cc
+++ b/chrome/renderer/safe_browsing/malware_dom_details_browsertest.cc
@@ -3,7 +3,7 @@
 // found in the LICENSE file.
 
 #include "base/stringprintf.h"
-#include "chrome/common/safebrowsing_messages.h"
+#include "chrome/common/safe_browsing/safebrowsing_messages.h"
 #include "chrome/renderer/safe_browsing/malware_dom_details.h"
 #include "chrome/test/render_view_test.h"
 #include "net/base/escape.h"
diff --git a/chrome/renderer/safe_browsing/phishing_classifier.cc b/chrome/renderer/safe_browsing/phishing_classifier.cc
index ad34053..997d81a 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier.cc
@@ -10,7 +10,7 @@
 #include "base/compiler_specific.h"
 #include "base/logging.h"
 #include "base/string_util.h"
-#include "crypto/sha2.h"
+#include "chrome/common/safe_browsing/csd.pb.h"
 #include "chrome/common/url_constants.h"
 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/features.h"
@@ -19,6 +19,7 @@
 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
 #include "chrome/renderer/safe_browsing/scorer.h"
 #include "content/renderer/render_view.h"
+#include "crypto/sha2.h"
 #include "googleurl/src/gurl.h"
 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDataSource.h"
 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
@@ -28,8 +29,8 @@
 
 namespace safe_browsing {
 
-const double PhishingClassifier::kInvalidScore = -1.0;
-const double PhishingClassifier::kPhishyThreshold = 0.5;
+const float PhishingClassifier::kInvalidScore = -1.0;
+const float PhishingClassifier::kPhishyThreshold = 0.5;
 
 PhishingClassifier::PhishingClassifier(RenderView* render_view,
                                        FeatureExtractorClock* clock)
@@ -155,6 +156,8 @@ void PhishingClassifier::TermExtractionFinished(bool success) {
     // Hash all of the features so that they match the model, then compute
     // the score.
     FeatureMap hashed_features;
+    ClientPhishingRequest verdict;
+    verdict.set_url(render_view_->webview()->mainFrame()->url().spec());
     for (base::hash_map<std::string, double>::const_iterator it =
              features_->features().begin();
          it != features_->features().end(); ++it) {
@@ -162,10 +165,14 @@ void PhishingClassifier::TermExtractionFinished(bool success) {
       bool result = hashed_features.AddRealFeature(
           crypto::SHA256HashString(it->first), it->second);
       DCHECK(result);
+      ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
+      feature->set_name(it->first);
+      feature->set_value(it->second);
     }
-
-    double score = scorer_->ComputeScore(hashed_features);
-    RunCallback(score >= kPhishyThreshold, score);
+    float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
+    verdict.set_client_score(score);
+    verdict.set_is_phishing(score >= kPhishyThreshold);
+    RunCallback(verdict);
   } else {
     RunFailureCallback();
   }
@@ -180,13 +187,19 @@ void PhishingClassifier::CheckNoPendingClassification() {
   }
 }
 
-void PhishingClassifier::RunCallback(bool phishy, double phishy_score) {
-  done_callback_->Run(phishy, phishy_score);
+void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
+  done_callback_->Run(verdict);
   Clear();
 }
 
 void PhishingClassifier::RunFailureCallback() {
-  RunCallback(false /* not phishy */, kInvalidScore);
+  ClientPhishingRequest verdict;
+  // In this case we're not guaranteed to have a valid URL.  Just set it
+  // to the empty string to make sure we have a valid protocol buffer.
+  verdict.set_url("");
+  verdict.set_client_score(kInvalidScore);
+  verdict.set_is_phishing(false);
+  RunCallback(verdict);
 }
 
 void PhishingClassifier::Clear() {
diff --git a/chrome/renderer/safe_browsing/phishing_classifier.h b/chrome/renderer/safe_browsing/phishing_classifier.h
index 32168de..8aeda98 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier.h
+++ b/chrome/renderer/safe_browsing/phishing_classifier.h
@@ -27,6 +27,7 @@
 class RenderView;
 
 namespace safe_browsing {
+class ClientPhishingRequest;
 class FeatureExtractorClock;
 class FeatureMap;
 class PhishingDOMFeatureExtractor;
@@ -36,15 +37,16 @@ class Scorer;
 
 class PhishingClassifier {
  public:
-  // Callback to be run when phishing classification finishes.  If the first
-  // argument is true, the page is considered phishy by the client-side model,
-  // and the browser should ping back to get a final verdict.  The second
-  // argument gives the phishyness score which is used in the pingback,
-  // or kInvalidScore if classification failed.
-  typedef Callback2<bool /* phishy */, double /* phishy_score */>::Type
-    DoneCallback;
+  // Callback to be run when phishing classification finishes. The verdict
+  // is a ClientPhishingRequest which contains the verdict computed by the
+  // classifier as well as the extracted features.  If the verdict.is_phishing()
+  // is true, the page is considered phishy by the client-side model,
+  // and the browser should ping back to get a final verdict.  The
+  // verdict.client_score() is set to kInvalidScore if classification failed.
+  typedef Callback1<const ClientPhishingRequest& /* verdict */>::Type
+      DoneCallback;
 
-  static const double kInvalidScore;
+  static const float kInvalidScore;
 
   // Creates a new PhishingClassifier object that will operate on
   // |render_view|.  |clock| is used to time feature extractor operations, and
@@ -87,7 +89,7 @@ class PhishingClassifier {
 
  private:
   // Any score equal to or above this value is considered phishy.
-  static const double kPhishyThreshold;
+  static const float kPhishyThreshold;
 
   // Begins the feature extraction process, by extracting URL features and
   // beginning DOM feature extraction.
@@ -110,7 +112,7 @@ class PhishingClassifier {
   void CheckNoPendingClassification();
 
   // Helper method to run the DoneCallback and clear the state.
-  void RunCallback(bool phishy, double phishy_score);
+  void RunCallback(const ClientPhishingRequest& verdict);
 
   // Helper to run the DoneCallback when feature extraction has failed.
   // This always signals a non-phishy verdict for the page, with kInvalidScore.
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
index b3eb58e..9c6c60e 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
@@ -14,18 +14,30 @@
 #include "base/memory/scoped_ptr.h"
 #include "base/string16.h"
 #include "base/utf_string_conversions.h"
-#include "crypto/sha2.h"
+#include "chrome/common/safe_browsing/csd.pb.h"
 #include "chrome/renderer/safe_browsing/client_model.pb.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/render_view_fake_resources_test.h"
 #include "chrome/renderer/safe_browsing/scorer.h"
+#include "crypto/sha2.h"
 #include "testing/gmock/include/gmock/gmock.h"
 
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::Not;
+using ::testing::Pair;
+
 namespace safe_browsing {
 
 class PhishingClassifierTest : public RenderViewFakeResourcesTest {
  protected:
+  PhishingClassifierTest()
+      : url_tld_token_net_(features::kUrlTldToken + std::string("net")),
+        page_link_domain_phishing_(features::kPageLinkDomain +
+                                   std::string("phishing.com")),
+        page_term_login_(features::kPageTerm + std::string("login")) {}
+
   virtual void SetUp() {
     // Set up WebKit and the RenderView.
     RenderViewFakeResourcesTest::SetUp();
@@ -33,6 +45,11 @@ class PhishingClassifierTest : public RenderViewFakeResourcesTest {
     // Construct a model to test with.  We include one feature from each of
     // the feature extractors, which allows us to verify that they all ran.
     ClientSideModel model;
+
+    model.add_hashes(crypto::SHA256HashString(url_tld_token_net_));
+    model.add_hashes(crypto::SHA256HashString(page_link_domain_phishing_));
+    model.add_hashes(crypto::SHA256HashString(page_term_login_));
+    model.add_hashes(crypto::SHA256HashString("login"));
     model.add_hashes(crypto::SHA256HashString(features::kUrlTldToken +
                                               std::string("net")));
     model.add_hashes(crypto::SHA256HashString(features::kPageLinkDomain +
@@ -69,25 +86,31 @@ class PhishingClassifierTest : public RenderViewFakeResourcesTest {
   }
 
   // Helper method to start phishing classification and wait for it to
-  // complete.  Returns the success value from the PhishingClassifier's
-  // DoneCallback, and fills in phishy_score with the score.
-  bool RunPhishingClassifier(const string16* page_text, double* phishy_score) {
-    success_ = false;
+  // complete.  Returns the true if the page is classified as phishy and
+  // false otherwise.
+  bool RunPhishingClassifier(const string16* page_text,
+                             float* phishy_score,
+                             FeatureMap* features) {
+    verdict_.Clear();
     *phishy_score = PhishingClassifier::kInvalidScore;
+    features->Clear();
 
     classifier_->BeginClassification(
         page_text,
         NewCallback(this, &PhishingClassifierTest::ClassificationFinished));
     message_loop_.Run();
 
-    *phishy_score = phishy_score_;
-    return success_;
+    *phishy_score = verdict_.client_score();
+    for (int i = 0; i < verdict_.feature_map_size(); ++i) {
+      features->AddRealFeature(verdict_.feature_map(i).name(),
+                               verdict_.feature_map(i).value());
+    }
+    return verdict_.is_phishing();
   }
 
   // Completion callback for classification.
-  void ClassificationFinished(bool success, double phishy_score) {
-    success_ = success;
-    phishy_score_ = phishy_score;
+  void ClassificationFinished(const ClientPhishingRequest& verdict) {
+    verdict_ = verdict;  // copy the verdict.
     message_loop_.Quit();
   }
 
@@ -95,10 +118,14 @@ class PhishingClassifierTest : public RenderViewFakeResourcesTest {
   scoped_ptr<PhishingClassifier> classifier_;
   MockFeatureExtractorClock* clock_;  // owned by classifier_
 
-  // These members hold the status from the most recent call to the
+  // Features that are in the model.
+  const std::string url_tld_token_net_;
+  const std::string page_link_domain_phishing_;
+  const std::string page_term_login_;
+
+  // This member holds the status from the most recent call to the
   // ClassificationFinished callback.
-  bool success_;
-  double phishy_score_;
+  ClientPhishingRequest verdict_;
 };
 
 TEST_F(PhishingClassifierTest, TestClassification) {
@@ -118,34 +145,49 @@ TEST_F(PhishingClassifierTest, TestClassification) {
   LoadURL("http://host.net/");
 
   string16 page_text = ASCIIToUTF16("login");
-  double phishy_score;
-  EXPECT_TRUE(RunPhishingClassifier(&page_text, &phishy_score));
-  EXPECT_DOUBLE_EQ(0.5, phishy_score);
+  float phishy_score;
+  FeatureMap features;
+  EXPECT_TRUE(RunPhishingClassifier(&page_text, &phishy_score, &features));
+  // Note: features.features() might contain other features that simply aren't
+  // in the model.
+  EXPECT_THAT(features.features(),
+              AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
+                    Contains(Pair(page_link_domain_phishing_, 1.0)),
+                    Contains(Pair(page_term_login_, 1.0))));
+  EXPECT_FLOAT_EQ(0.5, phishy_score);
 
   // Change the link domain to something non-phishy.
   responses_["http://host.net/"] =
       "<html><body><a href=\"http://safe.com/\">login</a></body></html>";
   LoadURL("http://host.net/");
 
-  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score));
+  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
+  EXPECT_THAT(features.features(),
+              AllOf(Contains(Pair(url_tld_token_net_, 1.0)),
+                    Contains(Pair(page_term_login_, 1.0))));
+  EXPECT_THAT(features.features(),
+              Not(Contains(Pair(page_link_domain_phishing_, 1.0))));
   EXPECT_GE(phishy_score, 0.0);
   EXPECT_LT(phishy_score, 0.5);
 
   // Extraction should fail for this case, since there is no TLD.
   responses_["http://localhost/"] = "<html><body>content</body></html>";
   LoadURL("http://localhost/");
-  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score));
+  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
+  EXPECT_EQ(0U, features.features().size());
   EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
 
   // Extraction should also fail for this case, because the URL is not http.
   responses_["https://host.net/"] = "<html><body>secure</body></html>";
   LoadURL("https://host.net/");
-  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score));
+  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
+  EXPECT_EQ(0U, features.features().size());
   EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
 
   // Extraction should fail for this case because the URL is a POST request.
   LoadURLWithPost("http://host.net/");
-  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score));
+  EXPECT_FALSE(RunPhishingClassifier(&page_text, &phishy_score, &features));
+  EXPECT_EQ(0U, features.features().size());
   EXPECT_EQ(PhishingClassifier::kInvalidScore, phishy_score);
 }
 
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc b/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc
index 358507a..efd5d8a 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_delegate.cc
@@ -11,7 +11,8 @@
 #include "base/logging.h"
 #include "base/metrics/histogram.h"
 #include "base/memory/scoped_callback_factory.h"
-#include "chrome/common/safebrowsing_messages.h"
+#include "chrome/common/safe_browsing/csd.pb.h"
+#include "chrome/common/safe_browsing/safebrowsing_messages.h"
 #include "chrome/renderer/render_thread.h"
 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
@@ -177,19 +178,18 @@ bool PhishingClassifierDelegate::OnMessageReceived(
   return handled;
 }
 
-void PhishingClassifierDelegate::ClassificationDone(bool is_phishy,
-                                                    double phishy_score) {
+void PhishingClassifierDelegate::ClassificationDone(
+    const ClientPhishingRequest& verdict) {
   // We no longer need the page text.
   classifier_page_text_.clear();
-  VLOG(2) << "Phishy verdict = " << is_phishy << " score = " << phishy_score;
-  if (!is_phishy) {
+  VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
+          << " score = " << verdict.client_score();
+  if (!verdict.is_phishing()) {
     return;
   }
-
+  DCHECK(last_url_sent_to_classifier_.spec() == verdict.url());
   Send(new SafeBrowsingHostMsg_DetectedPhishingSite(
-      routing_id(),
-      last_url_sent_to_classifier_,
-      phishy_score));
+      routing_id(), verdict.SerializeAsString()));
 }
 
 GURL PhishingClassifierDelegate::GetToplevelUrl() {
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_delegate.h b/chrome/renderer/safe_browsing/phishing_classifier_delegate.h
index a07cc8b..e6eb0f2 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_delegate.h
+++ b/chrome/renderer/safe_browsing/phishing_classifier_delegate.h
@@ -16,6 +16,7 @@
 #include "ipc/ipc_platform_file.h"
 
 namespace safe_browsing {
+class ClientPhishingRequest;
 class PhishingClassifier;
 class Scorer;
 
@@ -72,7 +73,7 @@ class PhishingClassifierDelegate : public RenderViewObserver {
   void OnStartPhishingDetection(const GURL& url);
 
   // Called when classification for the current page finishes.
-  void ClassificationDone(bool is_phishy, double phishy_score);
+  void ClassificationDone(const ClientPhishingRequest& verdict);
 
   // Returns the RenderView's toplevel URL.
   GURL GetToplevelUrl();
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc b/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc
index 90578a4..f422309 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_delegate_browsertest.cc
@@ -9,7 +9,8 @@
 
 #include "base/memory/scoped_ptr.h"
 #include "base/utf_string_conversions.h"
-#include "chrome/common/safebrowsing_messages.h"
+#include "chrome/common/safe_browsing/csd.pb.h"
+#include "chrome/common/safe_browsing/safebrowsing_messages.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
 #include "chrome/renderer/safe_browsing/render_view_fake_resources_test.h"
@@ -69,23 +70,22 @@ class PhishingClassifierDelegateTest : public RenderViewFakeResourcesTest {
     return handled;
   }
 
-  void OnDetectedPhishingSite(GURL phishing_url, double phishing_score) {
-    detected_phishing_site_ = true;
-    detected_url_ = phishing_url;
-    detected_score_ = phishing_score;
+  void OnDetectedPhishingSite(const std::string& verdict_str) {
+    scoped_ptr<ClientPhishingRequest> verdict(new ClientPhishingRequest);
+    if (verdict->ParseFromString(verdict_str) &&
+        verdict->IsInitialized()) {
+      verdict_.swap(verdict);
+    }
     message_loop_.Quit();
   }
 
   // Runs the ClassificationDone callback, then waits for the
   // DetectedPhishingSite IPC to arrive.
   void RunClassificationDone(PhishingClassifierDelegate* delegate,
-                             bool is_phishy, double phishy_score) {
+                             const ClientPhishingRequest& verdict) {
     // Clear out any previous state.
-    detected_phishing_site_ = false;
-    detected_url_ = GURL();
-    detected_score_ = -1.0;
-
-    delegate->ClassificationDone(is_phishy, phishy_score);
+    verdict_.reset();
+    delegate->ClassificationDone(verdict);
     message_loop_.Run();
   }
 
@@ -94,9 +94,7 @@ class PhishingClassifierDelegateTest : public RenderViewFakeResourcesTest {
     delegate->OnStartPhishingDetection(url);
   }
 
-  bool detected_phishing_site_;
-  GURL detected_url_;
-  double detected_score_;
+  scoped_ptr<ClientPhishingRequest> verdict_;
 };
 
 TEST_F(PhishingClassifierDelegateTest, Navigation) {
@@ -340,10 +338,13 @@ TEST_F(PhishingClassifierDelegateTest, DetectedPhishingSite) {
   Mock::VerifyAndClearExpectations(classifier);
 
   // Now run the callback to simulate the classifier finishing.
-  RunClassificationDone(delegate, true, 0.8);
-  EXPECT_TRUE(detected_phishing_site_);
-  EXPECT_EQ(GURL("http://host.com/#a"), detected_url_);
-  EXPECT_EQ(0.8, detected_score_);
+  ClientPhishingRequest verdict;
+  verdict.set_url("http://host.com/#a");
+  verdict.set_client_score(0.8f);
+  verdict.set_is_phishing(true);
+  RunClassificationDone(delegate, verdict);
+  ASSERT_TRUE(verdict_.get());
+  EXPECT_EQ(verdict.SerializeAsString(), verdict_->SerializeAsString());
 
   // The delegate will cancel pending classification on destruction.
   EXPECT_CALL(*classifier, CancelPendingClassification());
author	noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2011-04-14 22:14:13 +0000
committer	noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2011-04-14 22:14:13 +0000
commit	1331397732bfd1a2ed9d63d92bbc2a61185bd666 (patch)
tree	8f1f3367488e6749911a4db78f8ee5e67be91b6e /chrome/renderer/safe_browsing
parent	28b791a577a84f316d78b37d3bea97a119215a0f (diff)
download	chromium_src-1331397732bfd1a2ed9d63d92bbc2a61185bd666.zip chromium_src-1331397732bfd1a2ed9d63d92bbc2a61185bd666.tar.gz chromium_src-1331397732bfd1a2ed9d63d92bbc2a61185bd666.tar.bz2