author    noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98>  2010-09-09 23:33:05 +0000
committer noelutz@google.com <noelutz@google.com@0039d316-1c4b-4281-b951-d872f2087c98>  2010-09-09 23:33:05 +0000
commit    51260c7719da871adae2cfe3085adddf8e5f8658 (patch)
tree      d56088c21d5d56461fa47a0beec8532691314bf0 /chrome/renderer/safe_browsing
parent    9d6d6e51c6bd220c5ab6a337424e312c1fdc7ad5 (diff)
Add a scorer class for client-side phishing detection.
The Scorer class loads a machine learning model and lets you compute a phishing
score for a set of features that were extracted from the current page. The
phishing score corresponds to the probability that the page is phishing. The
features are extracted by the various feature extractor classes that live in
the same folder. The current format of the model is a protocol buffer (see:
client_model.proto).

BUG=NONE
TEST=PhishingScorerTest

Review URL: http://codereview.chromium.org/3363004

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@59019 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/safe_browsing')
-rw-r--r--  chrome/renderer/safe_browsing/client_model.proto   71
-rw-r--r--  chrome/renderer/safe_browsing/scorer.cc            97
-rw-r--r--  chrome/renderer/safe_browsing/scorer.h             80
-rw-r--r--  chrome/renderer/safe_browsing/scorer_unittest.cc  138
4 files changed, 386 insertions, 0 deletions
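For orientation before the diff itself, here is a minimal sketch of how the new class is meant to be driven, based solely on the scorer.h API added in this change. The ScorePage wrapper and the feature value are hypothetical, and the feature name is written out unhashed for readability; in the real pipeline the feature extractors hash names with SHA-256, as the model comments below explain.

#include <string>

#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/scorer.h"

// Hypothetical caller: scores one page given a serialized ClientSideModel.
double ScorePage(const std::string& model_bytes) {
  safe_browsing::Scorer scorer(model_bytes);  // StringPiece converts from string.
  if (!scorer.HasValidModel())
    return 0.0;  // Parsing failed; don't flag the page.
  safe_browsing::FeatureMap features;
  features.AddRealFeature("PageSecureLinksFreq", 0.9);  // Placeholder feature.
  return scorer.ComputeScore(features);  // Probability in [0.0, 1.0].
}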
diff --git a/chrome/renderer/safe_browsing/client_model.proto b/chrome/renderer/safe_browsing/client_model.proto
new file mode 100644
index 0000000..0a224dc
--- /dev/null
+++ b/chrome/renderer/safe_browsing/client_model.proto
@@ -0,0 +1,71 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This proto represents a machine learning model which is used to compute
+// the probability that a particular page visited by Chrome is phishing.
+//
+// Note: since the machine learning model is trained on the server side and
+// then downloaded onto the client, it is important that this proto file stays
+// in sync with the server-side copy. Otherwise, the client may not be able to
+// parse the server-generated model anymore. If you want to change this
+// protocol definition, or if you have questions regarding its format, please
+// contact chrome-anti-phishing@googlegroups.com.
+
+syntax = "proto2";
+
+option optimize_for = LITE_RUNTIME;
+
+package safe_browsing;
+
+// This protocol buffer represents a machine learning model that is used in
+// client-side phishing detection (in Chrome). The client extracts a set
+// of features from every website the user visits. Extracted features map
+// feature names to floating point values (e.g., PageSecureLinksFreq -> 0.9).
+//
+// To compute the phishing score (i.e., the probability that the website is
+// phishing) a scorer simply computes the sum of all rule scores for a given
+// set of extracted features. The score of a particular rule is the product
+// of all feature values that are part of the rule times the rule weight. If
+// a feature has no value (i.e., is not part of the extracted features) its
+// value is taken to be zero, which makes the rule score zero as well. The
+// sum of the rule scores is a log odds which can be converted to a
+// probability like this:
+// p = exp(log_odds) / (exp(log_odds) + 1).
+//
+// To make it harder for phishers to reverse engineer our machine learning
+// model, all the features in the model are hashed with SHA-256. The feature
+// extractors also hash the extracted features before scoring happens.
+message ClientSideModel {
+ // In order to save some space we store all the hashed strings in a
+ // single repeated field and then the rules as well as page terms
+ // and page words refer to an index in that repeated field. All
+ // hashes are sha256 hashes stored in binary format.
+ repeated bytes hashes = 1;
+
+ message Rule {
+    // Indexes into the hashes field above. Each index refers to a
+    // hashed feature that is part of this rule.
+ repeated int32 feature = 1;
+
+ // The weight for this particular rule.
+ required float weight = 2;
+ }
+
+  // List of rules which make up the model.
+ repeated Rule rule = 2;
+
+ // List of indexes that point to the hashed page terms that appear in
+ // the model. The hashes are computed over page terms that are encoded
+ // as lowercase UTF-8 strings.
+ repeated int32 page_term = 3;
+
+ // List of indexes that point to the hashed page words. The page words
+ // correspond to all words that appear in page terms. If the term
+ // "one two" is in the list of page terms then "one" and "two" will be
+ // in the list of page words.
+ repeated int32 page_word = 4;
+
+ // Page terms in page_term contain at most this many page words.
+ required int32 max_words_per_term = 5;
+}
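To make the scoring rules above concrete, here is a worked example as standalone C++ (independent of the Chromium tree). The three rules and two feature values are the ones used by the unit test added below, not a real model.

#include <cmath>
#include <cstdio>

int main() {
  // Rules: {} with weight 0.5, {f1} with weight 2.0, {f1, f2} with weight 3.0.
  // Extracted features: f1 = 0.15, f2 = 1.0 (a boolean feature).
  double log_odds = 0.5                // empty rule contributes its weight
                  + 2.0 * 0.15         // {f1}: weight * f1
                  + 3.0 * 0.15 * 1.0;  // {f1, f2}: weight * f1 * f2
  // log_odds == 1.25. Convert the log odds to a probability:
  double p = std::exp(log_odds) / (std::exp(log_odds) + 1.0);
  std::printf("p = %.4f\n", p);  // Prints p = 0.7773, matching the test below.
  return 0;
}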
diff --git a/chrome/renderer/safe_browsing/scorer.cc b/chrome/renderer/safe_browsing/scorer.cc
new file mode 100644
index 0000000..a967f38
--- /dev/null
+++ b/chrome/renderer/safe_browsing/scorer.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/scorer.h"
+
+#include <math.h>
+
+#include "base/logging.h"
+#include "base/string_piece.h"
+#include "chrome/renderer/safe_browsing/client_model.pb.h"
+#include "chrome/renderer/safe_browsing/features.h"
+
+namespace safe_browsing {
+
+// Helper function which converts log odds to a probability in the range
+// [0.0,1.0].
+static double LogOdds2Prob(double log_odds) {
+  // 709 = floor(1023*ln(2)). 1023 is the largest finite binary exponent
+  // of a double. Small log odds aren't a problem, as the odds will be 0;
+  // it's only when exp() overflows to +infinity that odds/(odds+1) is NaN.
+ if (log_odds >= 709) {
+ return 1.0;
+ }
+ double odds = exp(log_odds);
+ return odds/(odds+1.0);
+}
+
+Scorer::Scorer(const base::StringPiece& model_str) {
+ model_.reset(new ClientSideModel());
+ if (!model_->ParseFromArray(model_str.data(), model_str.size()) ||
+ !model_->IsInitialized()) {
+ DLOG(ERROR) << "Unable to parse phishing model. This Scorer object is "
+ << "invalid.";
+ model_.reset(NULL);
+ return;
+ }
+ for (int i = 0; i < model_->page_term_size(); ++i) {
+ page_terms_.insert(model_->hashes(model_->page_term(i)));
+ }
+ for (int i = 0; i < model_->page_word_size(); ++i) {
+ page_words_.insert(model_->hashes(model_->page_word(i)));
+ }
+}
+
+Scorer::~Scorer() {}
+
+bool Scorer::HasValidModel() const {
+ return (model_.get() != NULL);
+}
+
+double Scorer::ComputeScore(const FeatureMap& features) const {
+ DCHECK(HasValidModel());
+ if (!HasValidModel()) {
+ return 0.0; // Better safe than sorry. We don't want to crash the browser.
+ }
+ double logodds = 0.0;
+ for (int i = 0; i < model_->rule_size(); ++i) {
+ logodds += ComputeRuleScore(model_->rule(i), features);
+ }
+ return LogOdds2Prob(logodds);
+}
+
+const base::hash_set<std::string>& Scorer::page_terms() const {
+ DCHECK(HasValidModel());
+ return page_terms_;
+}
+
+const base::hash_set<std::string>& Scorer::page_words() const {
+ DCHECK(HasValidModel());
+ return page_words_;
+}
+
+size_t Scorer::max_words_per_term() const {
+ DCHECK(HasValidModel());
+ return (HasValidModel() ? model_->max_words_per_term() : 0);
+}
+
+double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
+ const FeatureMap& features) const {
+ DCHECK(HasValidModel());
+ const base::hash_map<std::string, double>& feature_map = features.features();
+ double rule_score = 1.0;
+ for (int i = 0; i < rule.feature_size(); ++i) {
+ base::hash_map<std::string, double>::const_iterator it = feature_map.find(
+ model_->hashes(rule.feature(i)));
+ if (it == feature_map.end() || it->second == 0.0) {
+ // If the feature of the rule does not exist in the given feature map the
+ // feature weight is considered to be zero. If the feature weight is zero
+ // we leave early since we know that the rule score will be zero.
+ return 0.0;
+ }
+ rule_score *= it->second;
+ }
+ return rule_score * rule.weight();
+}
+} // namespace safe_browsing
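A note on the 709 cutoff in LogOdds2Prob: exp() overflows a double for arguments above roughly 709.78, returning +infinity, and inf/(inf+1) evaluates to NaN rather than 1. This standalone snippet demonstrates the failure mode the early return avoids:

#include <cmath>
#include <cstdio>

int main() {
  double odds = std::exp(710.0);  // exp(710) > DBL_MAX, so this is +inf.
  std::printf("%f\n", odds / (odds + 1.0));  // Prints nan: inf/inf.
  odds = std::exp(709.0);  // ~8.2e307, still a finite double.
  std::printf("%f\n", odds / (odds + 1.0));  // Prints 1.000000.
  return 0;
}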
diff --git a/chrome/renderer/safe_browsing/scorer.h b/chrome/renderer/safe_browsing/scorer.h
new file mode 100644
index 0000000..42a2ec8
--- /dev/null
+++ b/chrome/renderer/safe_browsing/scorer.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// This class loads a client-side model and lets you compute a phishing score
+// for a set of previously extracted features. The phishing score corresponds
+// to the probability that the features are indicative of a phishing site.
+//
+// For more details on how the score is computed for a given model and a
+// given set of features, read the comments in the client_model.proto file.
+//
+// See features.h for a list of features that are currently used.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
+#define CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
+#pragma once
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/hash_tables.h"
+#include "base/scoped_ptr.h"
+#include "base/string_piece.h"
+#include "chrome/renderer/safe_browsing/client_model.pb.h"
+
+namespace safe_browsing {
+class FeatureMap;
+
+// Scorer methods are virtual to simplify mocking of this class.
+class Scorer {
+ public:
+ // The constructor parses the given model. If parsing fails for some reason
+ // HasValidModel() will return false.
+ explicit Scorer(const base::StringPiece& model_str);
+ virtual ~Scorer();
+
+ // Returns true iff the model was successfully loaded and is valid.
+ virtual bool HasValidModel() const;
+
+ // This method computes the probability that the given features are indicative
+ // of phishing. It returns a score value that falls in the range [0.0,1.0]
+ // (range is inclusive on both ends).
+ // PRE: HasValidModel() is true;
+ virtual double ComputeScore(const FeatureMap& features) const;
+
+ // -- Accessors used by the page feature extractor ---------------------------
+
+ // Returns a set of hashed page terms that appear in the model in binary
+ // format. PRE: HasValidModel() is true.
+ const base::hash_set<std::string>& page_terms() const;
+
+ // Returns a set of hashed page words that appear in the model in binary
+ // format. PRE: HasValidModel() is true.
+ const base::hash_set<std::string>& page_words() const;
+
+  // Returns the maximum number of words per term for the loaded model.
+ // PRE: HasValidModel() is true.
+ size_t max_words_per_term() const;
+
+ private:
+ // Computes the score for a given rule and feature map. The score is computed
+ // by multiplying the rule weight with the product of feature weights for the
+ // given rule. The feature weights are stored in the feature map. If a
+ // particular feature does not exist in the feature map we set its weight to
+ // zero.
+ double ComputeRuleScore(const ClientSideModel::Rule& rule,
+ const FeatureMap& features) const;
+
+ // This will be NULL if we are unable to load the model (i.e., if
+  // HasValidModel() is false).
+ scoped_ptr<ClientSideModel> model_;
+
+ base::hash_set<std::string> page_terms_;
+ base::hash_set<std::string> page_words_;
+
+ DISALLOW_COPY_AND_ASSIGN(Scorer);
+};
+}  // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
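Because HasValidModel() and ComputeScore() are virtual (see the class comment above), code that consumes a Scorer can be unit-tested against a mock instead of a real model. A minimal gmock sketch; the MockScorer name and the empty-model constructor trick are illustrative, not part of this change:

#include "chrome/renderer/safe_browsing/scorer.h"
#include "testing/gmock/include/gmock/gmock.h"

namespace safe_browsing {

class MockScorer : public Scorer {
 public:
  // An empty model leaves HasValidModel() false, which is fine for a mock.
  MockScorer() : Scorer(base::StringPiece()) {}
  MOCK_CONST_METHOD0(HasValidModel, bool());
  MOCK_CONST_METHOD1(ComputeScore, double(const FeatureMap&));
};

}  // namespace safe_browsing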
diff --git a/chrome/renderer/safe_browsing/scorer_unittest.cc b/chrome/renderer/safe_browsing/scorer_unittest.cc
new file mode 100644
index 0000000..e54cac5
--- /dev/null
+++ b/chrome/renderer/safe_browsing/scorer_unittest.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/scorer.h"
+
+#include "base/format_macros.h"
+#include "base/hash_tables.h"
+#include "base/scoped_ptr.h"
+#include "chrome/renderer/safe_browsing/client_model.pb.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace safe_browsing {
+
+class PhishingScorerTest : public ::testing::Test {
+ protected:
+ virtual void SetUp() {
+    // Set up a simple model. Note that the scorer does not care about
+ // how features are encoded so we use readable strings here to make
+ // the test simpler to follow.
+ model_.Clear();
+ model_.add_hashes("feature1");
+ model_.add_hashes("feature2");
+ model_.add_hashes("feature3");
+ model_.add_hashes("token one");
+ model_.add_hashes("token two");
+ model_.add_hashes("token");
+ model_.add_hashes("one");
+ model_.add_hashes("two");
+
+ ClientSideModel::Rule* rule;
+ rule = model_.add_rule();
+ rule->set_weight(0.5);
+
+ rule = model_.add_rule();
+ rule->add_feature(0); // feature1
+ rule->set_weight(2.0);
+
+ rule = model_.add_rule();
+ rule->add_feature(0); // feature1
+ rule->add_feature(1); // feature2
+ rule->set_weight(3.0);
+
+ model_.add_page_term(3); // token one
+ model_.add_page_term(4); // token two
+
+ model_.add_page_word(5); // token
+ model_.add_page_word(6); // one
+ model_.add_page_word(7); // two
+
+ model_.set_max_words_per_term(2);
+ }
+
+ ClientSideModel model_;
+};
+
+TEST_F(PhishingScorerTest, HasValidModel) {
+ scoped_ptr<Scorer> scorer;
+ scorer.reset(new Scorer(model_.SerializeAsString()));
+ EXPECT_TRUE(scorer->HasValidModel());
+
+ FeatureMap dummy_features;
+ base::hash_set<std::string> empty_map;
+ // Invalid model string.
+ scorer.reset(new Scorer("bogus string"));
+ EXPECT_FALSE(scorer->HasValidModel());
+ // By using this notation we can both test the debug and the opt
+ // behavior. E.g., if we run the test in opt-mode we'll check that
+ // the score is 0.0 and if we run it in dbg-mode we'll test that the
+ // call to ComputeScore fails.
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_DOUBLE_EQ(0.0, scorer->ComputeScore(dummy_features)); }, "");
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_THAT(scorer->page_terms(), testing::Eq(empty_map)); }, "");
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_THAT(scorer->page_words(), testing::Eq(empty_map)); }, "");
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_EQ(static_cast<size_t>(0), scorer->max_words_per_term()); }, "");
+
+  // Model is missing a required field.
+ model_.clear_max_words_per_term();
+ scorer.reset(new Scorer(model_.SerializePartialAsString()));
+ EXPECT_FALSE(scorer->HasValidModel());
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_DOUBLE_EQ(0.0, scorer->ComputeScore(dummy_features)); }, "");
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_THAT(scorer->page_terms(), testing::Eq(empty_map)); }, "");
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_THAT(scorer->page_words(), testing::Eq(empty_map)); }, "");
+ EXPECT_DEBUG_DEATH(
+ { EXPECT_EQ(static_cast<size_t>(0), scorer->max_words_per_term()); }, "");
+}
+
+TEST_F(PhishingScorerTest, PageTerms) {
+ Scorer scorer(model_.SerializeAsString());
+ base::hash_set<std::string> expected_page_terms;
+ expected_page_terms.insert("token one");
+ expected_page_terms.insert("token two");
+ EXPECT_THAT(scorer.page_terms(), ::testing::ContainerEq(expected_page_terms));
+}
+
+TEST_F(PhishingScorerTest, PageWords) {
+ Scorer scorer(model_.SerializeAsString());
+ base::hash_set<std::string> expected_page_words;
+ expected_page_words.insert("token");
+ expected_page_words.insert("one");
+ expected_page_words.insert("two");
+ EXPECT_THAT(scorer.page_words(), ::testing::ContainerEq(expected_page_words));
+}
+
+TEST_F(PhishingScorerTest, ComputeScore) {
+ Scorer scorer(model_.SerializeAsString());
+
+ // An empty feature map should match the empty rule.
+ FeatureMap features;
+ // The expected logodds is 0.5 (empty rule) => p = exp(0.5) / (exp(0.5) + 1)
+ // => 0.62245933120185459
+ EXPECT_DOUBLE_EQ(0.62245933120185459, scorer.ComputeScore(features));
+ // Same if the feature does not match any rule.
+ EXPECT_TRUE(features.AddBooleanFeature("not existing feature"));
+ EXPECT_DOUBLE_EQ(0.62245933120185459, scorer.ComputeScore(features));
+
+ // Feature 1 matches which means that the logodds will be:
+ // 0.5 (empty rule) + 2.0 (rule weight) * 0.15 (feature weight) = 0.8
+ // => p = 0.6899744811276125
+ EXPECT_TRUE(features.AddRealFeature("feature1", 0.15));
+ EXPECT_DOUBLE_EQ(0.6899744811276125, scorer.ComputeScore(features));
+
+  // Now, both feature 1 and feature 2 match. Expected logodds:
+  // 0.5 (empty rule) + 2.0 (rule weight) * 0.15 (feature weight) +
+  // 3.0 (rule weight) * 0.15 (feature1 weight) * 1.0 (feature2 weight) = 1.25
+  // => p = 0.77729986117469119
+ EXPECT_TRUE(features.AddBooleanFeature("feature2"));
+ EXPECT_DOUBLE_EQ(0.77729986117469119, scorer.ComputeScore(features));
+}
+} // namespace safe_browsing