Add URL-based features for client-side phishing detection.

This change implements the URL-based features that we'll use for client-side phishing detection. Right now, the features are simply inserted into a map. Later changes will add content-based features, scoring, and hook up the code to run after page load is finished. BUG=none TEST=PhishingUrlFeatureExtractorTest, PhishingFeaturesTest Review URL: http://codereview.chromium.org/2843036 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@51922 0039d316-1c4b-4281-b951-d872f2087c98
author: bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-07-09 03:28:58 +0000
committer: bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-07-09 03:28:58 +0000
commit: f4658320a60cf5fb85b3d77b0542dd93144c67fe (patch)
tree: 674300bc926366eb1cf44657db747715e2309e96
parent: 6859659e223aeaac973a52c358374bc9bd7f71af (diff)
download: chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.zip
chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.gz
chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.bz2
8 files changed, 482 insertions, 0 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index 4c94873..1b27e29 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -160,6 +160,10 @@
         'renderer/renderer_webstoragearea_impl.h',
         'renderer/renderer_webstoragenamespace_impl.cc',
         'renderer/renderer_webstoragenamespace_impl.h',
+        'renderer/safe_browsing/features.cc',
+        'renderer/safe_browsing/features.h',
+        'renderer/safe_browsing/phishing_url_feature_extractor.cc',
+        'renderer/safe_browsing/phishing_url_feature_extractor.h',
         'renderer/spellchecker/spellcheck.cc',
         'renderer/spellchecker/spellcheck.h',
         'renderer/spellchecker/spellcheck_worditerator.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index ce32a15..b21d7c35 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -1074,6 +1074,8 @@
         'renderer/render_widget_unittest.cc',
         'renderer/renderer_about_handler_unittest.cc',
         'renderer/renderer_main_unittest.cc',
+        'renderer/safe_browsing/features_unittest.cc',
+        'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
         'renderer/spellchecker/spellcheck_unittest.cc',
         'renderer/spellchecker/spellcheck_worditerator_unittest.cc',
         'renderer/translate_helper_unittest.cc',
diff --git a/chrome/renderer/safe_browsing/features.cc b/chrome/renderer/safe_browsing/features.cc
new file mode 100644
index 0000000..47a093c
--- /dev/null
+++ b/chrome/renderer/safe_browsing/features.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/features.h"
+
+#include "base/histogram.h"
+#include "base/logging.h"
+
+namespace safe_browsing {
+
+const size_t FeatureMap::kMaxFeatureMapSize = 10000;
+
+FeatureMap::FeatureMap() {}
+FeatureMap::~FeatureMap() {}
+
+bool FeatureMap::AddBooleanFeature(const std::string& name) {
+  // Overwriting an existing feature never grows the map, so only new names are
+  // rejected at the cap.  Hitting it means kMaxFeatureMapSize is too small or
+  // a bug adds too many features: log a histogram and fail silently.
+  if (features_.size() >= kMaxFeatureMapSize &&
+      features_.find(name) == features_.end()) {
+    LOG(ERROR) << "Not adding feature: " << name << " because the "
+               << "feature map is too large.";
+    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1);
+    return false;
+  }
+  features_[name] = 1.0;
+  return true;
+}
+
+void FeatureMap::Clear() {
+  features_.clear();
+}
+
+namespace features {
+// URL host features
+const char kUrlHostIsIpAddress[] = "UrlHostIsIpAddress";
+const char kUrlTldToken[] = "UrlTld=";
+const char kUrlDomainToken[] = "UrlDomain=";
+const char kUrlOtherHostToken[] = "UrlOtherHostToken=";
+
+// URL host aggregate features
+const char kUrlNumOtherHostTokensGTOne[] = "UrlNumOtherHostTokens>1";
+const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3";
+
+// URL path features
+const char kUrlPathToken[] = "UrlPathToken=";
+
+}  // namespace features
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h
new file mode 100644
index 0000000..4354a7e
--- /dev/null
+++ b/chrome/renderer/safe_browsing/features.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Common types and constants for extracting and evaluating features in the
+// client-side phishing detection model.  A feature is simply a string and an
+// associated floating-point value between 0 and 1.  The phishing
+// classification model contains rules which give an appropriate weight to each
+// feature or combination of features.  These values can then be summed to
+// compute a final phishiness score.
+//
+// Some features are boolean features.  If these features are set, they always
+// have a value of 0.0 or 1.0.  In practice, the features are only set if the
+// value is true (1.0).
+//
+// We also use token features.  These features have a unique name that is
+// constructed from the URL or page contents that we are classifying, for
+// example, "UrlDomain=chromium".  These features are also always set to 1.0
+// if they are present.
+//
+// The intermediate storage of the features for a URL is a FeatureMap, which is
+// just a thin wrapper around a map of feature name to value.  The entire set
+// of features for a URL is extracted before we do any scoring.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
+#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
+
+#include <string>
+#include "base/basictypes.h"
+#include "base/hash_tables.h"
+
+namespace safe_browsing {
+
+// Container mapping feature names to values, which enforces behavior
+// such as a maximum number of features in the map.
+class FeatureMap {
+ public:
+  FeatureMap();
+  ~FeatureMap();
+
+  // Adds the boolean feature |name| to the map with a value of 1.0.
+  // Returns true on success, or false, without adding the feature, when the
+  // add is refused because the map has reached kMaxFeatureMapSize entries.
+  bool AddBooleanFeature(const std::string& name);
+
+  // Provides read-only access to the current set of features.
+  const base::hash_map<std::string, double>& features() const {
+    return features_;
+  }
+
+  // Removes every feature from the map, leaving it empty.
+  void Clear();
+
+  // This is an upper bound on the number of features that will be stored.
+  // We should never hit this cap; it is intended as a sanity check to prevent
+  // the FeatureMap from growing too large.
+  static const size_t kMaxFeatureMapSize;
+
+ private:
+  base::hash_map<std::string, double> features_;  // Feature name -> value.
+
+  DISALLOW_COPY_AND_ASSIGN(FeatureMap);
+};
+
+namespace features {
+// Constants for the various feature names that we use.
+
+////////////////////////////////////////////////////
+// URL host features
+////////////////////////////////////////////////////
+
+// Set if the URL's hostname is an IP address.
+extern const char kUrlHostIsIpAddress[];
+// Token feature containing the portion of the hostname controlled by a
+// registrar, for example "com" or "co.uk".
+extern const char kUrlTldToken[];
+// Token feature containing the first host component below the registrar.
+// For example, in "www.google.com", the domain would be "google".
+extern const char kUrlDomainToken[];
+// Token feature containing each host component below the domain.
+// For example, in "www.host.example.com", both "www" and "host" would be
+// "other host tokens".
+extern const char kUrlOtherHostToken[];
+
+////////////////////////////////////////////////////
+// Aggregate features for URL host tokens
+////////////////////////////////////////////////////
+
+// Set if the number of "other" host tokens for a URL is greater than one.
+// Longer hostnames, regardless of the specific tokens, can be a signal that
+// the URL is phishy.
+extern const char kUrlNumOtherHostTokensGTOne[];
+// Set if the number of "other" host tokens for a URL is greater than three.
+extern const char kUrlNumOtherHostTokensGTThree[];
+
+////////////////////////////////////////////////////
+// URL path token features
+////////////////////////////////////////////////////
+
+// Token feature containing each alphanumeric string in the path that is at
+// least 3 characters long.  For example, "/abc/d/efg" would have 2 path
+// token features, "abc" and "efg".  Query parameters are not included.
+extern const char kUrlPathToken[];
+
+}  // namespace features
+}  // namespace safe_browsing
+
+#endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
diff --git a/chrome/renderer/safe_browsing/features_unittest.cc b/chrome/renderer/safe_browsing/features_unittest.cc
new file mode 100644
index 0000000..ad07ba2
--- /dev/null
+++ b/chrome/renderer/safe_browsing/features_unittest.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/features.h"
+
+#include "base/format_macros.h"
+#include "base/string_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace safe_browsing {
+
+TEST(PhishingFeaturesTest, TooManyFeatures) {
+  FeatureMap features;  // Filled with distinct names up to the cap below.
+  for (size_t i = 0; i < FeatureMap::kMaxFeatureMapSize; ++i) {
+    EXPECT_TRUE(features.AddBooleanFeature(StringPrintf("Feature%" PRIuS, i)));
+  }
+  EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
+
+  // The map is full, so adding further (new) feature names should fail.
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_FALSE(features.AddBooleanFeature(StringPrintf("Extra%" PRIuS, i)));
+  }
+  EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
+}
+
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc
new file mode 100644
index 0000000..7937cea
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "base/histogram.h"
+#include "base/logging.h"
+#include "base/perftimer.h"
+#include "base/string_util.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "googleurl/src/gurl.h"
+#include "net/base/registry_controlled_domain.h"
+
+namespace safe_browsing {
+
+PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
+
+PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
+
+bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
+                                                  FeatureMap* features) {
+  PerfTimer timer;  // Measures extraction time for the histogram below.
+  if (url.HostIsIPAddress()) {
+    if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) {
+      return false;
+    }
+  } else {
+    std::string host;
+    TrimString(url.host(), ".", &host);  // Remove any leading/trailing dots.
+
+    // TODO(bryner): Ensure that the url encoding is consistent with
+    // the features in the model.
+    size_t registry_length =
+        net::RegistryControlledDomainService::GetRegistryLength(
+            host,
+            true /* allow_unknown_registries */);
+
+    if (registry_length == 0 || registry_length == std::string::npos) {
+      LOG(ERROR) << "Could not find TLD for host: " << host;
+      return false;
+    }
+    DCHECK_LT(registry_length, host.size())
+        << "Non-zero registry length, but host is only a TLD: " << host;
+    size_t tld_start = host.size() - registry_length;
+    if (!features->AddBooleanFeature(features::kUrlTldToken +
+                                     host.substr(tld_start))) {
+      return false;
+    }
+
+    // Pull off the TLD and the preceding dot.
+    host.erase(tld_start - 1);
+    std::vector<std::string> host_tokens;
+    SplitStringDontTrim(host, '.', &host_tokens);
+    // Get rid of any empty components (e.g. from consecutive dots).
+    std::vector<std::string>::iterator new_end =
+        std::remove(host_tokens.begin(), host_tokens.end(), "");
+    host_tokens.erase(new_end, host_tokens.end());
+    if (host_tokens.empty()) {
+      LOG(ERROR) << "Could not find domain for host: " << host;
+      return false;
+    }
+    if (!features->AddBooleanFeature(features::kUrlDomainToken +
+                                     host_tokens.back())) {
+      return false;
+    }
+    host_tokens.pop_back();
+
+    // The domain was popped above, so only the "other" host tokens remain.
+    for (std::vector<std::string>::iterator it = host_tokens.begin();
+         it != host_tokens.end(); ++it) {
+      if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) {
+        return false;
+      }
+    }
+
+    if (host_tokens.size() > 1) {  // Aggregate "other" token count features.
+      if (!features->AddBooleanFeature(
+              features::kUrlNumOtherHostTokensGTOne)) {
+        return false;
+      }
+      if (host_tokens.size() > 3) {
+        if (!features->AddBooleanFeature(
+                features::kUrlNumOtherHostTokensGTThree)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  std::vector<std::string> long_tokens;  // Tokens from the URL path only.
+  SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
+  for (std::vector<std::string>::iterator it = long_tokens.begin();
+       it != long_tokens.end(); ++it) {
+    if (!features->AddBooleanFeature(features::kUrlPathToken + *it)) {
+      return false;
+    }
+  }
+
+  UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
+  return true;
+}
+
+// static
+void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
+    const std::string& full,
+    std::vector<std::string>* tokens) {
+  // Break |full| apart at the usual non-alphanumeric separators.
+  // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
+  static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
+  std::vector<std::string> pieces;
+  Tokenize(full, kTokenSeparators, &pieces);
+
+  // Append the pieces of at least kMinPathComponentLength characters to
+  // |tokens|, preserving their order.
+  // TODO(bryner): Determine a meaningful min size.
+  for (size_t i = 0; i < pieces.size(); ++i) {
+    if (pieces[i].length() < kMinPathComponentLength)
+      continue;
+    tokens->push_back(pieces[i]);
+  }
+}
+
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h
new file mode 100644
index 0000000..df45136
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// PhishingUrlFeatureExtractor handles computing URL-based features for
+// the client-side phishing detection model.  These include tokens in the
+// host and path, features pertaining to host length, and IP addresses.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
+#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "base/basictypes.h"
+
+class GURL;
+
+namespace safe_browsing {
+class FeatureMap;
+
+class PhishingUrlFeatureExtractor {
+ public:
+  PhishingUrlFeatureExtractor();
+  ~PhishingUrlFeatureExtractor();
+
+  // Extracts features for |url| into the given feature map.  Returns true on
+  // success; on failure |features| may hold a partial set of features.
+  bool ExtractFeatures(const GURL& url, FeatureMap* features);
+
+ private:
+  friend class PhishingUrlFeatureExtractorTest;  // For unit tests.
+
+  static const size_t kMinPathComponentLength = 3;  // In characters.
+
+  // Splits |full| on a fixed set of common separator characters and appends
+  // each piece of length >= kMinPathComponentLength to |tokens|.  Characters
+  // outside that set are not split on, so pieces may be non-alphanumeric.
+  static void SplitStringIntoLongAlphanumTokens(
+      const std::string& full,
+      std::vector<std::string>* tokens);
+
+  DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor);
+};
+
+}  // namespace safe_browsing
+
+#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
new file mode 100644
index 0000000..c71a3a2
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
+
+#include <string>
+#include <vector>
+#include "chrome/renderer/safe_browsing/features.h"
+#include "googleurl/src/gurl.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using ::testing::ContainerEq;
+using ::testing::ElementsAre;
+
+namespace safe_browsing {
+
+class PhishingUrlFeatureExtractorTest : public ::testing::Test {
+ protected:
+  PhishingUrlFeatureExtractor extractor_;
+
+  void SplitStringIntoLongAlphanumTokens(const std::string& full,
+                                         std::vector<std::string>* tokens) {
+    PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full,
+                                                                   tokens);
+  }
+};
+
+TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
+  std::string url = "http://123.0.0.1/mydocuments/a.file.html";
+  FeatureMap expected_features;
+  expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress);
+  expected_features.AddBooleanFeature(features::kUrlPathToken +
+                                      std::string("mydocuments"));
+  expected_features.AddBooleanFeature(features::kUrlPathToken +
+                                      std::string("file"));
+  expected_features.AddBooleanFeature(features::kUrlPathToken +
+                                      std::string("html"));
+
+  FeatureMap features;
+  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kUrlTldToken +
+                                      std::string("co.uk"));
+  expected_features.AddBooleanFeature(features::kUrlDomainToken +
+                                      std::string("cnn"));
+  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
+                                      std::string("www"));
+  expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne);
+  expected_features.AddBooleanFeature(features::kUrlPathToken +
+                                      std::string("sports"));
+  expected_features.AddBooleanFeature(features::kUrlPathToken +
+                                      std::string("index"));
+  expected_features.AddBooleanFeature(features::kUrlPathToken +
+                                      std::string("html"));
+
+  features.Clear();
+  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  url = "http://justadomain.com/";
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kUrlTldToken +
+                                      std::string("com"));
+  expected_features.AddBooleanFeature(features::kUrlDomainToken +
+                                      std::string("justadomain"));
+
+  features.Clear();
+  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  url = "http://...www..lotsodots....com./";
+  expected_features.Clear();
+  expected_features.AddBooleanFeature(features::kUrlTldToken +
+                                      std::string("com"));
+  expected_features.AddBooleanFeature(features::kUrlDomainToken +
+                                      std::string("lotsodots"));
+  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
+                                      std::string("www"));
+
+  features.Clear();
+  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+  EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+  url = "http://com/123";
+  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+
+  url = "http://.co.uk/";
+  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+
+  url = "file:///nohost.txt";
+  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+
+  url = "not:valid:at:all";
+  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+}
+
+TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) {
+  std::string full = "This.is/a_pretty\\unusual-!path,indeed";
+  std::vector<std::string> long_tokens;
+  SplitStringIntoLongAlphanumTokens(full, &long_tokens);
+  EXPECT_THAT(long_tokens,
+              ElementsAre("This", "pretty", "unusual", "path", "indeed"));
+
+  long_tokens.clear();
+  full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
+  SplitStringIntoLongAlphanumTokens(full, &long_tokens);
+  EXPECT_THAT(long_tokens, ElementsAre());
+}
+
+}  // namespace safe_browsing
author	bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-07-09 03:28:58 +0000
committer	bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-07-09 03:28:58 +0000
commit	f4658320a60cf5fb85b3d77b0542dd93144c67fe (patch)
tree	674300bc926366eb1cf44657db747715e2309e96
parent	6859659e223aeaac973a52c358374bc9bd7f71af (diff)
download	chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.zip chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.gz chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.bz2