summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-07-09 03:28:58 +0000
committer bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-07-09 03:28:58 +0000
commit f4658320a60cf5fb85b3d77b0542dd93144c67fe (patch)
tree 674300bc926366eb1cf44657db747715e2309e96
parent 6859659e223aeaac973a52c358374bc9bd7f71af (diff)
download chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.zip
chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.gz
chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.bz2
Add URL-based features for client-side phishing detection.
This change implements the URL-based features that we'll use for client-side phishing detection. Right now, the features are simply inserted into a map. Later changes will add content-based features, scoring, and hook up the code to run after page load is finished. BUG=none TEST=PhishingUrlFeatureExtractorTest, PhishingFeaturesTest Review URL: http://codereview.chromium.org/2843036 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@51922 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- chrome/chrome_renderer.gypi 4
-rw-r--r-- chrome/chrome_tests.gypi 2
-rw-r--r-- chrome/renderer/safe_browsing/features.cc 51
-rw-r--r-- chrome/renderer/safe_browsing/features.h 108
-rw-r--r-- chrome/renderer/safe_browsing/features_unittest.cc 27
-rw-r--r-- chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc 127
-rw-r--r-- chrome/renderer/safe_browsing/phishing_url_feature_extractor.h 48
-rw-r--r-- chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc 115
8 files changed, 482 insertions, 0 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index 4c94873..1b27e29 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -160,6 +160,10 @@
'renderer/renderer_webstoragearea_impl.h',
'renderer/renderer_webstoragenamespace_impl.cc',
'renderer/renderer_webstoragenamespace_impl.h',
+ 'renderer/safe_browsing/features.cc',
+ 'renderer/safe_browsing/features.h',
+ 'renderer/safe_browsing/phishing_url_feature_extractor.cc',
+ 'renderer/safe_browsing/phishing_url_feature_extractor.h',
'renderer/spellchecker/spellcheck.cc',
'renderer/spellchecker/spellcheck.h',
'renderer/spellchecker/spellcheck_worditerator.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index ce32a15..b21d7c35 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -1074,6 +1074,8 @@
'renderer/render_widget_unittest.cc',
'renderer/renderer_about_handler_unittest.cc',
'renderer/renderer_main_unittest.cc',
+ 'renderer/safe_browsing/features_unittest.cc',
+ 'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
'renderer/spellchecker/spellcheck_unittest.cc',
'renderer/spellchecker/spellcheck_worditerator_unittest.cc',
'renderer/translate_helper_unittest.cc',
diff --git a/chrome/renderer/safe_browsing/features.cc b/chrome/renderer/safe_browsing/features.cc
new file mode 100644
index 0000000..47a093c
--- /dev/null
+++ b/chrome/renderer/safe_browsing/features.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/features.h"
+
+#include "base/histogram.h"
+#include "base/logging.h"
+
+namespace safe_browsing {
+
+const size_t FeatureMap::kMaxFeatureMapSize = 10000;
+
+FeatureMap::FeatureMap() {}
+FeatureMap::~FeatureMap() {}
+
+bool FeatureMap::AddBooleanFeature(const std::string& name) {
+ if (features_.size() >= kMaxFeatureMapSize) {
+ // If we hit this case, it indicates that either kMaxFeatureMapSize is
+ // too small, or there is a bug causing too many features to be added.
+ // In this case, we'll log to a histogram so we can see that this is
+ // happening, and make phishing classification fail silently.
+ LOG(ERROR) << "Not adding feature: " << name << " because the "
+ << "feature map is too large.";
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1);
+ return false;
+ }
+ features_[name] = 1.0;
+ return true;
+}
+
+void FeatureMap::Clear() {
+ features_.clear();
+}
+
+namespace features {
+// URL host features
+const char kUrlHostIsIpAddress[] = "UrlHostIsIpAddress";
+const char kUrlTldToken[] = "UrlTld=";
+const char kUrlDomainToken[] = "UrlDomain=";
+const char kUrlOtherHostToken[] = "UrlOtherHostToken=";
+
+// URL host aggregate features
+const char kUrlNumOtherHostTokensGTOne[] = "UrlNumOtherHostTokens>1";
+const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3";
+
+// URL path features
+const char kUrlPathToken[] = "UrlPathToken=";
+
+} // namespace features
+} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h
new file mode 100644
index 0000000..4354a7e
--- /dev/null
+++ b/chrome/renderer/safe_browsing/features.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Common types and constants for extracting and evaluating features in the
+// client-side phishing detection model. A feature is simply a string and an
+// associated floating-point value between 0 and 1. The phishing
+// classification model contains rules which give an appropriate weight to each
+// feature or combination of features. These values can then be summed to
+// compute a final phishiness score.
+//
+// Some features are boolean features. If these features are set, they always
+// have a value of 0.0 or 1.0. In practice, the features are only set if the
+// value is true (1.0).
+//
+// We also use token features. These features have a unique name that is
+// constructed from the URL or page contents that we are classifying, for
+// example, "UrlDomain=chromium". These features are also always set to 1.0
+// if they are present.
+//
+// The intermediate storage of the features for a URL is a FeatureMap, which is
+// just a thin wrapper around a map of feature name to value. The entire set
+// of features for a URL is extracted before we do any scoring.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
+#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
+
+#include <string>
+#include "base/basictypes.h"
+#include "base/hash_tables.h"
+
+namespace safe_browsing {
+
+// Container for a map of features to values, which enforces behavior
+// such as a maximum number of features in the map.
+class FeatureMap {
+ public:
+ FeatureMap();
+ ~FeatureMap();
+
+ // Adds a boolean feature to a FeatureMap with a value of 1.0.
+ // Returns true on success, or false if the map already holds
+ // kMaxFeatureMapSize features (in which case nothing is added).
+ bool AddBooleanFeature(const std::string& name);
+
+ // Provides read-only access to the current set of features.
+ const base::hash_map<std::string, double>& features() const {
+ return features_;
+ }
+
+ // Clears the set of features in the map.
+ void Clear();
+
+ // This is an upper bound on the number of features that will be extracted.
+ // We should never hit this cap; it is intended as a sanity check to prevent
+ // the FeatureMap from growing too large.
+ static const size_t kMaxFeatureMapSize;
+
+ private:
+ base::hash_map<std::string, double> features_;
+
+ DISALLOW_COPY_AND_ASSIGN(FeatureMap);
+};
+
+namespace features {
+// Constants for the various feature names that we use.
+
+////////////////////////////////////////////////////
+// URL host features
+////////////////////////////////////////////////////
+
+// Set if the URL's hostname is an IP address.
+extern const char kUrlHostIsIpAddress[];
+// Token feature containing the portion of the hostname controlled by a
+// registrar, for example "com" or "co.uk".
+extern const char kUrlTldToken[];
+// Token feature containing the first host component below the registrar.
+// For example, in "www.google.com", the domain would be "google".
+extern const char kUrlDomainToken[];
+// Token feature containing each host component below the domain.
+// For example, in "www.host.example.com", both "www" and "host" would be
+// "other host tokens".
+extern const char kUrlOtherHostToken[];
+
+////////////////////////////////////////////////////
+// Aggregate features for URL host tokens
+////////////////////////////////////////////////////
+
+// Set if the number of "other" host tokens for a URL is greater than one.
+// Longer hostnames, regardless of the specific tokens, can be a signal that
+// the URL is phishy.
+extern const char kUrlNumOtherHostTokensGTOne[];
+// Set if the number of "other" host tokens for a URL is greater than three.
+extern const char kUrlNumOtherHostTokensGTThree[];
+
+////////////////////////////////////////////////////
+// URL path token features
+////////////////////////////////////////////////////
+
+// Token feature containing each alphanumeric string in the path that is at
+// least 3 characters long. For example, "/abc/d/efg" would have 2 path
+// token features, "abc" and "efg". Query parameters are not included.
+extern const char kUrlPathToken[];
+
+} // namespace features
+} // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
diff --git a/chrome/renderer/safe_browsing/features_unittest.cc b/chrome/renderer/safe_browsing/features_unittest.cc
new file mode 100644
index 0000000..ad07ba2
--- /dev/null
+++ b/chrome/renderer/safe_browsing/features_unittest.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/features.h"
+
+#include "base/format_macros.h"
+#include "base/string_util.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace safe_browsing {
+
+TEST(PhishingFeaturesTest, TooManyFeatures) {
+ FeatureMap features;
+ for (size_t i = 0; i < FeatureMap::kMaxFeatureMapSize; ++i) {
+ EXPECT_TRUE(features.AddBooleanFeature(StringPrintf("Feature%" PRIuS, i)));
+ }
+ EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
+
+ // Attempting to add more features should fail.
+ for (size_t i = 0; i < 3; ++i) {
+ EXPECT_FALSE(features.AddBooleanFeature(StringPrintf("Extra%" PRIuS, i)));
+ }
+ EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size());
+}
+
+} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc
new file mode 100644
index 0000000..7937cea
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "base/histogram.h"
+#include "base/logging.h"
+#include "base/perftimer.h"
+#include "base/string_util.h"
+#include "chrome/renderer/safe_browsing/features.h"
+#include "googleurl/src/gurl.h"
+#include "net/base/registry_controlled_domain.h"
+
+namespace safe_browsing {
+
+PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
+
+PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
+
+bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
+ FeatureMap* features) {
+ PerfTimer timer;
+ if (url.HostIsIPAddress()) {
+ if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) {
+ return false;
+ }
+ } else {
+ std::string host;
+ TrimString(url.host(), ".", &host); // Remove any leading/trailing dots.
+
+ // TODO(bryner): Ensure that the url encoding is consistent with
+ // the features in the model.
+ size_t registry_length =
+ net::RegistryControlledDomainService::GetRegistryLength(
+ host,
+ true /* allow_unknown_registries */);
+
+ if (registry_length == 0 || registry_length == std::string::npos) {
+ LOG(ERROR) << "Could not find TLD for host: " << host;
+ return false;
+ }
+ DCHECK_LT(registry_length, host.size())
+ << "Non-zero registry length, but host is only a TLD: " << host;
+ size_t tld_start = host.size() - registry_length;
+ if (!features->AddBooleanFeature(features::kUrlTldToken +
+ host.substr(tld_start))) {
+ return false;
+ }
+
+ // Pull off the TLD and the preceding dot.
+ host.erase(tld_start - 1);
+ std::vector<std::string> host_tokens;
+ SplitStringDontTrim(host, '.', &host_tokens);
+ // Get rid of any empty components.
+ std::vector<std::string>::iterator new_end =
+ std::remove(host_tokens.begin(), host_tokens.end(), "");
+ host_tokens.erase(new_end, host_tokens.end());
+ if (host_tokens.empty()) {
+ LOG(ERROR) << "Could not find domain for host: " << host;
+ return false;
+ }
+ if (!features->AddBooleanFeature(features::kUrlDomainToken +
+ host_tokens.back())) {
+ return false;
+ }
+ host_tokens.pop_back();
+
+ // Now we're just left with the "other" host tokens.
+ for (std::vector<std::string>::iterator it = host_tokens.begin();
+ it != host_tokens.end(); ++it) {
+ if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) {
+ return false;
+ }
+ }
+
+ if (host_tokens.size() > 1) {
+ if (!features->AddBooleanFeature(
+ features::kUrlNumOtherHostTokensGTOne)) {
+ return false;
+ }
+ if (host_tokens.size() > 3) {
+ if (!features->AddBooleanFeature(
+ features::kUrlNumOtherHostTokensGTThree)) {
+ return false;
+ }
+ }
+ }
+ }
+
+ std::vector<std::string> long_tokens;
+ SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
+ for (std::vector<std::string>::iterator it = long_tokens.begin();
+ it != long_tokens.end(); ++it) {
+ if (!features->AddBooleanFeature(features::kUrlPathToken + *it)) {
+ return false;
+ }
+ }
+
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
+ return true;
+}
+
+// static
+void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
+ const std::string& full,
+ std::vector<std::string>* tokens) {
+ // Split on common non-alphanumerics.
+ // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
+ static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
+ std::vector<std::string> raw_splits;
+ Tokenize(full, kTokenSeparators, &raw_splits);
+
+ // Copy over only the splits that are 3 or more chars long.
+ // TODO(bryner): Determine a meaningful min size.
+ for (std::vector<std::string>::iterator it = raw_splits.begin();
+ it != raw_splits.end(); ++it) {
+ if (it->length() >= kMinPathComponentLength) {
+ tokens->push_back(*it);
+ }
+ }
+}
+
+} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h
new file mode 100644
index 0000000..df45136
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// PhishingUrlFeatureExtractor handles computing URL-based features for
+// the client-side phishing detection model. These include tokens in the
+// host and path, features pertaining to host length, and IP addresses.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
+#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "base/basictypes.h"
+
+class GURL;
+
+namespace safe_browsing {
+class FeatureMap;
+
+class PhishingUrlFeatureExtractor {
+ public:
+ PhishingUrlFeatureExtractor();
+ ~PhishingUrlFeatureExtractor();
+
+ // Extracts features for |url| into the given feature map.
+ // Returns true on success.
+ bool ExtractFeatures(const GURL& url, FeatureMap* features);
+
+ private:
+ friend class PhishingUrlFeatureExtractorTest;
+
+ static const size_t kMinPathComponentLength = 3;
+
+ // Given a string, splits it on common non-alphanumeric separator
+ // characters and appends every resulting piece whose length is
+ // >= kMinPathComponentLength to tokens.
+ static void SplitStringIntoLongAlphanumTokens(
+ const std::string& full,
+ std::vector<std::string>* tokens);
+
+ DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor);
+};
+
+} // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
new file mode 100644
index 0000000..c71a3a2
--- /dev/null
+++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
+
+#include <string>
+#include <vector>
+#include "chrome/renderer/safe_browsing/features.h"
+#include "googleurl/src/gurl.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using ::testing::ContainerEq;
+using ::testing::ElementsAre;
+
+namespace safe_browsing {
+
+class PhishingUrlFeatureExtractorTest : public ::testing::Test {
+ protected:
+ PhishingUrlFeatureExtractor extractor_;
+
+ void SplitStringIntoLongAlphanumTokens(const std::string& full,
+ std::vector<std::string>* tokens) {
+ PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full,
+ tokens);
+ }
+};
+
+TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
+ std::string url = "http://123.0.0.1/mydocuments/a.file.html";
+ FeatureMap expected_features;
+ expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress);
+ expected_features.AddBooleanFeature(features::kUrlPathToken +
+ std::string("mydocuments"));
+ expected_features.AddBooleanFeature(features::kUrlPathToken +
+ std::string("file"));
+ expected_features.AddBooleanFeature(features::kUrlPathToken +
+ std::string("html"));
+
+ FeatureMap features;
+ ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(features::kUrlTldToken +
+ std::string("co.uk"));
+ expected_features.AddBooleanFeature(features::kUrlDomainToken +
+ std::string("cnn"));
+ expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
+ std::string("www"));
+ expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne);
+ expected_features.AddBooleanFeature(features::kUrlPathToken +
+ std::string("sports"));
+ expected_features.AddBooleanFeature(features::kUrlPathToken +
+ std::string("index"));
+ expected_features.AddBooleanFeature(features::kUrlPathToken +
+ std::string("html"));
+
+ features.Clear();
+ ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ url = "http://justadomain.com/";
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(features::kUrlTldToken +
+ std::string("com"));
+ expected_features.AddBooleanFeature(features::kUrlDomainToken +
+ std::string("justadomain"));
+
+ features.Clear();
+ ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ url = "http://...www..lotsodots....com./";
+ expected_features.Clear();
+ expected_features.AddBooleanFeature(features::kUrlTldToken +
+ std::string("com"));
+ expected_features.AddBooleanFeature(features::kUrlDomainToken +
+ std::string("lotsodots"));
+ expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
+ std::string("www"));
+
+ features.Clear();
+ ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
+ EXPECT_THAT(features.features(), ContainerEq(expected_features.features()));
+
+ url = "http://com/123";
+ EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+
+ url = "http://.co.uk/";
+ EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+
+ url = "file:///nohost.txt";
+ EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+
+ url = "not:valid:at:all";
+ EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
+}
+
+TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) {
+ std::string full = "This.is/a_pretty\\unusual-!path,indeed";
+ std::vector<std::string> long_tokens;
+ SplitStringIntoLongAlphanumTokens(full, &long_tokens);
+ EXPECT_THAT(long_tokens,
+ ElementsAre("This", "pretty", "unusual", "path", "indeed"));
+
+ long_tokens.clear();
+ full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
+ SplitStringIntoLongAlphanumTokens(full, &long_tokens);
+ EXPECT_THAT(long_tokens, ElementsAre());
+}
+
+} // namespace safe_browsing