diff options
author | bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-07-09 03:28:58 +0000 |
---|---|---|
committer | bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-07-09 03:28:58 +0000 |
commit | f4658320a60cf5fb85b3d77b0542dd93144c67fe (patch) | |
tree | 674300bc926366eb1cf44657db747715e2309e96 | |
parent | 6859659e223aeaac973a52c358374bc9bd7f71af (diff) | |
download | chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.zip chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.gz chromium_src-f4658320a60cf5fb85b3d77b0542dd93144c67fe.tar.bz2 |
Add URL-based features for client-side phishing detection.
This change implements the URL-based features that we'll use for client-side
phishing detection. Right now, the features are simply inserted into a map.
Later changes will add content-based features, scoring, and hook up the code to
run after page load is finished.
BUG=none
TEST=PhishingUrlFeatureExtractorTest, PhishingFeaturesTest
Review URL: http://codereview.chromium.org/2843036
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@51922 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | chrome/chrome_renderer.gypi | 4 | ||||
-rw-r--r-- | chrome/chrome_tests.gypi | 2 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/features.cc | 51 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/features.h | 108 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/features_unittest.cc | 27 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc | 127 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_url_feature_extractor.h | 48 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc | 115 |
8 files changed, 482 insertions, 0 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi index 4c94873..1b27e29 100644 --- a/chrome/chrome_renderer.gypi +++ b/chrome/chrome_renderer.gypi @@ -160,6 +160,10 @@ 'renderer/renderer_webstoragearea_impl.h', 'renderer/renderer_webstoragenamespace_impl.cc', 'renderer/renderer_webstoragenamespace_impl.h', + 'renderer/safe_browsing/features.cc', + 'renderer/safe_browsing/features.h', + 'renderer/safe_browsing/phishing_url_feature_extractor.cc', + 'renderer/safe_browsing/phishing_url_feature_extractor.h', 'renderer/spellchecker/spellcheck.cc', 'renderer/spellchecker/spellcheck.h', 'renderer/spellchecker/spellcheck_worditerator.cc', diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi index ce32a15..b21d7c35 100644 --- a/chrome/chrome_tests.gypi +++ b/chrome/chrome_tests.gypi @@ -1074,6 +1074,8 @@ 'renderer/render_widget_unittest.cc', 'renderer/renderer_about_handler_unittest.cc', 'renderer/renderer_main_unittest.cc', + 'renderer/safe_browsing/features_unittest.cc', + 'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc', 'renderer/spellchecker/spellcheck_unittest.cc', 'renderer/spellchecker/spellcheck_worditerator_unittest.cc', 'renderer/translate_helper_unittest.cc', diff --git a/chrome/renderer/safe_browsing/features.cc b/chrome/renderer/safe_browsing/features.cc new file mode 100644 index 0000000..47a093c --- /dev/null +++ b/chrome/renderer/safe_browsing/features.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/renderer/safe_browsing/features.h" + +#include "base/histogram.h" +#include "base/logging.h" + +namespace safe_browsing { + +const size_t FeatureMap::kMaxFeatureMapSize = 10000; + +FeatureMap::FeatureMap() {} +FeatureMap::~FeatureMap() {} + +bool FeatureMap::AddBooleanFeature(const std::string& name) { + if (features_.size() >= kMaxFeatureMapSize) { + // If we hit this case, it indicates that either kMaxFeatureMapSize is + // too small, or there is a bug causing too many features to be added. + // In this case, we'll log to a histogram so we can see that this is + // happening, and make phishing classification fail silently. + LOG(ERROR) << "Not adding feature: " << name << " because the " + << "feature map is too large."; + UMA_HISTOGRAM_COUNTS("SBClientPhishing.TooManyFeatures", 1); + return false; + } + features_[name] = 1.0; + return true; +} + +void FeatureMap::Clear() { + features_.clear(); +} + +namespace features { +// URL host features +const char kUrlHostIsIpAddress[] = "UrlHostIsIpAddress"; +const char kUrlTldToken[] = "UrlTld="; +const char kUrlDomainToken[] = "UrlDomain="; +const char kUrlOtherHostToken[] = "UrlOtherHostToken="; + +// URL host aggregate features +const char kUrlNumOtherHostTokensGTOne[] = "UrlNumOtherHostTokens>1"; +const char kUrlNumOtherHostTokensGTThree[] = "UrlNumOtherHostTokens>3"; + +// URL path features +const char kUrlPathToken[] = "UrlPathToken="; + +} // namespace features +} // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h new file mode 100644 index 0000000..4354a7e --- /dev/null +++ b/chrome/renderer/safe_browsing/features.h @@ -0,0 +1,108 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Common types and constants for extracting and evaluating features in the +// client-side phishing detection model. A feature is simply a string and an +// associated floating-point value between 0 and 1. The phishing +// classification model contains rules which give an appropriate weight to each +// feature or combination of features. These values can then be summed to +// compute a final phishiness score. +// +// Some features are boolean features. If these features are set, they always +// have a value of 0.0 or 1.0. In practice, the features are only set if the +// value is true (1.0). +// +// We also use token features. These features have a unique name that is +// constructed from the URL or page contents that we are classifying, for +// example, "UrlDomain=chromium". These features are also always set to 1.0 +// if they are present. +// +// The intermediate storage of the features for a URL is a FeatureMap, which is +// just a thin wrapper around a map of feature name to value. The entire set +// of features for a URL is extracted before we do any scoring. + +#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ +#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ + +#include <string> +#include "base/basictypes.h" +#include "base/hash_tables.h" + +namespace safe_browsing { + +// Container for a map of features to values, which enforces behavior +// such as a maximum number of features in the map. +class FeatureMap { + public: + FeatureMap(); + ~FeatureMap(); + + // Adds a boolean feature to a FeatureMap with a value of 1.0. + // Returns true on success, or false if the feature map exceeds + // kMaxFeatureMapSize. + bool AddBooleanFeature(const std::string& name); + + // Provides read-only access to the current set of features. + const base::hash_map<std::string, double>& features() const { + return features_; + } + + // Clears the set of features in the map. + void Clear(); + + // This is an upper bound on the number of features that will be extracted. 
+ // We should never hit this cap; it is intended as a sanity check to prevent + // the FeatureMap from growing too large. + static const size_t kMaxFeatureMapSize; + + private: + base::hash_map<std::string, double> features_; + + DISALLOW_COPY_AND_ASSIGN(FeatureMap); +}; + +namespace features { +// Constants for the various feature names that we use. + +//////////////////////////////////////////////////// +// URL host features +//////////////////////////////////////////////////// + +// Set if the URL's hostname is an IP address. +extern const char kUrlHostIsIpAddress[]; +// Token feature containing the portion of the hostname controlled by a +// registrar, for example "com" or "co.uk". +extern const char kUrlTldToken[]; +// Token feature containing the first host component below the registrar. +// For example, in "www.google.com", the domain would be "google". +extern const char kUrlDomainToken[]; +// Token feature containing each host component below the domain. +// For example, in "www.host.example.com", both "www" and "host" would be +// "other host tokens". +extern const char kUrlOtherHostToken[]; + +//////////////////////////////////////////////////// +// Aggregate features for URL host tokens +//////////////////////////////////////////////////// + +// Set if the number of "other" host tokens for a URL is greater than one. +// Longer hostnames, regardless of the specific tokens, can be a signal that +// the URL is phishy. +extern const char kUrlNumOtherHostTokensGTOne[]; +// Set if the number of "other" host tokens for a URL is greater than three. +extern const char kUrlNumOtherHostTokensGTThree[]; + +//////////////////////////////////////////////////// +// URL path token features +//////////////////////////////////////////////////// + +// Token feature containing each alphanumeric string in the path that is at +// least 3 characters long. For example, "/abc/d/efg" would have 2 path +// token features, "abc" and "efg". Query parameters are not included. 
+extern const char kUrlPathToken[]; + +} // namespace features +} // namespace safe_browsing + +#endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_ diff --git a/chrome/renderer/safe_browsing/features_unittest.cc b/chrome/renderer/safe_browsing/features_unittest.cc new file mode 100644 index 0000000..ad07ba2 --- /dev/null +++ b/chrome/renderer/safe_browsing/features_unittest.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/renderer/safe_browsing/features.h" + +#include "base/format_macros.h" +#include "base/string_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace safe_browsing { + +TEST(PhishingFeaturesTest, TooManyFeatures) { + FeatureMap features; + for (size_t i = 0; i < FeatureMap::kMaxFeatureMapSize; ++i) { + EXPECT_TRUE(features.AddBooleanFeature(StringPrintf("Feature%" PRIuS, i))); + } + EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size()); + + // Attempting to add more features should fail. + for (size_t i = 0; i < 3; ++i) { + EXPECT_FALSE(features.AddBooleanFeature(StringPrintf("Extra%" PRIuS, i))); + } + EXPECT_EQ(FeatureMap::kMaxFeatureMapSize, features.features().size()); +} + +} // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc new file mode 100644 index 0000000..7937cea --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" + +#include <algorithm> +#include <string> +#include <vector> +#include "base/histogram.h" +#include "base/logging.h" +#include "base/perftimer.h" +#include "base/string_util.h" +#include "chrome/renderer/safe_browsing/features.h" +#include "googleurl/src/gurl.h" +#include "net/base/registry_controlled_domain.h" + +namespace safe_browsing { + +PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {} + +PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {} + +bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url, + FeatureMap* features) { + PerfTimer timer; + if (url.HostIsIPAddress()) { + if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress)) { + return false; + } + } else { + std::string host; + TrimString(url.host(), ".", &host); // Remove any leading/trailing dots. + + // TODO(bryner): Ensure that the url encoding is consistent with + // the features in the model. + size_t registry_length = + net::RegistryControlledDomainService::GetRegistryLength( + host, + true /* allow_unknown_registries */); + + if (registry_length == 0 || registry_length == std::string::npos) { + LOG(ERROR) << "Could not find TLD for host: " << host; + return false; + } + DCHECK_LT(registry_length, host.size()) + << "Non-zero registry length, but host is only a TLD: " << host; + size_t tld_start = host.size() - registry_length; + if (!features->AddBooleanFeature(features::kUrlTldToken + + host.substr(tld_start))) { + return false; + } + + // Pull off the TLD and the preceding dot. + host.erase(tld_start - 1); + std::vector<std::string> host_tokens; + SplitStringDontTrim(host, '.', &host_tokens); + // Get rid of any empty components. 
+ std::vector<std::string>::iterator new_end = + std::remove(host_tokens.begin(), host_tokens.end(), ""); + host_tokens.erase(new_end, host_tokens.end()); + if (host_tokens.empty()) { + LOG(ERROR) << "Could not find domain for host: " << host; + return false; + } + if (!features->AddBooleanFeature(features::kUrlDomainToken + + host_tokens.back())) { + return false; + } + host_tokens.pop_back(); + + // Now we're just left with the "other" host tokens. + for (std::vector<std::string>::iterator it = host_tokens.begin(); + it != host_tokens.end(); ++it) { + if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it)) { + return false; + } + } + + if (host_tokens.size() > 1) { + if (!features->AddBooleanFeature( + features::kUrlNumOtherHostTokensGTOne)) { + return false; + } + if (host_tokens.size() > 3) { + if (!features->AddBooleanFeature( + features::kUrlNumOtherHostTokensGTThree)) { + return false; + } + } + } + } + + std::vector<std::string> long_tokens; + SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens); + for (std::vector<std::string>::iterator it = long_tokens.begin(); + it != long_tokens.end(); ++it) { + if (!features->AddBooleanFeature(features::kUrlPathToken + *it)) { + return false; + } + } + + UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed()); + return true; +} + +// static +void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens( + const std::string& full, + std::vector<std::string>* tokens) { + // Split on common non-alphanumerics. + // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly. + static const char kTokenSeparators[] = ".,\\/_-|=%:!&"; + std::vector<std::string> raw_splits; + Tokenize(full, kTokenSeparators, &raw_splits); + + // Copy over only the splits that are 3 or more chars long. + // TODO(bryner): Determine a meaningful min size. 
+ for (std::vector<std::string>::iterator it = raw_splits.begin(); + it != raw_splits.end(); ++it) { + if (it->length() >= kMinPathComponentLength) { + tokens->push_back(*it); + } + } +} + +} // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h new file mode 100644 index 0000000..df45136 --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor.h @@ -0,0 +1,48 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// PhishingUrlFeatureExtractor handles computing URL-based features for +// the client-side phishing detection model. These include tokens in the +// host and path, features pertaining to host length, and IP addresses. + +#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ +#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ + +#include <string> +#include <vector> + +#include "base/basictypes.h" + +class GURL; + +namespace safe_browsing { +class FeatureMap; + +class PhishingUrlFeatureExtractor { + public: + PhishingUrlFeatureExtractor(); + ~PhishingUrlFeatureExtractor(); + + // Extracts features for |url| into the given feature map. + // Returns true on success. + bool ExtractFeatures(const GURL& url, FeatureMap* features); + + private: + friend class PhishingUrlFeatureExtractorTest; + + static const size_t kMinPathComponentLength = 3; + + // Given a string, finds all substrings of consecutive alphanumeric + // characters of length >= kMinPathComponentLength and inserts them into + // tokens. 
+ static void SplitStringIntoLongAlphanumTokens( + const std::string& full, + std::vector<std::string>* tokens); + + DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor); +}; + +} // namespace safe_browsing + +#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_ diff --git a/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc new file mode 100644 index 0000000..c71a3a2 --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h" + +#include <string> +#include <vector> +#include "chrome/renderer/safe_browsing/features.h" +#include "googleurl/src/gurl.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" + +using ::testing::ContainerEq; +using ::testing::ElementsAre; + +namespace safe_browsing { + +class PhishingUrlFeatureExtractorTest : public ::testing::Test { + protected: + PhishingUrlFeatureExtractor extractor_; + + void SplitStringIntoLongAlphanumTokens(const std::string& full, + std::vector<std::string>* tokens) { + PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full, + tokens); + } +}; + +TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) { + std::string url = "http://123.0.0.1/mydocuments/a.file.html"; + FeatureMap expected_features; + expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress); + expected_features.AddBooleanFeature(features::kUrlPathToken + + std::string("mydocuments")); + expected_features.AddBooleanFeature(features::kUrlPathToken + + std::string("file")); + expected_features.AddBooleanFeature(features::kUrlPathToken + + std::string("html")); + + FeatureMap 
features; + ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear"; + expected_features.Clear(); + expected_features.AddBooleanFeature(features::kUrlTldToken + + std::string("co.uk")); + expected_features.AddBooleanFeature(features::kUrlDomainToken + + std::string("cnn")); + expected_features.AddBooleanFeature(features::kUrlOtherHostToken + + std::string("www")); + expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne); + expected_features.AddBooleanFeature(features::kUrlPathToken + + std::string("sports")); + expected_features.AddBooleanFeature(features::kUrlPathToken + + std::string("index")); + expected_features.AddBooleanFeature(features::kUrlPathToken + + std::string("html")); + + features.Clear(); + ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + url = "http://justadomain.com/"; + expected_features.Clear(); + expected_features.AddBooleanFeature(features::kUrlTldToken + + std::string("com")); + expected_features.AddBooleanFeature(features::kUrlDomainToken + + std::string("justadomain")); + + features.Clear(); + ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + url = "http://...www..lotsodots....com./"; + expected_features.Clear(); + expected_features.AddBooleanFeature(features::kUrlTldToken + + std::string("com")); + expected_features.AddBooleanFeature(features::kUrlDomainToken + + std::string("lotsodots")); + expected_features.AddBooleanFeature(features::kUrlOtherHostToken + + std::string("www")); + + features.Clear(); + ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + url = "http://com/123"; 
+ EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); + + url = "http://.co.uk/"; + EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); + + url = "file:///nohost.txt"; + EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); + + url = "not:valid:at:all"; + EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features)); +} + +TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) { + std::string full = "This.is/a_pretty\\unusual-!path,indeed"; + std::vector<std::string> long_tokens; + SplitStringIntoLongAlphanumTokens(full, &long_tokens); + EXPECT_THAT(long_tokens, + ElementsAre("This", "pretty", "unusual", "path", "indeed")); + + long_tokens.clear(); + full = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20"; + SplitStringIntoLongAlphanumTokens(full, &long_tokens); + EXPECT_THAT(long_tokens, ElementsAre()); +} + +} // namespace safe_browsing |