author     bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2011-09-13 04:21:47 +0000
committer  bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2011-09-13 04:21:47 +0000
commit     3571205b49a19bf6a2420cb3164f9e386f0758f9 (patch)
tree       08c7d6c403c7607781ed46863e2b3268adab4c1a
parent     ffd7d7365317669d36edaaec4c49b1c0842a1dfc (diff)
Switch to the new client-side phishing model that uses Murmurhash for word hashes.
This gives about a 50% performance improvement on extraction of page term features.
BUG=96060
TEST=updated existing tests
Review URL: http://codereview.chromium.org/7866011
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@100858 0039d316-1c4b-4281-b951-d872f2087c98
16 files changed, 121 insertions, 53 deletions
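The core of the change: individual page words are now matched by seeded 32-bit MurmurHash3 values instead of SHA-256 digests, and SHA-256 is only computed for the full n-gram terms that survive that word-level check. The sketch below illustrates this fast path outside Chromium's class structure. It is not part of the patch: HashWord and MightBePartOfTerm are illustrative names, std::unordered_set stands in for base::hash_set, and the only external API assumed is smhasher's MurmurHash3_x86_32, which the patch itself calls.

// Sketch of the word-level fast path introduced by this patch: each lowercase
// UTF-8 word is hashed with seeded 32-bit MurmurHash3 and looked up in the
// model's page_word set before any SHA-256 work is done for full terms.
#include <cstdint>
#include <string>
#include <unordered_set>

#include "third_party/smhasher/src/MurmurHash3.h"  // MurmurHash3_x86_32()

// Mirrors MurmurHash3String() from murmurhash3_util.cc.
uint32_t HashWord(const std::string& lowercase_word, uint32_t seed) {
  uint32_t output = 0;
  MurmurHash3_x86_32(lowercase_word.data(),
                     static_cast<int>(lowercase_word.size()), seed, &output);
  return output;
}

// Quick membership check against the model's page_word values (fixed32 in
// client_model.proto).  std::unordered_set stands in for base::hash_set here.
bool MightBePartOfTerm(const std::string& lowercase_word,
                       const std::unordered_set<uint32_t>& page_word_hashes,
                       uint32_t murmurhash3_seed) {
  return page_word_hashes.count(HashWord(lowercase_word, murmurhash3_seed)) > 0;
}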
diff --git a/chrome/browser/safe_browsing/client_side_detection_service.cc b/chrome/browser/safe_browsing/client_side_detection_service.cc
index 1a7b8f7..9bc0be2 100644
--- a/chrome/browser/safe_browsing/client_side_detection_service.cc
+++ b/chrome/browser/safe_browsing/client_side_detection_service.cc
@@ -50,11 +50,8 @@ const base::TimeDelta ClientSideDetectionService::kPositiveCacheInterval =
 const char ClientSideDetectionService::kClientReportPhishingUrl[] =
     "https://sb-ssl.google.com/safebrowsing/clientreport/phishing";
 
-// Note: when updatng the model version, don't forget to change the filename
-// in chrome/common/chrome_constants.cc as well, or else existing users won't
-// download the new model.
 const char ClientSideDetectionService::kClientModelUrl[] =
-    "https://ssl.gstatic.com/safebrowsing/csd/client_model_v3.pb";
+    "https://ssl.gstatic.com/safebrowsing/csd/client_model_v4.pb";
 
 struct ClientSideDetectionService::ClientReportInfo {
   scoped_ptr<ClientReportPhishingRequestCallback> callback;
@@ -600,11 +597,6 @@ bool ClientSideDetectionService::ModelHasValidHashIds(
       return false;
     }
   }
-  for (int i = 0; i < model.page_word_size(); ++i) {
-    if (model.page_word(i) < 0 || model.page_word(i) > max_index) {
-      return false;
-    }
-  }
   return true;
 }
 
diff --git a/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc b/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
index 09a3716..4dcbf79 100644
--- a/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
+++ b/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
@@ -565,7 +565,6 @@ TEST_F(ClientSideDetectionServiceTest, ModelHasValidHashIds) {
   model.add_hashes("bla");
   EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
   model.add_page_term(0);
-  model.add_page_word(0);
   EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
 
   model.add_page_term(-1);
@@ -575,13 +574,6 @@ TEST_F(ClientSideDetectionServiceTest, ModelHasValidHashIds) {
   model.set_page_term(1, 0);
   EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
 
-  model.add_page_word(-2);
-  EXPECT_FALSE(ClientSideDetectionService::ModelHasValidHashIds(model));
-  model.set_page_word(1, 2);
-  EXPECT_FALSE(ClientSideDetectionService::ModelHasValidHashIds(model));
-  model.set_page_word(1, 0);
-  EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
-
   // Test bad rules.
   model.add_hashes("blu");
   ClientSideModel::Rule* rule = model.add_rule();
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index 8f63b70..63195b1 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -24,6 +24,7 @@
         '../third_party/icu/icu.gyp:icui18n',
         '../third_party/icu/icu.gyp:icuuc',
         '../third_party/npapi/npapi.gyp:npapi',
+        '../third_party/smhasher/smhasher.gyp:murmurhash3',
         '../third_party/WebKit/Source/WebKit/chromium/WebKit.gyp:webkit',
         '../ui/gfx/surface/surface.gyp:surface',
         '../webkit/support/webkit_support.gyp:glue',
@@ -140,6 +141,8 @@
         'renderer/safe_browsing/features.h',
         'renderer/safe_browsing/malware_dom_details.cc',
         'renderer/safe_browsing/malware_dom_details.h',
+        'renderer/safe_browsing/murmurhash3_util.cc',
+        'renderer/safe_browsing/murmurhash3_util.h',
         'renderer/safe_browsing/phishing_classifier.cc',
         'renderer/safe_browsing/phishing_classifier.h',
         'renderer/safe_browsing/phishing_classifier_delegate.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index 386bcfd..6026795 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -1806,6 +1806,7 @@
         'renderer/plugin_uma_unittest.cc',
         'renderer/renderer_about_handler_unittest.cc',
         'renderer/safe_browsing/features_unittest.cc',
+        'renderer/safe_browsing/murmurhash3_util_unittest.cc',
         'renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc',
         'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
         'renderer/safe_browsing/scorer_unittest.cc',
diff --git a/chrome/common/safe_browsing/client_model.proto b/chrome/common/safe_browsing/client_model.proto
index 622ba8f..6a1a912 100644
--- a/chrome/common/safe_browsing/client_model.proto
+++ b/chrome/common/safe_browsing/client_model.proto
@@ -60,11 +60,12 @@ message ClientSideModel {
   // as lowercase UTF-8 strings.
   repeated int32 page_term = 3;
 
-  // List of indexes that point to the hashed page words. The page words
-  // correspond to all words that appear in page terms. If the term
-  // "one two" is in the list of page terms then "one" and "two" will be
-  // in the list of page words.
-  repeated int32 page_word = 4;
+  // List of hashed page words. The page words correspond to all words that
+  // appear in page terms. If the term "one two" is in the list of page terms
+  // then "one" and "two" will be in the list of page words. For page words
+  // we don't use SHA256 because it is too expensive. We use MurmurHash3
+  // instead. See: http://code.google.com/p/smhasher.
+  repeated fixed32 page_word = 4;
 
   // Page terms in page_term contain at most this many page words.
   required int32 max_words_per_term = 5;
@@ -84,4 +85,7 @@ message ClientSideModel {
     optional int32 size = 2 [default = 128];
   };
   repeated IPSubnet bad_subnet = 7;
+
+  // Murmur hash seed that was used to hash the page words.
+  optional fixed32 murmur_hash_seed = 8;
 }
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util.cc b/chrome/renderer/safe_browsing/murmurhash3_util.cc
new file mode 100644
index 0000000..9569b32
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util.cc
@@ -0,0 +1,16 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
+#include "third_party/smhasher/src/MurmurHash3.h"
+
+namespace safe_browsing {
+
+uint32 MurmurHash3String(const std::string& str, uint32 seed) {
+  uint32 output;
+  MurmurHash3_x86_32(str.data(), str.size(), seed, &output);
+  return output;
+}
+
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util.h b/chrome/renderer/safe_browsing/murmurhash3_util.h
new file mode 100644
index 0000000..8569c77
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
+#define CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
+
+#include <string>
+#include "base/basictypes.h"
+
+namespace safe_browsing {
+
+// Runs the 32-bit murmurhash3 function on the given string and returns the
+// output as a uint32.
+uint32 MurmurHash3String(const std::string& str, uint32 seed);
+
+}  // namespace safe_browsing
+
+#endif  // CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc b/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc
new file mode 100644
index 0000000..f550041
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Unit test to verify basic operation of murmurhash3.
+
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
+
+#include <string>
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace safe_browsing {
+
+TEST(MurmurHash3UtilTest, MurmurHash3String) {
+  EXPECT_EQ(893017187U, MurmurHash3String("abcd", 1234U));
+  EXPECT_EQ(3322282861U, MurmurHash3String("abcde", 56789U));
+}
+
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_classifier.cc b/chrome/renderer/safe_browsing/phishing_classifier.cc
index f2323d1..1da1a50 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier.cc
@@ -60,6 +60,7 @@ void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
         &scorer_->page_terms(),
         &scorer_->page_words(),
         scorer_->max_words_per_term(),
+        scorer_->murmurhash3_seed(),
         clock_.get()));
 }
 
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
index e92ed70..d2a75df 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
@@ -18,6 +18,7 @@
 #include "chrome/common/safe_browsing/csd.pb.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
 #include "chrome/renderer/safe_browsing/render_view_fake_resources_test.h"
 #include "chrome/renderer/safe_browsing/scorer.h"
 #include "crypto/sha2.h"
@@ -72,7 +73,8 @@ class PhishingClassifierTest : public RenderViewFakeResourcesTest {
     rule->set_weight(1.0);
 
     model.add_page_term(3);
-    model.add_page_word(3);
+    model.set_murmur_hash_seed(2777808611U);
+    model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
     model.set_max_words_per_term(1);
 
     clock_ = new MockFeatureExtractorClock;
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 404a0b3..ff3a50e8 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -17,6 +17,7 @@
 #include "crypto/sha2.h"
 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/features.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
 #include "ui/base/l10n/l10n_util.h"
 #include "unicode/ubrk.h"
 
@@ -40,7 +41,7 @@ const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
 
 // All of the state pertaining to the current feature extraction.
 struct PhishingTermFeatureExtractor::ExtractionState {
-  // Stores up to max_words_per_ngram_ previous words separated by spaces.
+  // Stores up to max_words_per_term_ previous words separated by spaces.
   std::string previous_words;
 
   // Stores the sizes of the words in previous_words. Note: the size includes
@@ -90,12 +91,14 @@ struct PhishingTermFeatureExtractor::ExtractionState {
 
 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
     const base::hash_set<std::string>* page_term_hashes,
-    const base::hash_set<std::string>* page_word_hashes,
+    const base::hash_set<uint32>* page_word_hashes,
     size_t max_words_per_term,
+    uint32 murmurhash3_seed,
     FeatureExtractorClock* clock)
     : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
+      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
@@ -206,8 +209,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
 void PhishingTermFeatureExtractor::HandleWord(
     const base::StringPiece16& word) {
   // Quickest out if we have seen this word before and know that it's not
-  // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
-  // all of which are relatively expensive.
+  // part of any term. This avoids the lowercasing and UTF conversion, both of
+  // which are relatively expensive.
   if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
     // We know we're no longer in a possible n-gram, so clear the previous word
     // state.
@@ -217,7 +220,7 @@ void PhishingTermFeatureExtractor::HandleWord(
   }
 
   std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
-  std::string word_hash = crypto::SHA256HashString(word_lower);
+  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
 
   // Quick out if the word is not part of any term, which is the common case.
   if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
@@ -229,11 +232,11 @@ void PhishingTermFeatureExtractor::HandleWord(
     return;
   }
 
-  // Find all of the n-grams that we need to check and compute their hashes.
-  // We already have the hash for word_lower, so we don't compute that again.
+  // Find all of the n-grams that we need to check and compute their SHA-256
+  // hashes.
   std::map<std::string /* hash */, std::string /* plaintext */>
       hashes_to_check;
-  hashes_to_check[word_hash] = word_lower;
+  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;
 
   // Combine the new word with the previous words to find additional n-grams.
   // Note that we don't yet add the new word length to previous_word_sizes,
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index d10b575..4fb26c1 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -41,8 +41,8 @@ class PhishingTermFeatureExtractor {
   // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
   // terms may be multi-word n-grams, with at most |max_words_per_term| words.
   //
-  // |page_word_hashes| contains the hashes for all of the individual words
-  // that make up the terms. Both sets of strings are UTF-8 encoded and
+  // |page_word_hashes| contains the murmur3 hashes for all of the individual
+  // words that make up the terms. Both sets of strings are UTF-8 encoded and
   // lowercased prior to hashing. The caller owns both sets of strings, and
   // must ensure that they are valid until the PhishingTermFeatureExtractor is
   // destroyed.
@@ -51,8 +51,9 @@ class PhishingTermFeatureExtractor {
   // for testing. The caller keeps ownership of the clock.
   PhishingTermFeatureExtractor(
       const base::hash_set<std::string>* page_term_hashes,
-      const base::hash_set<std::string>* page_word_hashes,
+      const base::hash_set<uint32>* page_word_hashes,
       size_t max_words_per_term,
+      uint32 murmurhash3_seed,
       FeatureExtractorClock* clock);
   ~PhishingTermFeatureExtractor();
 
@@ -121,15 +122,18 @@ class PhishingTermFeatureExtractor {
   // All of the term hashes that we are looking for in the page.
   const base::hash_set<std::string>* page_term_hashes_;
 
-  // Hashes of all the individual words in page_term_hashes_. If
+  // Murmur3 hashes of all the individual words in page_term_hashes_. If
   // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
   // would contain (hashed) "one" and "two". We do this so that we can have a
   // quick out in the common case that the current word we are processing
   // doesn't contain any part of one of our terms.
-  const base::hash_set<std::string>* page_word_hashes_;
+  const base::hash_set<uint32>* page_word_hashes_;
 
   // The maximum number of words in an n-gram.
-  size_t max_words_per_term_;
+  const size_t max_words_per_term_;
+
+  // The seed for murmurhash3.
+  const uint32 murmurhash3_seed_;
 
   // This cache is used to see if we need to check the word at all, as
   // converting to UTF8, lowercasing, and hashing are all relatively expensive
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
index ee8d6ed..41c7d6f 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
@@ -18,6 +18,7 @@
 #include "crypto/sha2.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
@@ -61,15 +62,17 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
 
+    static const uint32 kMurmurHash3Seed = 2777808611U;
     for (base::hash_set<std::string>::iterator it = words.begin();
          it != words.end(); ++it) {
-      word_hashes_.insert(crypto::SHA256HashString(*it));
+      word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
     }
 
     extractor_.reset(new PhishingTermFeatureExtractor(
        &term_hashes_,
        &word_hashes_,
        3 /* max_words_per_term */,
+        kMurmurHash3Seed,
        &clock_));
   }
 
@@ -112,7 +115,7 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
   MockFeatureExtractorClock clock_;
   scoped_ptr<PhishingTermFeatureExtractor> extractor_;
   base::hash_set<std::string> term_hashes_;
-  base::hash_set<std::string> word_hashes_;
+  base::hash_set<uint32> word_hashes_;
   bool success_;  // holds the success value from ExtractFeatures
 };
 
diff --git a/chrome/renderer/safe_browsing/scorer.cc b/chrome/renderer/safe_browsing/scorer.cc
index 6a8eaaf..e56bc39 100644
--- a/chrome/renderer/safe_browsing/scorer.cc
+++ b/chrome/renderer/safe_browsing/scorer.cc
@@ -70,7 +70,7 @@ Scorer* Scorer::Create(const base::StringPiece& model_str) {
     scorer->page_terms_.insert(model.hashes(model.page_term(i)));
   }
   for (int i = 0; i < model.page_word_size(); ++i) {
-    scorer->page_words_.insert(model.hashes(model.page_word(i)));
+    scorer->page_words_.insert(model.page_word(i));
   }
   return scorer.release();
 }
@@ -91,7 +91,7 @@ const base::hash_set<std::string>& Scorer::page_terms() const {
   return page_terms_;
 }
 
-const base::hash_set<std::string>& Scorer::page_words() const {
+const base::hash_set<uint32>& Scorer::page_words() const {
   return page_words_;
 }
 
@@ -99,6 +99,10 @@ size_t Scorer::max_words_per_term() const {
   return model_.max_words_per_term();
 }
 
+uint32 Scorer::murmurhash3_seed() const {
+  return model_.murmur_hash_seed();
+}
+
 double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
                                 const FeatureMap& features) const {
   const base::hash_map<std::string, double>& feature_map = features.features();
diff --git a/chrome/renderer/safe_browsing/scorer.h b/chrome/renderer/safe_browsing/scorer.h
index eef5419..e925935 100644
--- a/chrome/renderer/safe_browsing/scorer.h
+++ b/chrome/renderer/safe_browsing/scorer.h
@@ -50,11 +50,14 @@ class Scorer {
 
   // Returns a set of hashed page words that appear in the model in binary
   // format.
-  const base::hash_set<std::string>& page_words() const;
+  const base::hash_set<uint32>& page_words() const;
 
   // Return the maximum number of words per term for the loaded model.
   size_t max_words_per_term() const;
 
+  // Returns the murmurhash3 seed for the loaded model.
+  uint32 murmurhash3_seed() const;
+
  protected:
   // Most clients should use the factory method. This constructor is public
   // to allow for mock implementations.
@@ -73,7 +76,7 @@ class Scorer {
 
   ClientSideModel model_;
   base::hash_set<std::string> page_terms_;
-  base::hash_set<std::string> page_words_;
+  base::hash_set<uint32> page_words_;
 
   DISALLOW_COPY_AND_ASSIGN(Scorer);
 };
diff --git a/chrome/renderer/safe_browsing/scorer_unittest.cc b/chrome/renderer/safe_browsing/scorer_unittest.cc
index e017426..d588017 100644
--- a/chrome/renderer/safe_browsing/scorer_unittest.cc
+++ b/chrome/renderer/safe_browsing/scorer_unittest.cc
@@ -31,9 +31,6 @@ class PhishingScorerTest : public ::testing::Test {
     model_.add_hashes("feature3");
     model_.add_hashes("token one");
     model_.add_hashes("token two");
-    model_.add_hashes("token");
-    model_.add_hashes("one");
-    model_.add_hashes("two");
 
     ClientSideModel::Rule* rule;
     rule = model_.add_rule();
@@ -51,11 +48,14 @@ class PhishingScorerTest : public ::testing::Test {
     model_.add_page_term(3);  // token one
     model_.add_page_term(4);  // token two
 
-    model_.add_page_word(5);  // token
-    model_.add_page_word(6);  // one
-    model_.add_page_word(7);  // two
+    // These will be murmur3 hashes, but for this test it's not necessary
+    // that the hashes correspond to actual words.
+    model_.add_page_word(1000U);
+    model_.add_page_word(2000U);
+    model_.add_page_word(3000U);
 
     model_.set_max_words_per_term(2);
+    model_.set_murmur_hash_seed(12345U);
   }
 
   ClientSideModel model_;
@@ -89,12 +89,14 @@ TEST_F(PhishingScorerTest, PageTerms) {
 TEST_F(PhishingScorerTest, PageWords) {
   scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString()));
   ASSERT_TRUE(scorer.get());
-  base::hash_set<std::string> expected_page_words;
-  expected_page_words.insert("token");
-  expected_page_words.insert("one");
-  expected_page_words.insert("two");
+  base::hash_set<uint32> expected_page_words;
+  expected_page_words.insert(1000U);
+  expected_page_words.insert(2000U);
+  expected_page_words.insert(3000U);
   EXPECT_THAT(scorer->page_words(),
               ::testing::ContainerEq(expected_page_words));
+  EXPECT_EQ(2U, scorer->max_words_per_term());
+  EXPECT_EQ(12345U, scorer->murmurhash3_seed());
 }
 
 TEST_F(PhishingScorerTest, ComputeScore) {
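A note on the producer side: whatever pipeline generates client_model_v4.pb must populate the new fields consistently, since every page_word entry has to be the 32-bit MurmurHash3 of a lowercase UTF-8 word computed with the same seed that is stored in murmur_hash_seed, or the renderer-side lookups will never match. The following hypothetical sketch is not part of this change; the generated header name chrome/common/safe_browsing/client_model.pb.h (standard protoc output for client_model.proto) and the AddPageWords helper are assumptions for illustration only.

// Hypothetical model-producer helper: stores the seed and the seeded word
// hashes together so the renderer's PhishingTermFeatureExtractor can find
// matching page_word entries.
#include <string>
#include <vector>

#include "chrome/common/safe_browsing/client_model.pb.h"    // ClientSideModel (assumed header name)
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"  // MurmurHash3String()

namespace safe_browsing {

void AddPageWords(const std::vector<std::string>& lowercase_words,
                  uint32 seed,
                  ClientSideModel* model) {
  model->set_murmur_hash_seed(seed);
  for (size_t i = 0; i < lowercase_words.size(); ++i) {
    // Each fixed32 page_word entry is the seeded MurmurHash3 of one lowercase
    // UTF-8 word, mirroring the hashing done in HandleWord().
    model->add_page_word(MurmurHash3String(lowercase_words[i], seed));
  }
}

}  // namespace safe_browsing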