summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-09-13 04:21:47 +0000
committerbryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-09-13 04:21:47 +0000
commit3571205b49a19bf6a2420cb3164f9e386f0758f9 (patch)
tree08c7d6c403c7607781ed46863e2b3268adab4c1a
parentffd7d7365317669d36edaaec4c49b1c0842a1dfc (diff)
downloadchromium_src-3571205b49a19bf6a2420cb3164f9e386f0758f9.zip
chromium_src-3571205b49a19bf6a2420cb3164f9e386f0758f9.tar.gz
chromium_src-3571205b49a19bf6a2420cb3164f9e386f0758f9.tar.bz2
Switch to the new client-side phishing model that uses Murmurhash for word hashes.
This gives about a 50% performance improvement on extraction of page term features. BUG=96060 TEST=updated existing tests Review URL: http://codereview.chromium.org/7866011 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@100858 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--chrome/browser/safe_browsing/client_side_detection_service.cc10
-rw-r--r--chrome/browser/safe_browsing/client_side_detection_service_unittest.cc8
-rw-r--r--chrome/chrome_renderer.gypi3
-rw-r--r--chrome/chrome_tests.gypi1
-rw-r--r--chrome/common/safe_browsing/client_model.proto14
-rw-r--r--chrome/renderer/safe_browsing/murmurhash3_util.cc16
-rw-r--r--chrome/renderer/safe_browsing/murmurhash3_util.h19
-rw-r--r--chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc19
-rw-r--r--chrome/renderer/safe_browsing/phishing_classifier.cc1
-rw-r--r--chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc4
-rw-r--r--chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc19
-rw-r--r--chrome/renderer/safe_browsing/phishing_term_feature_extractor.h16
-rw-r--r--chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc7
-rw-r--r--chrome/renderer/safe_browsing/scorer.cc8
-rw-r--r--chrome/renderer/safe_browsing/scorer.h7
-rw-r--r--chrome/renderer/safe_browsing/scorer_unittest.cc22
16 files changed, 121 insertions, 53 deletions
diff --git a/chrome/browser/safe_browsing/client_side_detection_service.cc b/chrome/browser/safe_browsing/client_side_detection_service.cc
index 1a7b8f7..9bc0be2 100644
--- a/chrome/browser/safe_browsing/client_side_detection_service.cc
+++ b/chrome/browser/safe_browsing/client_side_detection_service.cc
@@ -50,11 +50,8 @@ const base::TimeDelta ClientSideDetectionService::kPositiveCacheInterval =
const char ClientSideDetectionService::kClientReportPhishingUrl[] =
"https://sb-ssl.google.com/safebrowsing/clientreport/phishing";
-// Note: when updatng the model version, don't forget to change the filename
-// in chrome/common/chrome_constants.cc as well, or else existing users won't
-// download the new model.
const char ClientSideDetectionService::kClientModelUrl[] =
- "https://ssl.gstatic.com/safebrowsing/csd/client_model_v3.pb";
+ "https://ssl.gstatic.com/safebrowsing/csd/client_model_v4.pb";
struct ClientSideDetectionService::ClientReportInfo {
scoped_ptr<ClientReportPhishingRequestCallback> callback;
@@ -600,11 +597,6 @@ bool ClientSideDetectionService::ModelHasValidHashIds(
return false;
}
}
- for (int i = 0; i < model.page_word_size(); ++i) {
- if (model.page_word(i) < 0 || model.page_word(i) > max_index) {
- return false;
- }
- }
return true;
}
diff --git a/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc b/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
index 09a3716..4dcbf79 100644
--- a/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
+++ b/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
@@ -565,7 +565,6 @@ TEST_F(ClientSideDetectionServiceTest, ModelHasValidHashIds) {
model.add_hashes("bla");
EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
model.add_page_term(0);
- model.add_page_word(0);
EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
model.add_page_term(-1);
@@ -575,13 +574,6 @@ TEST_F(ClientSideDetectionServiceTest, ModelHasValidHashIds) {
model.set_page_term(1, 0);
EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
- model.add_page_word(-2);
- EXPECT_FALSE(ClientSideDetectionService::ModelHasValidHashIds(model));
- model.set_page_word(1, 2);
- EXPECT_FALSE(ClientSideDetectionService::ModelHasValidHashIds(model));
- model.set_page_word(1, 0);
- EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
-
// Test bad rules.
model.add_hashes("blu");
ClientSideModel::Rule* rule = model.add_rule();
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index 8f63b70..63195b1 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -24,6 +24,7 @@
'../third_party/icu/icu.gyp:icui18n',
'../third_party/icu/icu.gyp:icuuc',
'../third_party/npapi/npapi.gyp:npapi',
+ '../third_party/smhasher/smhasher.gyp:murmurhash3',
'../third_party/WebKit/Source/WebKit/chromium/WebKit.gyp:webkit',
'../ui/gfx/surface/surface.gyp:surface',
'../webkit/support/webkit_support.gyp:glue',
@@ -140,6 +141,8 @@
'renderer/safe_browsing/features.h',
'renderer/safe_browsing/malware_dom_details.cc',
'renderer/safe_browsing/malware_dom_details.h',
+ 'renderer/safe_browsing/murmurhash3_util.cc',
+ 'renderer/safe_browsing/murmurhash3_util.h',
'renderer/safe_browsing/phishing_classifier.cc',
'renderer/safe_browsing/phishing_classifier.h',
'renderer/safe_browsing/phishing_classifier_delegate.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index 386bcfd..6026795 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -1806,6 +1806,7 @@
'renderer/plugin_uma_unittest.cc',
'renderer/renderer_about_handler_unittest.cc',
'renderer/safe_browsing/features_unittest.cc',
+ 'renderer/safe_browsing/murmurhash3_util_unittest.cc',
'renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc',
'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
'renderer/safe_browsing/scorer_unittest.cc',
diff --git a/chrome/common/safe_browsing/client_model.proto b/chrome/common/safe_browsing/client_model.proto
index 622ba8f..6a1a912 100644
--- a/chrome/common/safe_browsing/client_model.proto
+++ b/chrome/common/safe_browsing/client_model.proto
@@ -60,11 +60,12 @@ message ClientSideModel {
// as lowercase UTF-8 strings.
repeated int32 page_term = 3;
- // List of indexes that point to the hashed page words. The page words
- // correspond to all words that appear in page terms. If the term
- // "one two" is in the list of page terms then "one" and "two" will be
- // in the list of page words.
- repeated int32 page_word = 4;
+ // List of hashed page words. The page words correspond to all words that
+ // appear in page terms. If the term "one two" is in the list of page terms
+ // then "one" and "two" will be in the list of page words. For page words
+ // we don't use SHA256 because it is too expensive. We use MurmurHash3
+ // instead. See: http://code.google.com/p/smhasher.
+ repeated fixed32 page_word = 4;
// Page terms in page_term contain at most this many page words.
required int32 max_words_per_term = 5;
@@ -84,4 +85,7 @@ message ClientSideModel {
optional int32 size = 2 [default = 128];
};
repeated IPSubnet bad_subnet = 7;
+
+ // Murmur hash seed that was used to hash the page words.
+ optional fixed32 murmur_hash_seed = 8;
}
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util.cc b/chrome/renderer/safe_browsing/murmurhash3_util.cc
new file mode 100644
index 0000000..9569b32
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util.cc
@@ -0,0 +1,16 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
+#include "third_party/smhasher/src/MurmurHash3.h"
+
+namespace safe_browsing {
+
+uint32 MurmurHash3String(const std::string& str, uint32 seed) {
+ uint32 output;
+ MurmurHash3_x86_32(str.data(), str.size(), seed, &output);
+ return output;
+}
+
+} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util.h b/chrome/renderer/safe_browsing/murmurhash3_util.h
new file mode 100644
index 0000000..8569c77
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
+#define CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
+
+#include <string>
+#include "base/basictypes.h"
+
+namespace safe_browsing {
+
+// Runs the 32-bit murmurhash3 function on the given string and returns the
+// output as a uint32.
+uint32 MurmurHash3String(const std::string& str, uint32 seed);
+
+} // namespace safe_browsing
+
+#endif // CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc b/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc
new file mode 100644
index 0000000..f550041
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Unit test to verify basic operation of murmurhash3.
+
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
+
+#include <string>
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace safe_browsing {
+
+TEST(MurmurHash3UtilTest, MurmurHash3String) {
+ EXPECT_EQ(893017187U, MurmurHash3String("abcd", 1234U));
+ EXPECT_EQ(3322282861U, MurmurHash3String("abcde", 56789U));
+}
+
+} // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_classifier.cc b/chrome/renderer/safe_browsing/phishing_classifier.cc
index f2323d1..1da1a50 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier.cc
@@ -60,6 +60,7 @@ void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
&scorer_->page_terms(),
&scorer_->page_words(),
scorer_->max_words_per_term(),
+ scorer_->murmurhash3_seed(),
clock_.get()));
}
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
index e92ed70..d2a75df 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
@@ -18,6 +18,7 @@
#include "chrome/common/safe_browsing/csd.pb.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "chrome/renderer/safe_browsing/render_view_fake_resources_test.h"
#include "chrome/renderer/safe_browsing/scorer.h"
#include "crypto/sha2.h"
@@ -72,7 +73,8 @@ class PhishingClassifierTest : public RenderViewFakeResourcesTest {
rule->set_weight(1.0);
model.add_page_term(3);
- model.add_page_word(3);
+ model.set_murmur_hash_seed(2777808611U);
+ model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
model.set_max_words_per_term(1);
clock_ = new MockFeatureExtractorClock;
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 404a0b3..ff3a50e8 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -17,6 +17,7 @@
#include "crypto/sha2.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "ui/base/l10n/l10n_util.h"
#include "unicode/ubrk.h"
@@ -40,7 +41,7 @@ const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
- // Stores up to max_words_per_ngram_ previous words separated by spaces.
+ // Stores up to max_words_per_term_ previous words separated by spaces.
std::string previous_words;
// Stores the sizes of the words in previous_words. Note: the size includes
@@ -90,12 +91,14 @@ struct PhishingTermFeatureExtractor::ExtractionState {
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
const base::hash_set<std::string>* page_term_hashes,
- const base::hash_set<std::string>* page_word_hashes,
+ const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
+ uint32 murmurhash3_seed,
FeatureExtractorClock* clock)
: page_term_hashes_(page_term_hashes),
page_word_hashes_(page_word_hashes),
max_words_per_term_(max_words_per_term),
+ murmurhash3_seed_(murmurhash3_seed),
negative_word_cache_(kMaxNegativeWordCacheSize),
clock_(clock),
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
@@ -206,8 +209,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
void PhishingTermFeatureExtractor::HandleWord(
const base::StringPiece16& word) {
// Quickest out if we have seen this word before and know that it's not
- // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
- // all of which are relatively expensive.
+ // part of any term. This avoids the lowercasing and UTF conversion, both of
+ // which are relatively expensive.
if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
// We know we're no longer in a possible n-gram, so clear the previous word
// state.
@@ -217,7 +220,7 @@ void PhishingTermFeatureExtractor::HandleWord(
}
std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
- std::string word_hash = crypto::SHA256HashString(word_lower);
+ uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
// Quick out if the word is not part of any term, which is the common case.
if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
@@ -229,11 +232,11 @@ void PhishingTermFeatureExtractor::HandleWord(
return;
}
- // Find all of the n-grams that we need to check and compute their hashes.
- // We already have the hash for word_lower, so we don't compute that again.
+ // Find all of the n-grams that we need to check and compute their SHA-256
+ // hashes.
std::map<std::string /* hash */, std::string /* plaintext */>
hashes_to_check;
- hashes_to_check[word_hash] = word_lower;
+ hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;
// Combine the new word with the previous words to find additional n-grams.
// Note that we don't yet add the new word length to previous_word_sizes,
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index d10b575..4fb26c1 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -41,8 +41,8 @@ class PhishingTermFeatureExtractor {
// all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
// terms may be multi-word n-grams, with at most |max_words_per_term| words.
//
- // |page_word_hashes| contains the hashes for all of the individual words
- // that make up the terms. Both sets of strings are UTF-8 encoded and
+ // |page_word_hashes| contains the murmur3 hashes for all of the individual
+ // words that make up the terms. Both sets of strings are UTF-8 encoded and
// lowercased prior to hashing. The caller owns both sets of strings, and
// must ensure that they are valid until the PhishingTermFeatureExtractor is
// destroyed.
@@ -51,8 +51,9 @@ class PhishingTermFeatureExtractor {
// for testing. The caller keeps ownership of the clock.
PhishingTermFeatureExtractor(
const base::hash_set<std::string>* page_term_hashes,
- const base::hash_set<std::string>* page_word_hashes,
+ const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
+ uint32 murmurhash3_seed,
FeatureExtractorClock* clock);
~PhishingTermFeatureExtractor();
@@ -121,15 +122,18 @@ class PhishingTermFeatureExtractor {
// All of the term hashes that we are looking for in the page.
const base::hash_set<std::string>* page_term_hashes_;
- // Hashes of all the individual words in page_term_hashes_. If
+ // Murmur3 hashes of all the individual words in page_term_hashes_. If
// page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
// would contain (hashed) "one" and "two". We do this so that we can have a
// quick out in the common case that the current word we are processing
// doesn't contain any part of one of our terms.
- const base::hash_set<std::string>* page_word_hashes_;
+ const base::hash_set<uint32>* page_word_hashes_;
// The maximum number of words in an n-gram.
- size_t max_words_per_term_;
+ const size_t max_words_per_term_;
+
+ // The seed for murmurhash3.
+ const uint32 murmurhash3_seed_;
// This cache is used to see if we need to check the word at all, as
// converting to UTF8, lowercasing, and hashing are all relatively expensive
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
index ee8d6ed..41c7d6f 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
@@ -18,6 +18,7 @@
#include "crypto/sha2.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
@@ -61,15 +62,17 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
words.insert("\xe5\x86\x8d\xe8\xa7\x81");
+ static const uint32 kMurmurHash3Seed = 2777808611U;
for (base::hash_set<std::string>::iterator it = words.begin();
it != words.end(); ++it) {
- word_hashes_.insert(crypto::SHA256HashString(*it));
+ word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
}
extractor_.reset(new PhishingTermFeatureExtractor(
&term_hashes_,
&word_hashes_,
3 /* max_words_per_term */,
+ kMurmurHash3Seed,
&clock_));
}
@@ -112,7 +115,7 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
MockFeatureExtractorClock clock_;
scoped_ptr<PhishingTermFeatureExtractor> extractor_;
base::hash_set<std::string> term_hashes_;
- base::hash_set<std::string> word_hashes_;
+ base::hash_set<uint32> word_hashes_;
bool success_; // holds the success value from ExtractFeatures
};
diff --git a/chrome/renderer/safe_browsing/scorer.cc b/chrome/renderer/safe_browsing/scorer.cc
index 6a8eaaf..e56bc39 100644
--- a/chrome/renderer/safe_browsing/scorer.cc
+++ b/chrome/renderer/safe_browsing/scorer.cc
@@ -70,7 +70,7 @@ Scorer* Scorer::Create(const base::StringPiece& model_str) {
scorer->page_terms_.insert(model.hashes(model.page_term(i)));
}
for (int i = 0; i < model.page_word_size(); ++i) {
- scorer->page_words_.insert(model.hashes(model.page_word(i)));
+ scorer->page_words_.insert(model.page_word(i));
}
return scorer.release();
}
@@ -91,7 +91,7 @@ const base::hash_set<std::string>& Scorer::page_terms() const {
return page_terms_;
}
-const base::hash_set<std::string>& Scorer::page_words() const {
+const base::hash_set<uint32>& Scorer::page_words() const {
return page_words_;
}
@@ -99,6 +99,10 @@ size_t Scorer::max_words_per_term() const {
return model_.max_words_per_term();
}
+uint32 Scorer::murmurhash3_seed() const {
+ return model_.murmur_hash_seed();
+}
+
double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
const FeatureMap& features) const {
const base::hash_map<std::string, double>& feature_map = features.features();
diff --git a/chrome/renderer/safe_browsing/scorer.h b/chrome/renderer/safe_browsing/scorer.h
index eef5419..e925935 100644
--- a/chrome/renderer/safe_browsing/scorer.h
+++ b/chrome/renderer/safe_browsing/scorer.h
@@ -50,11 +50,14 @@ class Scorer {
// Returns a set of hashed page words that appear in the model in binary
// format.
- const base::hash_set<std::string>& page_words() const;
+ const base::hash_set<uint32>& page_words() const;
// Return the maximum number of words per term for the loaded model.
size_t max_words_per_term() const;
+ // Returns the murmurhash3 seed for the loaded model.
+ uint32 murmurhash3_seed() const;
+
protected:
// Most clients should use the factory method. This constructor is public
// to allow for mock implementations.
@@ -73,7 +76,7 @@ class Scorer {
ClientSideModel model_;
base::hash_set<std::string> page_terms_;
- base::hash_set<std::string> page_words_;
+ base::hash_set<uint32> page_words_;
DISALLOW_COPY_AND_ASSIGN(Scorer);
};
diff --git a/chrome/renderer/safe_browsing/scorer_unittest.cc b/chrome/renderer/safe_browsing/scorer_unittest.cc
index e017426..d588017 100644
--- a/chrome/renderer/safe_browsing/scorer_unittest.cc
+++ b/chrome/renderer/safe_browsing/scorer_unittest.cc
@@ -31,9 +31,6 @@ class PhishingScorerTest : public ::testing::Test {
model_.add_hashes("feature3");
model_.add_hashes("token one");
model_.add_hashes("token two");
- model_.add_hashes("token");
- model_.add_hashes("one");
- model_.add_hashes("two");
ClientSideModel::Rule* rule;
rule = model_.add_rule();
@@ -51,11 +48,14 @@ class PhishingScorerTest : public ::testing::Test {
model_.add_page_term(3); // token one
model_.add_page_term(4); // token two
- model_.add_page_word(5); // token
- model_.add_page_word(6); // one
- model_.add_page_word(7); // two
+ // These will be murmur3 hashes, but for this test it's not necessary
+ // that the hashes correspond to actual words.
+ model_.add_page_word(1000U);
+ model_.add_page_word(2000U);
+ model_.add_page_word(3000U);
model_.set_max_words_per_term(2);
+ model_.set_murmur_hash_seed(12345U);
}
ClientSideModel model_;
@@ -89,12 +89,14 @@ TEST_F(PhishingScorerTest, PageTerms) {
TEST_F(PhishingScorerTest, PageWords) {
scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString()));
ASSERT_TRUE(scorer.get());
- base::hash_set<std::string> expected_page_words;
- expected_page_words.insert("token");
- expected_page_words.insert("one");
- expected_page_words.insert("two");
+ base::hash_set<uint32> expected_page_words;
+ expected_page_words.insert(1000U);
+ expected_page_words.insert(2000U);
+ expected_page_words.insert(3000U);
EXPECT_THAT(scorer->page_words(),
::testing::ContainerEq(expected_page_words));
+ EXPECT_EQ(2U, scorer->max_words_per_term());
+ EXPECT_EQ(12345U, scorer->murmurhash3_seed());
}
TEST_F(PhishingScorerTest, ComputeScore) {