author     bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2011-09-13 04:21:47 +0000
committer  bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2011-09-13 04:21:47 +0000
commit     3571205b49a19bf6a2420cb3164f9e386f0758f9 (patch)
tree       08c7d6c403c7607781ed46863e2b3268adab4c1a
parent     ffd7d7365317669d36edaaec4c49b1c0842a1dfc (diff)
Switch to the new client-side phishing model that uses Murmurhash for word hashes.
This gives about a 50% performance improvement on extraction of page term features.
BUG=96060
TEST=updated existing tests
Review URL: http://codereview.chromium.org/7866011
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@100858 0039d316-1c4b-4281-b951-d872f2087c98
16 files changed, 121 insertions, 53 deletions
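The core of the change: individual page words are now matched by seeded 32-bit MurmurHash3 values instead of SHA-256 digests, and SHA-256 is only computed for the full n-gram terms that survive that word-level check. The sketch below illustrates this fast path outside Chromium's class structure. It is not part of the patch: HashWord and MightBePartOfTerm are illustrative names, std::unordered_set stands in for base::hash_set, and the only external API assumed is smhasher's MurmurHash3_x86_32, which the patch itself calls.

// Sketch of the word-level fast path introduced by this patch: each lowercase
// UTF-8 word is hashed with seeded 32-bit MurmurHash3 and looked up in the
// model's page_word set before any SHA-256 work is done for full terms.
#include <cstdint>
#include <string>
#include <unordered_set>

#include "third_party/smhasher/src/MurmurHash3.h"  // MurmurHash3_x86_32()

// Mirrors MurmurHash3String() from murmurhash3_util.cc.
uint32_t HashWord(const std::string& lowercase_word, uint32_t seed) {
  uint32_t output = 0;
  MurmurHash3_x86_32(lowercase_word.data(),
                     static_cast<int>(lowercase_word.size()), seed, &output);
  return output;
}

// Quick membership check against the model's page_word values (fixed32 in
// client_model.proto).  std::unordered_set stands in for base::hash_set here.
bool MightBePartOfTerm(const std::string& lowercase_word,
                       const std::unordered_set<uint32_t>& page_word_hashes,
                       uint32_t murmurhash3_seed) {
  return page_word_hashes.count(HashWord(lowercase_word, murmurhash3_seed)) > 0;
}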
diff --git a/chrome/browser/safe_browsing/client_side_detection_service.cc b/chrome/browser/safe_browsing/client_side_detection_service.cc
index 1a7b8f7..9bc0be2 100644
--- a/chrome/browser/safe_browsing/client_side_detection_service.cc
+++ b/chrome/browser/safe_browsing/client_side_detection_service.cc
@@ -50,11 +50,8 @@ const base::TimeDelta ClientSideDetectionService::kPositiveCacheInterval =
 const char ClientSideDetectionService::kClientReportPhishingUrl[] =
     "https://sb-ssl.google.com/safebrowsing/clientreport/phishing";
 
-// Note: when updatng the model version, don't forget to change the filename
-// in chrome/common/chrome_constants.cc as well, or else existing users won't
-// download the new model.
 const char ClientSideDetectionService::kClientModelUrl[] =
-    "https://ssl.gstatic.com/safebrowsing/csd/client_model_v3.pb";
+    "https://ssl.gstatic.com/safebrowsing/csd/client_model_v4.pb";
 
 struct ClientSideDetectionService::ClientReportInfo {
   scoped_ptr<ClientReportPhishingRequestCallback> callback;
@@ -600,11 +597,6 @@ bool ClientSideDetectionService::ModelHasValidHashIds(
       return false;
     }
   }
-  for (int i = 0; i < model.page_word_size(); ++i) {
-    if (model.page_word(i) < 0 || model.page_word(i) > max_index) {
-      return false;
-    }
-  }
   return true;
 }
 
diff --git a/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc b/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
index 09a3716..4dcbf79 100644
--- a/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
+++ b/chrome/browser/safe_browsing/client_side_detection_service_unittest.cc
@@ -565,7 +565,6 @@ TEST_F(ClientSideDetectionServiceTest, ModelHasValidHashIds) {
   model.add_hashes("bla");
   EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
   model.add_page_term(0);
-  model.add_page_word(0);
   EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
 
   model.add_page_term(-1);
@@ -575,13 +574,6 @@ TEST_F(ClientSideDetectionServiceTest, ModelHasValidHashIds) {
   model.set_page_term(1, 0);
   EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
 
-  model.add_page_word(-2);
-  EXPECT_FALSE(ClientSideDetectionService::ModelHasValidHashIds(model));
-  model.set_page_word(1, 2);
-  EXPECT_FALSE(ClientSideDetectionService::ModelHasValidHashIds(model));
-  model.set_page_word(1, 0);
-  EXPECT_TRUE(ClientSideDetectionService::ModelHasValidHashIds(model));
-
   // Test bad rules.
   model.add_hashes("blu");
   ClientSideModel::Rule* rule = model.add_rule();
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi
index 8f63b70..63195b1 100644
--- a/chrome/chrome_renderer.gypi
+++ b/chrome/chrome_renderer.gypi
@@ -24,6 +24,7 @@
         '../third_party/icu/icu.gyp:icui18n',
         '../third_party/icu/icu.gyp:icuuc',
         '../third_party/npapi/npapi.gyp:npapi',
+        '../third_party/smhasher/smhasher.gyp:murmurhash3',
         '../third_party/WebKit/Source/WebKit/chromium/WebKit.gyp:webkit',
         '../ui/gfx/surface/surface.gyp:surface',
         '../webkit/support/webkit_support.gyp:glue',
@@ -140,6 +141,8 @@
         'renderer/safe_browsing/features.h',
         'renderer/safe_browsing/malware_dom_details.cc',
         'renderer/safe_browsing/malware_dom_details.h',
+        'renderer/safe_browsing/murmurhash3_util.cc',
+        'renderer/safe_browsing/murmurhash3_util.h',
         'renderer/safe_browsing/phishing_classifier.cc',
         'renderer/safe_browsing/phishing_classifier.h',
         'renderer/safe_browsing/phishing_classifier_delegate.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index 386bcfd..6026795 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -1806,6 +1806,7 @@
         'renderer/plugin_uma_unittest.cc',
         'renderer/renderer_about_handler_unittest.cc',
         'renderer/safe_browsing/features_unittest.cc',
+        'renderer/safe_browsing/murmurhash3_util_unittest.cc',
         'renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc',
         'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc',
         'renderer/safe_browsing/scorer_unittest.cc',
diff --git a/chrome/common/safe_browsing/client_model.proto b/chrome/common/safe_browsing/client_model.proto
index 622ba8f..6a1a912 100644
--- a/chrome/common/safe_browsing/client_model.proto
+++ b/chrome/common/safe_browsing/client_model.proto
@@ -60,11 +60,12 @@ message ClientSideModel {
   // as lowercase UTF-8 strings.
   repeated int32 page_term = 3;
 
-  // List of indexes that point to the hashed page words. The page words
-  // correspond to all words that appear in page terms. If the term
-  // "one two" is in the list of page terms then "one" and "two" will be
-  // in the list of page words.
-  repeated int32 page_word = 4;
+  // List of hashed page words. The page words correspond to all words that
+  // appear in page terms. If the term "one two" is in the list of page terms
+  // then "one" and "two" will be in the list of page words. For page words
+  // we don't use SHA256 because it is too expensive. We use MurmurHash3
+  // instead. See: http://code.google.com/p/smhasher.
+  repeated fixed32 page_word = 4;
 
   // Page terms in page_term contain at most this many page words.
   required int32 max_words_per_term = 5;
@@ -84,4 +85,7 @@ message ClientSideModel {
     optional int32 size = 2 [default = 128];
   };
   repeated IPSubnet bad_subnet = 7;
+
+  // Murmur hash seed that was used to hash the page words.
+  optional fixed32 murmur_hash_seed = 8;
 }
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util.cc b/chrome/renderer/safe_browsing/murmurhash3_util.cc
new file mode 100644
index 0000000..9569b32
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util.cc
@@ -0,0 +1,16 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
+#include "third_party/smhasher/src/MurmurHash3.h"
+
+namespace safe_browsing {
+
+uint32 MurmurHash3String(const std::string& str, uint32 seed) {
+  uint32 output;
+  MurmurHash3_x86_32(str.data(), str.size(), seed, &output);
+  return output;
+}
+
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util.h b/chrome/renderer/safe_browsing/murmurhash3_util.h
new file mode 100644
index 0000000..8569c77
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
+#define CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
+
+#include <string>
+#include "base/basictypes.h"
+
+namespace safe_browsing {
+
+// Runs the 32-bit murmurhash3 function on the given string and returns the
+// output as a uint32.
+uint32 MurmurHash3String(const std::string& str, uint32 seed);
+
+}  // namespace safe_browsing
+
+#endif  // CHROME_RENDERER_SAFE_BROWSING_MURMURHASH3_UTIL_H_
diff --git a/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc b/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc
new file mode 100644
index 0000000..f550041
--- /dev/null
+++ b/chrome/renderer/safe_browsing/murmurhash3_util_unittest.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Unit test to verify basic operation of murmurhash3.
+
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
+
+#include <string>
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace safe_browsing {
+
+TEST(MurmurHash3UtilTest, MurmurHash3String) {
+  EXPECT_EQ(893017187U, MurmurHash3String("abcd", 1234U));
+  EXPECT_EQ(3322282861U, MurmurHash3String("abcde", 56789U));
+}
+
+}  // namespace safe_browsing
diff --git a/chrome/renderer/safe_browsing/phishing_classifier.cc b/chrome/renderer/safe_browsing/phishing_classifier.cc
index f2323d1..1da1a50 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier.cc
@@ -60,6 +60,7 @@ void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
         &scorer_->page_terms(),
         &scorer_->page_words(),
         scorer_->max_words_per_term(),
+        scorer_->murmurhash3_seed(),
         clock_.get()));
 }
 
diff --git a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
index e92ed70..d2a75df 100644
--- a/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
+++ b/chrome/renderer/safe_browsing/phishing_classifier_browsertest.cc
@@ -18,6 +18,7 @@
 #include "chrome/common/safe_browsing/csd.pb.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
 #include "chrome/renderer/safe_browsing/render_view_fake_resources_test.h"
 #include "chrome/renderer/safe_browsing/scorer.h"
 #include "crypto/sha2.h"
@@ -72,7 +73,8 @@ class PhishingClassifierTest : public RenderViewFakeResourcesTest {
     rule->set_weight(1.0);
 
     model.add_page_term(3);
-    model.add_page_word(3);
+    model.set_murmur_hash_seed(2777808611U);
+    model.add_page_word(MurmurHash3String("login", model.murmur_hash_seed()));
     model.set_max_words_per_term(1);
 
     clock_ = new MockFeatureExtractorClock;
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 404a0b3..ff3a50e8 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -17,6 +17,7 @@
 #include "crypto/sha2.h"
 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/features.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
 #include "ui/base/l10n/l10n_util.h"
 #include "unicode/ubrk.h"
 
@@ -40,7 +41,7 @@ const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
 
 // All of the state pertaining to the current feature extraction.
 struct PhishingTermFeatureExtractor::ExtractionState {
-  // Stores up to max_words_per_ngram_ previous words separated by spaces.
+  // Stores up to max_words_per_term_ previous words separated by spaces.
   std::string previous_words;
 
   // Stores the sizes of the words in previous_words. Note: the size includes
@@ -90,12 +91,14 @@ struct PhishingTermFeatureExtractor::ExtractionState {
 
 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
     const base::hash_set<std::string>* page_term_hashes,
-    const base::hash_set<std::string>* page_word_hashes,
+    const base::hash_set<uint32>* page_word_hashes,
     size_t max_words_per_term,
+    uint32 murmurhash3_seed,
     FeatureExtractorClock* clock)
     : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
+      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
@@ -206,8 +209,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
 void PhishingTermFeatureExtractor::HandleWord(
     const base::StringPiece16& word) {
   // Quickest out if we have seen this word before and know that it's not
-  // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
-  // all of which are relatively expensive.
+  // part of any term. This avoids the lowercasing and UTF conversion, both of
+  // which are relatively expensive.
   if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
     // We know we're no longer in a possible n-gram, so clear the previous word
     // state.
@@ -217,7 +220,7 @@ void PhishingTermFeatureExtractor::HandleWord(
   }
 
   std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
-  std::string word_hash = crypto::SHA256HashString(word_lower);
+  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);
 
   // Quick out if the word is not part of any term, which is the common case.
   if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
@@ -229,11 +232,11 @@ void PhishingTermFeatureExtractor::HandleWord(
     return;
   }
 
-  // Find all of the n-grams that we need to check and compute their hashes.
-  // We already have the hash for word_lower, so we don't compute that again.
+  // Find all of the n-grams that we need to check and compute their SHA-256
+  // hashes.
   std::map<std::string /* hash */, std::string /* plaintext */>
       hashes_to_check;
-  hashes_to_check[word_hash] = word_lower;
+  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;
 
   // Combine the new word with the previous words to find additional n-grams.
   // Note that we don't yet add the new word length to previous_word_sizes,
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index d10b575..4fb26c1 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -41,8 +41,8 @@ class PhishingTermFeatureExtractor {
   // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
   // terms may be multi-word n-grams, with at most |max_words_per_term| words.
   //
-  // |page_word_hashes| contains the hashes for all of the individual words
-  // that make up the terms. Both sets of strings are UTF-8 encoded and
+  // |page_word_hashes| contains the murmur3 hashes for all of the individual
+  // words that make up the terms. Both sets of strings are UTF-8 encoded and
   // lowercased prior to hashing. The caller owns both sets of strings, and
   // must ensure that they are valid until the PhishingTermFeatureExtractor is
   // destroyed.
@@ -51,8 +51,9 @@ class PhishingTermFeatureExtractor {
   // for testing. The caller keeps ownership of the clock.
   PhishingTermFeatureExtractor(
       const base::hash_set<std::string>* page_term_hashes,
-      const base::hash_set<std::string>* page_word_hashes,
+      const base::hash_set<uint32>* page_word_hashes,
       size_t max_words_per_term,
+      uint32 murmurhash3_seed,
       FeatureExtractorClock* clock);
   ~PhishingTermFeatureExtractor();
 
@@ -121,15 +122,18 @@ class PhishingTermFeatureExtractor {
   // All of the term hashes that we are looking for in the page.
   const base::hash_set<std::string>* page_term_hashes_;
 
-  // Hashes of all the individual words in page_term_hashes_. If
+  // Murmur3 hashes of all the individual words in page_term_hashes_. If
   // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
   // would contain (hashed) "one" and "two". We do this so that we can have a
   // quick out in the common case that the current word we are processing
   // doesn't contain any part of one of our terms.
-  const base::hash_set<std::string>* page_word_hashes_;
+  const base::hash_set<uint32>* page_word_hashes_;
 
   // The maximum number of words in an n-gram.
-  size_t max_words_per_term_;
+  const size_t max_words_per_term_;
+
+  // The seed for murmurhash3.
+  const uint32 murmurhash3_seed_;
 
   // This cache is used to see if we need to check the word at all, as
   // converting to UTF8, lowercasing, and hashing are all relatively expensive
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
index ee8d6ed..41c7d6f 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc
@@ -18,6 +18,7 @@
 #include "crypto/sha2.h"
 #include "chrome/renderer/safe_browsing/features.h"
 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
+#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
 #include "testing/gmock/include/gmock/gmock.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
@@ -61,15 +62,17 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
     words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
     words.insert("\xe5\x86\x8d\xe8\xa7\x81");
 
+    static const uint32 kMurmurHash3Seed = 2777808611U;
     for (base::hash_set<std::string>::iterator it = words.begin();
          it != words.end(); ++it) {
-      word_hashes_.insert(crypto::SHA256HashString(*it));
+      word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
     }
 
     extractor_.reset(new PhishingTermFeatureExtractor(
        &term_hashes_,
        &word_hashes_,
        3 /* max_words_per_term */,
+        kMurmurHash3Seed,
        &clock_));
   }
 
@@ -112,7 +115,7 @@ class PhishingTermFeatureExtractorTest : public ::testing::Test {
   MockFeatureExtractorClock clock_;
   scoped_ptr<PhishingTermFeatureExtractor> extractor_;
   base::hash_set<std::string> term_hashes_;
-  base::hash_set<std::string> word_hashes_;
+  base::hash_set<uint32> word_hashes_;
   bool success_;  // holds the success value from ExtractFeatures
 };
 
diff --git a/chrome/renderer/safe_browsing/scorer.cc b/chrome/renderer/safe_browsing/scorer.cc
index 6a8eaaf..e56bc39 100644
--- a/chrome/renderer/safe_browsing/scorer.cc
+++ b/chrome/renderer/safe_browsing/scorer.cc
@@ -70,7 +70,7 @@ Scorer* Scorer::Create(const base::StringPiece& model_str) {
     scorer->page_terms_.insert(model.hashes(model.page_term(i)));
   }
   for (int i = 0; i < model.page_word_size(); ++i) {
-    scorer->page_words_.insert(model.hashes(model.page_word(i)));
+    scorer->page_words_.insert(model.page_word(i));
   }
   return scorer.release();
 }
@@ -91,7 +91,7 @@ const base::hash_set<std::string>& Scorer::page_terms() const {
   return page_terms_;
 }
 
-const base::hash_set<std::string>& Scorer::page_words() const {
+const base::hash_set<uint32>& Scorer::page_words() const {
   return page_words_;
 }
 
@@ -99,6 +99,10 @@ size_t Scorer::max_words_per_term() const {
   return model_.max_words_per_term();
 }
 
+uint32 Scorer::murmurhash3_seed() const {
+  return model_.murmur_hash_seed();
+}
+
 double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
                                 const FeatureMap& features) const {
   const base::hash_map<std::string, double>& feature_map = features.features();
diff --git a/chrome/renderer/safe_browsing/scorer.h b/chrome/renderer/safe_browsing/scorer.h
index eef5419..e925935 100644
--- a/chrome/renderer/safe_browsing/scorer.h
+++ b/chrome/renderer/safe_browsing/scorer.h
@@ -50,11 +50,14 @@ class Scorer {
 
   // Returns a set of hashed page words that appear in the model in binary
   // format.
-  const base::hash_set<std::string>& page_words() const;
+  const base::hash_set<uint32>& page_words() const;
 
   // Return the maximum number of words per term for the loaded model.
   size_t max_words_per_term() const;
 
+  // Returns the murmurhash3 seed for the loaded model.
+  uint32 murmurhash3_seed() const;
+
  protected:
   // Most clients should use the factory method. This constructor is public
   // to allow for mock implementations.
@@ -73,7 +76,7 @@ class Scorer {
 
   ClientSideModel model_;
   base::hash_set<std::string> page_terms_;
-  base::hash_set<std::string> page_words_;
+  base::hash_set<uint32> page_words_;
 
   DISALLOW_COPY_AND_ASSIGN(Scorer);
 };
diff --git a/chrome/renderer/safe_browsing/scorer_unittest.cc b/chrome/renderer/safe_browsing/scorer_unittest.cc
index e017426..d588017 100644
--- a/chrome/renderer/safe_browsing/scorer_unittest.cc
+++ b/chrome/renderer/safe_browsing/scorer_unittest.cc
@@ -31,9 +31,6 @@ class PhishingScorerTest : public ::testing::Test {
     model_.add_hashes("feature3");
     model_.add_hashes("token one");
     model_.add_hashes("token two");
-    model_.add_hashes("token");
-    model_.add_hashes("one");
-    model_.add_hashes("two");
 
     ClientSideModel::Rule* rule;
     rule = model_.add_rule();
@@ -51,11 +48,14 @@ class PhishingScorerTest : public ::testing::Test {
     model_.add_page_term(3);  // token one
     model_.add_page_term(4);  // token two
 
-    model_.add_page_word(5);  // token
-    model_.add_page_word(6);  // one
-    model_.add_page_word(7);  // two
+    // These will be murmur3 hashes, but for this test it's not necessary
+    // that the hashes correspond to actual words.
+    model_.add_page_word(1000U);
+    model_.add_page_word(2000U);
+    model_.add_page_word(3000U);
 
     model_.set_max_words_per_term(2);
+    model_.set_murmur_hash_seed(12345U);
   }
 
   ClientSideModel model_;
@@ -89,12 +89,14 @@ TEST_F(PhishingScorerTest, PageTerms) {
 TEST_F(PhishingScorerTest, PageWords) {
   scoped_ptr<Scorer> scorer(Scorer::Create(model_.SerializeAsString()));
   ASSERT_TRUE(scorer.get());
-  base::hash_set<std::string> expected_page_words;
-  expected_page_words.insert("token");
-  expected_page_words.insert("one");
-  expected_page_words.insert("two");
+  base::hash_set<uint32> expected_page_words;
+  expected_page_words.insert(1000U);
+  expected_page_words.insert(2000U);
+  expected_page_words.insert(3000U);
   EXPECT_THAT(scorer->page_words(),
               ::testing::ContainerEq(expected_page_words));
+  EXPECT_EQ(2U, scorer->max_words_per_term());
+  EXPECT_EQ(12345U, scorer->murmurhash3_seed());
 }
 
 TEST_F(PhishingScorerTest, ComputeScore) {
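A note on the producer side: whatever pipeline generates client_model_v4.pb must populate the new fields consistently, since every page_word entry has to be the 32-bit MurmurHash3 of a lowercase UTF-8 word computed with the same seed that is stored in murmur_hash_seed, or the renderer-side lookups will never match. The following hypothetical sketch is not part of this change; the generated header name chrome/common/safe_browsing/client_model.pb.h (standard protoc output for client_model.proto) and the AddPageWords helper are assumptions for illustration only.

// Hypothetical model-producer helper: stores the seed and the seeded word
// hashes together so the renderer's PhishingTermFeatureExtractor can find
// matching page_word entries.
#include <string>
#include <vector>

#include "chrome/common/safe_browsing/client_model.pb.h"    // ClientSideModel (assumed header name)
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"  // MurmurHash3String()

namespace safe_browsing {

void AddPageWords(const std::vector<std::string>& lowercase_words,
                  uint32 seed,
                  ClientSideModel* model) {
  model->set_murmur_hash_seed(seed);
  for (size_t i = 0; i < lowercase_words.size(); ++i) {
    // Each fixed32 page_word entry is the seeded MurmurHash3 of one lowercase
    // UTF-8 word, mirroring the hashing done in HandleWord().
    model->add_page_word(MurmurHash3String(lowercase_words[i], seed));
  }
}

}  // namespace safe_browsing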