diff options
author | bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-03 21:02:15 +0000 |
---|---|---|
committer | bryner@chromium.org <bryner@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-03 21:02:15 +0000 |
commit | e1d6a597b92bb42fa6414bb90613433c38694682 (patch) | |
tree | fae6fed0a095e7c648738572c91db6a81264d2bb /chrome | |
parent | ff8a4e38b04046fad0accb3044a7087db42732ed (diff) | |
download | chromium_src-e1d6a597b92bb42fa6414bb90613433c38694682.zip chromium_src-e1d6a597b92bb42fa6414bb90613433c38694682.tar.gz chromium_src-e1d6a597b92bb42fa6414bb90613433c38694682.tar.bz2 |
Add a term feature extractor for client-side phishing detection.
This class creates features for n-grams in the page text that appear in the
phishing classification model. It will eventually operate on the plain text
that is extracted by RenderView::CaptureText().
To make it harder for phishers to enumerate the terms in the classification
model, they will be supplied as SHA-256 hashes rather than plain text. The
term feature extractor hashes the words in the document in order to check
whether they match the model. Since this is potentially expensive, the term
feature extractor limits how long it will run on each iteration, similar to
the PhishingDOMFeatureExtractor.
TEST=PhishingTermFeatureExtractorTest
BUG=none
Review URL: http://codereview.chromium.org/3214002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@58537 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- | chrome/chrome_renderer.gypi | 2 | ||||
-rw-r--r-- | chrome/chrome_tests.gypi | 1 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/features.cc | 3 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/features.h | 10 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h | 5 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc | 295 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_term_feature_extractor.h | 152 | ||||
-rw-r--r-- | chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc | 252 |
8 files changed, 718 insertions, 2 deletions
diff --git a/chrome/chrome_renderer.gypi b/chrome/chrome_renderer.gypi index c62b7af..d32f853 100644 --- a/chrome/chrome_renderer.gypi +++ b/chrome/chrome_renderer.gypi @@ -190,6 +190,8 @@ 'renderer/safe_browsing/features.h', 'renderer/safe_browsing/phishing_dom_feature_extractor.cc', 'renderer/safe_browsing/phishing_dom_feature_extractor.h', + 'renderer/safe_browsing/phishing_term_feature_extractor.cc', + 'renderer/safe_browsing/phishing_term_feature_extractor.h', 'renderer/safe_browsing/phishing_url_feature_extractor.cc', 'renderer/safe_browsing/phishing_url_feature_extractor.h', 'renderer/speech_input_dispatcher.cc', diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi index 2414894..cd751e2 100644 --- a/chrome/chrome_tests.gypi +++ b/chrome/chrome_tests.gypi @@ -1359,6 +1359,7 @@ 'renderer/renderer_about_handler_unittest.cc', 'renderer/renderer_main_unittest.cc', 'renderer/safe_browsing/features_unittest.cc', + 'renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc', 'renderer/safe_browsing/phishing_url_feature_extractor_unittest.cc', 'renderer/spellchecker/spellcheck_unittest.cc', 'renderer/spellchecker/spellcheck_worditerator_unittest.cc', diff --git a/chrome/renderer/safe_browsing/features.cc b/chrome/renderer/safe_browsing/features.cc index 4d67cf3..40fd97b 100644 --- a/chrome/renderer/safe_browsing/features.cc +++ b/chrome/renderer/safe_browsing/features.cc @@ -80,5 +80,8 @@ const char kPageNumScriptTagsGTSix[] = "PageNumScriptTags>6"; // Other DOM HTML features const char kPageImgOtherDomainFreq[] = "PageImgOtherDomainFreq"; +// Page term features +const char kPageTerm[] = "PageTerm="; + } // namespace features } // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/features.h b/chrome/renderer/safe_browsing/features.h index f3c8348..82370c1 100644 --- a/chrome/renderer/safe_browsing/features.h +++ b/chrome/renderer/safe_browsing/features.h @@ -158,6 +158,16 @@ extern const char kPageNumScriptTagsGTSix[]; // The 
fraction of images whose src attribute points to an external domain. extern const char kPageImgOtherDomainFreq[]; +//////////////////////////////////////////////////// +// Page term features +//////////////////////////////////////////////////// + +// Token feature for a term (whitespace-delimited) on a page. Terms can be +// single words or multi-word n-grams. Rather than adding this feature for +// every possible token on a page, only the terms that are mentioned in the +// classification model are added. +extern const char kPageTerm[]; + } // namespace features } // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h index ef0e42e..2b72d46 100644 --- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h +++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h @@ -49,7 +49,8 @@ class PhishingDOMFeatureExtractor { // render thread for too long, the feature extractor may run in several // chunks of work, posting a task to the current MessageLoop to continue // processing. Once feature extraction is complete, |done_callback| - // is run. PhishingDOMFeatureExtractor takes ownership of the callback. + // is run on the current thread. PhishingDOMFeatureExtractor takes + // ownership of the callback. void ExtractFeatures(FeatureMap* features, DoneCallback* done_callback); // Cancels any pending feature extraction. The DoneCallback will not be run. @@ -61,7 +62,7 @@ class PhishingDOMFeatureExtractor { struct FrameData; struct PageFeatureState; - // The maximum amount of time that we will spend on a single extraction + // The maximum amount of wall time that we will spend on a single extraction // iteration before pausing to let other MessageLoop tasks run. 
static const int kMaxTimePerChunkMs; diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc new file mode 100644 index 0000000..0ec0dbc --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc @@ -0,0 +1,295 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" + +#include <list> +#include <map> + +#include "app/l10n_util.h" +#include "base/compiler_specific.h" +#include "base/histogram.h" +#include "base/logging.h" +#include "base/message_loop.h" +#include "base/sha2.h" +#include "base/time.h" +#include "base/utf_string_conversions.h" +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" +#include "chrome/renderer/safe_browsing/features.h" +#include "unicode/ubrk.h" + +namespace safe_browsing { + +// This time should be short enough that it doesn't noticeably disrupt the +// user's interaction with the page. +const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 50; + +// Experimenting shows that we get a reasonable gain in performance by +// increasing this up to around 10, but there's not much benefit in +// increasing it past that. +const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10; + +// This should be longer than we expect feature extraction to take on any +// actual phishing page. +const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500; + +// All of the state pertaining to the current feature extraction. +struct PhishingTermFeatureExtractor::ExtractionState { + // Stores up to max_words_per_term_ previous words separated by spaces. + std::string previous_words; + + // Stores the sizes of the words in previous_words. Note: the size includes + // the space after each word. 
In other words, the sum of all sizes in this + // list is equal to the length of previous_words. + std::list<size_t> previous_word_sizes; + + // An iterator for word breaking. + UBreakIterator* iterator; + + // Our current position in the text that was passed to the ExtractionState + // constructor, specifically, the most recent break position returned by our + // iterator. + int position; + + // True if position has been initialized. + bool position_initialized; + + // The time at which we started feature extraction for the current page. + base::TimeTicks start_time; + + // The number of iterations we've done for the current extraction. + int num_iterations; + + ExtractionState(const string16& text, base::TimeTicks start_time_ticks) + : position(-1), + position_initialized(false), + start_time(start_time_ticks), + num_iterations(0) { + UErrorCode status = U_ZERO_ERROR; + // TODO(bryner): We should pass in the language for the document. + iterator = ubrk_open(UBRK_WORD, NULL, + text.data(), text.size(), + &status); + if (U_FAILURE(status)) { + DLOG(ERROR) << "ubrk_open failed: " << status; + iterator = NULL; + } + } + + ~ExtractionState() { + if (iterator) { + ubrk_close(iterator); + } + } +}; + +PhishingTermFeatureExtractor::PhishingTermFeatureExtractor( + const base::hash_set<std::string>* page_term_hashes, + const base::hash_set<std::string>* page_word_hashes, + size_t max_words_per_term, + FeatureExtractorClock* clock) + : page_term_hashes_(page_term_hashes), + page_word_hashes_(page_word_hashes), + max_words_per_term_(max_words_per_term), + clock_(clock), + ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { + Clear(); +} + +PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() { + // The RenderView should have called CancelPendingExtraction() before + // we are destroyed. 
+ CheckNoPendingExtraction(); +} + +void PhishingTermFeatureExtractor::ExtractFeatures( + const string16* page_text, + FeatureMap* features, + DoneCallback* done_callback) { + // The RenderView should have called CancelPendingExtraction() before + // starting a new extraction, so DCHECK this. + CheckNoPendingExtraction(); + // However, in an opt build, we will go ahead and clean up the pending + // extraction so that we can start in a known state. + CancelPendingExtraction(); + + page_text_ = page_text; + features_ = features; + done_callback_.reset(done_callback); + + state_.reset(new ExtractionState(*page_text_, clock_->Now())); + MessageLoop::current()->PostTask( + FROM_HERE, + method_factory_.NewRunnableMethod( + &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); +} + +void PhishingTermFeatureExtractor::CancelPendingExtraction() { + // Cancel any pending callbacks, and clear our state. + method_factory_.RevokeAll(); + Clear(); +} + +void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() { + DCHECK(state_.get()); + ++state_->num_iterations; + base::TimeTicks current_chunk_start_time = clock_->Now(); + + if (!state_->iterator) { + // We failed to initialize the break iterator, so stop now. + UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1); + RunCallback(false); + return; + } + + if (!state_->position_initialized) { + state_->position = ubrk_first(state_->iterator); + if (state_->position == UBRK_DONE) { + // No words present, so we're done. + RunCallback(true); + return; + } + state_->position_initialized = true; + } + + int num_words = 0; + for (int next = ubrk_next(state_->iterator); + next != UBRK_DONE; next = ubrk_next(state_->iterator)) { + if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) { + // next is now positioned at the end of a word. 
+ HandleWord(string16(*page_text_, state_->position, + next - state_->position)); + ++num_words; + } + state_->position = next; + + if (num_words >= kClockCheckGranularity) { + num_words = 0; + base::TimeTicks now = clock_->Now(); + if (now - state_->start_time >= + base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { + DLOG(ERROR) << "Feature extraction took too long, giving up"; + // We expect this to happen infrequently, so record when it does. + UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1); + RunCallback(false); + return; + } + base::TimeDelta chunk_elapsed = now - current_chunk_start_time; + if (chunk_elapsed >= + base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { + // The time limit for the current chunk is up, so post a task to + // continue extraction. + // + // Record how much time we actually spent on the chunk. If this is + // much higher than kMaxTimePerChunkMs, we may need to adjust the + // clock granularity. + UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime", + chunk_elapsed); + MessageLoop::current()->PostTask( + FROM_HERE, + method_factory_.NewRunnableMethod( + &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout)); + return; + } + // Otherwise, continue. + } + } + RunCallback(true); +} + +void PhishingTermFeatureExtractor::HandleWord(const string16& word) { + std::string word_lower = UTF16ToUTF8(l10n_util::ToLower(word)); + std::string word_hash = base::SHA256HashString(word_lower); + + // Quick out if the word is not part of any term, which is the common case. + if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) { + // Word doesn't exist in our terms so we can clear the n-gram state. + state_->previous_words.clear(); + state_->previous_word_sizes.clear(); + return; + } + + // Find all of the n-grams that we need to check and compute their hashes. + // We already have the hash for word_lower, so we don't compute that again. 
+ std::map<std::string /* hash */, std::string /* plaintext */> + hashes_to_check; + hashes_to_check[word_hash] = word_lower; + + // Combine the new word with the previous words to find additional n-grams. + // Note that we don't yet add the new word length to previous_word_sizes, + // since we don't want to compute the hash for the word by itself again. + // + // TODO(bryner): Use UMA stats to determine whether this is too slow. + // If it is, there are a couple of cases that we could optimize: + // - We could cache plaintext words that are not in page_word_hashes_, so + // that we can avoid hashing these again. + // - We could include positional information about words in the n-grams, + // rather than just a list of all of the words. For example, we could + // change the term format so that each word is hashed separately, or + // we could add extra data to the word list to indicate the position + // at which the word appears in an n-gram, and skip checking the word if + // it's not at that position. + state_->previous_words.append(word_lower); + std::string current_term = state_->previous_words; + for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin(); + it != state_->previous_word_sizes.end(); ++it) { + hashes_to_check[base::SHA256HashString(current_term)] = current_term; + current_term.erase(0, *it); + } + + // Add features for any hashes that match page_term_hashes_. + for (std::map<std::string, std::string>::iterator it = + hashes_to_check.begin(); + it != hashes_to_check.end(); ++it) { + if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) { + features_->AddBooleanFeature(features::kPageTerm + it->second); + } + } + + // Now that we have handled the current word, we have to add a space at the + // end of it, and add the new word's size (including the space) to + // previous_word_sizes. Note: it's possible that the document language + // doesn't use ASCII spaces to separate words. 
That's fine though, we just + // need to be consistent with how the model is generated. + state_->previous_words.append(" "); + state_->previous_word_sizes.push_back(word_lower.size() + 1); + + // Cap the number of previous words. + if (state_->previous_word_sizes.size() >= max_words_per_term_) { + state_->previous_words.erase(0, state_->previous_word_sizes.front()); + state_->previous_word_sizes.pop_front(); + } +} + +void PhishingTermFeatureExtractor::CheckNoPendingExtraction() { + DCHECK(!done_callback_.get()); + DCHECK(!state_.get()); + if (done_callback_.get() || state_.get()) { + LOG(ERROR) << "Extraction in progress, missing call to " + << "CancelPendingExtraction"; + } +} + +void PhishingTermFeatureExtractor::RunCallback(bool success) { + // Record some timing stats that we can use to evaluate feature extraction + // performance. These include both successful and failed extractions. + DCHECK(state_.get()); + UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations", + state_->num_iterations); + UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime", + clock_->Now() - state_->start_time); + + DCHECK(done_callback_.get()); + done_callback_->Run(success); + Clear(); +} + +void PhishingTermFeatureExtractor::Clear() { + page_text_ = NULL; + features_ = NULL; + done_callback_.reset(NULL); + state_.reset(NULL); +} + +} // namespace safe_browsing diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h new file mode 100644 index 0000000..d34ad66 --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h @@ -0,0 +1,152 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// PhishingTermFeatureExtractor handles computing term features from the text +// of a web page for the client-side phishing detection model. 
To do this, it +// takes a list of terms that appear in the model, and scans through the page +// text looking for them. Any terms that appear will cause a corresponding +// features::kPageTerm feature to be added to the FeatureMap. +// +// To make it harder for a phisher to enumerate all of the relevant terms in +// the model, the terms are provided as SHA-256 hashes, rather than plain text. +// +// TODO(bryner): When we compute the score, all of the features in the +// FeatureMap will be hashed so that they can be compared against the model. +// When this is implemented, add a comment about it here. +// +// There is one PhishingTermFeatureExtractor per RenderView. + +#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ +#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ + +#include <string> + +#include "base/basictypes.h" +#include "base/callback.h" +#include "base/hash_tables.h" +#include "base/scoped_ptr.h" +#include "base/string16.h" +#include "base/task.h" + +namespace safe_browsing { +class FeatureExtractorClock; +class FeatureMap; + +class PhishingTermFeatureExtractor { + public: + // Callback to be run when feature extraction finishes. The callback + // argument is true if extraction was successful, false otherwise. + typedef Callback1<bool>::Type DoneCallback; + + // Creates a PhishingTermFeatureExtractor which will extract features for + // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These + // terms may be multi-word n-grams, with at most |max_words_per_term| words. + // + // |page_word_hashes| contains the hashes for all of the individual words + // that make up the terms. Both sets of strings are UTF-8 encoded and + // lowercased prior to hashing. The caller owns both sets of strings, and + // must ensure that they are valid until the PhishingTermFeatureExtractor is + // destroyed. + // + // |clock| is used for timing feature extractor operations, and may be mocked + // for testing. 
PhishingTermFeatureExtractor takes ownership of the clock. + PhishingTermFeatureExtractor( + const base::hash_set<std::string>* page_term_hashes, + const base::hash_set<std::string>* page_word_hashes, + size_t max_words_per_term, + FeatureExtractorClock* clock); + ~PhishingTermFeatureExtractor(); + + // Begins extracting features from |page_text| into the given FeatureMap. + // |page_text| should contain the plain text of a web page, including any + // subframes, as returned by RenderView::CaptureText(). + // + // To avoid blocking the render thread for too long, the feature extractor + // may run in several chunks of work, posting a task to the current + // MessageLoop to continue processing. Once feature extraction is complete, + // |done_callback| is run on the current thread. + // PhishingTermFeatureExtractor takes ownership of the callback. + // + // |page_text| and |features| are owned by the caller, and must not be + // destroyed until either |done_callback| is run or + // CancelPendingExtraction() is called. + void ExtractFeatures(const string16* page_text, + FeatureMap* features, + DoneCallback* done_callback); + + // Cancels any pending feature extraction. The DoneCallback will not be run. + // Must be called if there is a feature extraction in progress when the page + // is unloaded or the PhishingTermFeatureExtractor is destroyed. + void CancelPendingExtraction(); + + private: + struct ExtractionState; + + // The maximum amount of wall time that we will spend on a single extraction + // iteration before pausing to let other MessageLoop tasks run. + static const int kMaxTimePerChunkMs; + + // The number of words that we will process before checking to see whether + // kMaxTimePerChunkMs has elapsed. Since checking the current time can be + // slow, we don't do this on every word processed. + static const int kClockCheckGranularity; + + // The maximum total amount of time that the feature extractor will run + // before giving up on the current page. 
+ static const int kMaxTotalTimeMs; + + // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs + // until a predefined maximum amount of time has elapsed, then posts a task + // to the current MessageLoop to continue extraction. When extraction + // finishes, calls RunCallback(). + void ExtractFeaturesWithTimeout(); + + // Handles a single word in the page text. + void HandleWord(const string16& word); + + // Helper to verify that there is no pending feature extraction. Dies in + // debug builds if the state is not as expected. This is a no-op in release + // builds. + void CheckNoPendingExtraction(); + + // Runs |done_callback_| and then clears all internal state. + void RunCallback(bool success); + + // Clears all internal feature extraction state. + void Clear(); + + // All of the term hashes that we are looking for in the page. + const base::hash_set<std::string>* page_term_hashes_; + + // Hashes of all the individual words in page_term_hashes_. If + // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_ + // would contain (hashed) "one" and "two". We do this so that we can have a + // quick out in the common case that the current word we are processing + // doesn't contain any part of one of our terms. + const base::hash_set<std::string>* page_word_hashes_; + + // The maximum number of words in an n-gram. + size_t max_words_per_term_; + + // Owned pointer to our clock. + scoped_ptr<FeatureExtractorClock> clock_; + + // The output parameters from the most recent call to ExtractFeatures(). + const string16* page_text_; // The caller keeps ownership of this. + FeatureMap* features_; // The caller keeps ownership of this. + scoped_ptr<DoneCallback> done_callback_; + + // Stores the current state of term extraction from |page_text_|. + scoped_ptr<ExtractionState> state_; + + // Used to create ExtractFeaturesWithTimeout tasks. + // These tasks are revoked if extraction is cancelled. 
+ ScopedRunnableMethodFactory<PhishingTermFeatureExtractor> method_factory_; + + DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor); +}; + +} // namespace safe_browsing + +#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_ diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc new file mode 100644 index 0000000..812fb93 --- /dev/null +++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor_unittest.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h" + +#include <string> + +#include "base/callback.h" +#include "base/hash_tables.h" +#include "base/message_loop.h" +#include "base/scoped_ptr.h" +#include "base/sha2.h" +#include "base/string16.h" +#include "base/stringprintf.h" +#include "base/time.h" +#include "base/utf_string_conversions.h" +#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" +#include "chrome/renderer/safe_browsing/features.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" + +using ::testing::ContainerEq; +using ::testing::Return; + +namespace safe_browsing { + +class PhishingTermFeatureExtractorTest : public ::testing::Test { + protected: + class MockClock : public FeatureExtractorClock { + public: + MOCK_METHOD0(Now, base::TimeTicks()); + }; + + virtual void SetUp() { + base::hash_set<std::string> terms; + terms.insert("one"); + terms.insert("one one"); + terms.insert("two"); + terms.insert("multi word test"); + terms.insert("capitalization"); + terms.insert("space"); + terms.insert("separator"); + terms.insert("punctuation"); + // Chinese (translation of "hello") + terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); + // Chinese 
(translation of "goodbye") + terms.insert("\xe5\x86\x8d\xe8\xa7\x81"); + + for (base::hash_set<std::string>::iterator it = terms.begin(); + it != terms.end(); ++it) { + term_hashes_.insert(base::SHA256HashString(*it)); + } + + base::hash_set<std::string> words; + words.insert("one"); + words.insert("two"); + words.insert("multi"); + words.insert("word"); + words.insert("test"); + words.insert("capitalization"); + words.insert("space"); + words.insert("separator"); + words.insert("punctuation"); + words.insert("\xe4\xbd\xa0\xe5\xa5\xbd"); + words.insert("\xe5\x86\x8d\xe8\xa7\x81"); + + for (base::hash_set<std::string>::iterator it = words.begin(); + it != words.end(); ++it) { + word_hashes_.insert(base::SHA256HashString(*it)); + } + + clock_ = new MockClock(); + extractor_.reset(new PhishingTermFeatureExtractor( + &term_hashes_, + &word_hashes_, + 3 /* max_words_per_term */, + clock_)); + } + + // Runs the TermFeatureExtractor on |page_text|, waiting for the + // completion callback. Returns the success boolean from the callback. + bool ExtractFeatures(const string16* page_text, FeatureMap* features) { + success_ = false; + extractor_->ExtractFeatures( + page_text, + features, + NewCallback(this, &PhishingTermFeatureExtractorTest::ExtractionDone)); + msg_loop_.Run(); + return success_; + } + + // Completion callback for feature extraction. + void ExtractionDone(bool success) { + success_ = success; + msg_loop_.Quit(); + } + + MessageLoop msg_loop_; + scoped_ptr<PhishingTermFeatureExtractor> extractor_; + base::hash_set<std::string> term_hashes_; + base::hash_set<std::string> word_hashes_; + MockClock* clock_; // owned by extractor_ + bool success_; // holds the success value from ExtractFeatures +}; + +TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) { + // This test doesn't exercise the extraction timing. 
+ EXPECT_CALL(*clock_, Now()) + .WillRepeatedly(Return(base::TimeTicks::Now())); + + string16 page_text = ASCIIToUTF16("blah"); + FeatureMap expected_features; // initially empty + + FeatureMap features; + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + page_text = ASCIIToUTF16("one one"); + expected_features.Clear(); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("one")); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("one one")); + + features.Clear(); + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + page_text = ASCIIToUTF16("bla bla multi word test bla"); + expected_features.Clear(); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("multi word test")); + + features.Clear(); + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + // This text has all of the words for one of the terms, but they are + // not in the correct order. + page_text = ASCIIToUTF16("bla bla test word multi bla"); + expected_features.Clear(); + + features.Clear(); + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + page_text = ASCIIToUTF16("Capitalization plus non-space\n" + "separator... 
punctuation!"); + expected_features.Clear(); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("capitalization")); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("space")); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("separator")); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("punctuation")); + + features.Clear(); + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + // Test with empty page text. + page_text = string16(); + expected_features.Clear(); + features.Clear(); + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + + // Chinese translation of the phrase "hello goodbye". This tests that + // we can correctly separate terms in languages that don't use spaces. + page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"); + expected_features.Clear(); + expected_features.AddBooleanFeature( + features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd")); + expected_features.AddBooleanFeature( + features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81")); + + features.Clear(); + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); +} + +TEST_F(PhishingTermFeatureExtractorTest, Continuation) { + // For this test, we'll cause the feature extraction to run multiple + // iterations by incrementing the clock. + + // This page has a total of 30 words. For the features to be computed + // correctly, the extractor has to process the entire string of text. 
+ string16 page_text(ASCIIToUTF16("one ")); + for (int i = 0; i < 28; ++i) { + page_text.append(ASCIIToUTF16(StringPrintf("%d ", i))); + } + page_text.append(ASCIIToUTF16("two")); + + // Advance the clock 30 ms every 10 words processed, 10 ms between chunks. + // Note that this assumes kClockCheckGranularity = 10 and + // kMaxTimePerChunkMs = 50. + base::TimeTicks now = base::TimeTicks::Now(); + EXPECT_CALL(*clock_, Now()) + // Time check at the start of extraction. + .WillOnce(Return(now)) + // Time check at the start of the first chunk of work. + .WillOnce(Return(now)) + // Time check after the first 10 words. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30))) + // Time check after the next 10 words. This is over the chunk + // time limit, so a continuation task will be posted. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(60))) + // Time check at the start of the second chunk of work. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(70))) + // Time check after the next 10 words. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(100))) + // A final check for the histograms. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(101))); + + FeatureMap expected_features; + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("one")); + expected_features.AddBooleanFeature(features::kPageTerm + + std::string("two")); + + FeatureMap features; + ASSERT_TRUE(ExtractFeatures(&page_text, &features)); + EXPECT_THAT(features.features(), ContainerEq(expected_features.features())); + // Make sure none of the mock expectations carry over to the next test. + ::testing::Mock::VerifyAndClearExpectations(clock_); + + // Now repeat the test with the same text, but advance the clock faster so + // that the extraction time exceeds the maximum total time for the feature + // extractor. Extraction should fail. Note that this assumes + // kMaxTotalTimeMs = 500. 
+ EXPECT_CALL(*clock_, Now()) + // Time check at the start of extraction. + .WillOnce(Return(now)) + // Time check at the start of the first chunk of work. + .WillOnce(Return(now)) + // Time check after the first 10 words, + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300))) + // Time check at the start of the second chunk of work. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350))) + // Time check after the next 10 words. This is over the limit. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600))) + // A final time check for the histograms. + .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620))); + + features.Clear(); + EXPECT_FALSE(ExtractFeatures(&page_text, &features)); +} + +} // namespace safe_browsing |