summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
diff options
context:
space:
mode:
authorgcasto@chromium.org <gcasto@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-08-09 23:30:44 +0000
committergcasto@chromium.org <gcasto@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-08-09 23:30:44 +0000
commit5cf71c5f13a3dbabae77e519d5762ac5295a0de9 (patch)
treebbe4c1ec6f2819ca8f3ee1329ae56be4ab152314 /chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
parent460e571ba4d40edd7fe22d378d82f73bf555fc57 (diff)
downloadchromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.zip
chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.tar.gz
chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.tar.bz2
Optimize phishing page term feature extraction.
We've been seeing in the histograms that for some users, page feature extraction is taking much longer than we would like, up to 200ms between stops. This could possibly contribute to jankiness in the UI. These are some timings from my desktop for how long it takes to extract features before and after these changes. Obviously this is on much better hardware than the use case we are worried about, but hopefully the increase in speed is proportional. Before After CNN 19 7.5 ESPN 22 9.5 Google 12 4 Salon 40 12 BUG= TEST=Ran associated unit tests. Review URL: http://codereview.chromium.org/7549003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@96097 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc')
-rw-r--r--chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc33
1 files changed, 20 insertions, 13 deletions
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 73db09b..3b4ba85 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -35,6 +35,9 @@ const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
+// The maximum size of the negative word cache.
+const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
+
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
// Stores up to max_words_per_ngram_ previous words separated by spaces.
@@ -93,6 +96,7 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
: page_term_hashes_(page_term_hashes),
page_word_hashes_(page_word_hashes),
max_words_per_term_(max_words_per_term),
+ negative_word_cache_(kMaxNegativeWordCacheSize),
clock_(clock),
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
Clear();
@@ -159,8 +163,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
// next is now positioned at the end of a word.
- HandleWord(string16(*page_text_, state_->position,
- next - state_->position));
+ HandleWord(base::StringPiece16(page_text_->data() + state_->position,
+ next - state_->position));
++num_words;
}
state_->position = next;
@@ -196,10 +200,21 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
// Otherwise, continue.
}
}
+ // We need to clear the cache because the data that it depends on (page_text_)
+ // is going away.
+ negative_word_cache_.Clear();
RunCallback(true);
}
-void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
+void PhishingTermFeatureExtractor::HandleWord(
+ const base::StringPiece16& word) {
+ // Quickest out if we have seen this word before and know that it's not
+ // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
+ // all of which are relatively expensive.
+ if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
+ return;
+ }
+
std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
std::string word_hash = crypto::SHA256HashString(word_lower);
@@ -208,6 +223,8 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
// Word doesn't exist in our terms so we can clear the n-gram state.
state_->previous_words.clear();
state_->previous_word_sizes.clear();
+ // Insert into negative cache so that we don't try this again.
+ negative_word_cache_.Put(word, true);
return;
}
@@ -221,16 +238,6 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
// Note that we don't yet add the new word length to previous_word_sizes,
// since we don't want to compute the hash for the word by itself again.
//
- // TODO(bryner): Use UMA stats to determine whether this is too slow.
- // If it is, there are a couple of cases that we could optimize:
- // - We could cache plaintext words that are not in page_word_hashes_, so
- // that we can avoid hashing these again.
- // - We could include positional information about words in the n-grams,
- // rather than just a list of all of the words. For example, we could
- // change the term format so that each word is hashed separately, or
- // we could add extra data to the word list to indicate the position
- // at which the word appears in an n-gram, and skip checking the word if
- // it's not at that position.
state_->previous_words.append(word_lower);
std::string current_term = state_->previous_words;
for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();