Optimize phishing page term feature extraction.

We've been seeing in the histograms that for some users, page feature extraction is taking much longer than we would like, up to 200ms between stops. This could possibly contribute to jankiness in the UI. These are some timings from my desktop for how long it takes to extract features before and after these changes. Obviously this is on much better hardware than the use case we are worried about, but hopefully the increase is speed is proportional. Before After CNN 19 7.5 ESPN 22 9.5 Google 12 4 Salon 40 12 BUG= TEST=Ran associated unit tests. Review URL: http://codereview.chromium.org/7549003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@96097 0039d316-1c4b-4281-b951-d872f2087c98
author: gcasto@chromium.org <gcasto@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-08-09 23:30:44 +0000
committer: gcasto@chromium.org <gcasto@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-08-09 23:30:44 +0000
commit: 5cf71c5f13a3dbabae77e519d5762ac5295a0de9 (patch)
tree: bbe4c1ec6f2819ca8f3ee1329ae56be4ab152314
parent: 460e571ba4d40edd7fe22d378d82f73bf555fc57 (diff)
download: chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.zip
chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.tar.gz
chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.tar.bz2
8 files changed, 492 insertions, 23 deletions
diff --git a/base/i18n/case_conversion.cc b/base/i18n/case_conversion.cc
index 4aa9028..d3b06c9 100644
--- a/base/i18n/case_conversion.cc
+++ b/base/i18n/case_conversion.cc
@@ -4,19 +4,20 @@
 
 #include "base/i18n/case_conversion.h"
 
+#include "base/string16.h"
 #include "unicode/unistr.h"
 
 namespace base {
 namespace i18n {
 
-string16 ToLower(const string16& string) {
-  icu::UnicodeString unicode_string(string.c_str(), string.size());
+string16 ToLower(const StringPiece16& string) {
+  icu::UnicodeString unicode_string(string.data(), string.size());
   unicode_string.toLower();
   return string16(unicode_string.getBuffer(), unicode_string.length());
 }
 
-string16 ToUpper(const string16& string) {
-  icu::UnicodeString unicode_string(string.c_str(), string.size());
+string16 ToUpper(const StringPiece16& string) {
+  icu::UnicodeString unicode_string(string.data(), string.size());
   unicode_string.toUpper();
   return string16(unicode_string.getBuffer(), unicode_string.length());
 }
diff --git a/base/i18n/case_conversion.h b/base/i18n/case_conversion.h
index ecbfba1..73dd862 100644
--- a/base/i18n/case_conversion.h
+++ b/base/i18n/case_conversion.h
@@ -6,16 +6,16 @@
 #define BASE_I18N_CASE_CONVERSION_H_
 #pragma once
 
-#include "base/string16.h"
+#include "base/string_piece.h"
 
 namespace base {
 namespace i18n {
 
 // Returns the lower case equivalent of string. Uses ICU's default locale.
-string16 ToLower(const string16& string);
+string16 ToLower(const StringPiece16& string);
 
 // Returns the upper case equivalent of string. Uses ICU's default locale.
-string16 ToUpper(const string16& string);
+string16 ToUpper(const StringPiece16& string);
 
 }  // namespace i18n
 }  // namespace base
diff --git a/base/memory/mru_cache.h b/base/memory/mru_cache.h
index 32a6fbb..434e592 100644
--- a/base/memory/mru_cache.h
+++ b/base/memory/mru_cache.h
@@ -22,16 +22,26 @@
 #include <utility>
 
 #include "base/basictypes.h"
+#include "base/hash_tables.h"
 #include "base/logging.h"
 
 namespace base {
 
 // MRUCacheBase ----------------------------------------------------------------
 
+// This template is used to standardize map type containers that can be used
+// by MRUCacheBase. This level of indirection is necessary because of the way
+// that template template params and default template params interact.
+template <class KeyType, class ValueType>
+struct MRUCacheStandardMap {
+  typedef std::map<KeyType, ValueType> Type;
+};
+
 // Base class for the MRU cache specializations defined below.
 // The deletor will get called on all payloads that are being removed or
 // replaced.
-template <class KeyType, class PayloadType, class DeletorType>
+template <class KeyType, class PayloadType, class DeletorType,
+          template <typename, typename> class MapType = MRUCacheStandardMap>
 class MRUCacheBase {
  public:
   // The payload of the list. This maintains a copy of the key so we can
@@ -40,7 +50,8 @@ class MRUCacheBase {
 
  private:
   typedef std::list<value_type> PayloadList;
-  typedef std::map<KeyType, typename PayloadList::iterator> KeyIndex;
+  typedef typename MapType<KeyType,
+                           typename PayloadList::iterator>::Type KeyIndex;
 
  public:
   typedef typename PayloadList::size_type size_type;
@@ -258,6 +269,38 @@ class OwningMRUCache
   DISALLOW_COPY_AND_ASSIGN(OwningMRUCache);
 };
 
+// HashingMRUCache ------------------------------------------------------------
+
+template <class KeyType, class ValueType>
+struct MRUCacheHashMap {
+  typedef base::hash_map<KeyType, ValueType> Type;
+};
+
+// This class is similar to MRUCache, except that it uses base::hash_map as
+// the map type instead of std::map. Note that your KeyType must be hashable
+// to use this cache.
+template <class KeyType, class PayloadType>
+class HashingMRUCache : public MRUCacheBase<KeyType,
+                                            PayloadType,
+                                            MRUCacheNullDeletor<PayloadType>,
+                                            MRUCacheHashMap> {
+ private:
+  typedef MRUCacheBase<KeyType, PayloadType,
+                       MRUCacheNullDeletor<PayloadType>,
+                       MRUCacheHashMap> ParentType;
+
+ public:
+  // See MRUCacheBase, noting the possibility of using NO_AUTO_EVICT.
+  explicit HashingMRUCache(typename ParentType::size_type max_size)
+      : ParentType(max_size) {
+  }
+  virtual ~HashingMRUCache() {
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HashingMRUCache);
+};
+
 }  // namespace base
 
 #endif  // BASE_MEMORY_MRU_CACHE_H_
diff --git a/base/memory/mru_cache_unittest.cc b/base/memory/mru_cache_unittest.cc
index 89ca2fa..39677f2 100644
--- a/base/memory/mru_cache_unittest.cc
+++ b/base/memory/mru_cache_unittest.cc
@@ -251,3 +251,21 @@ TEST(MRUCacheTest, AutoEvict) {
   // There should be no objects leaked.
   EXPECT_EQ(initial_count, cached_item_live_count);
 }
+
+TEST(MRUCacheTest, HashingMRUCache) {
+  // Very simple test to make sure that the hashing cache works correctly.
+  typedef base::HashingMRUCache<std::string, CachedItem> Cache;
+  Cache cache(Cache::NO_AUTO_EVICT);
+
+  CachedItem one(1);
+  cache.Put("First", one);
+
+  CachedItem two(2);
+  cache.Put("Second", two);
+
+  EXPECT_EQ(one.value, cache.Get("First")->second.value);
+  EXPECT_EQ(two.value, cache.Get("Second")->second.value);
+  cache.ShrinkToSize(1);
+  EXPECT_EQ(two.value, cache.Get("Second")->second.value);
+  EXPECT_EQ(cache.end(), cache.Get("First"));
+}
diff --git a/base/string_piece.h b/base/string_piece.h
index 1077ac1..278c7b6 100644
--- a/base/string_piece.h
+++ b/base/string_piece.h
@@ -14,6 +14,11 @@
 // Systematic usage of StringPiece is encouraged as it will reduce unnecessary
 // conversions from "const char*" to "string" and back again.
 //
+// StringPiece16 is similar to StringPiece but for base::string16 instead of
+// std::string. We do not define as large of a subset of the STL functions
+// from basic_string as in StringPiece, but this can be changed if these
+// functions (find, find_first_of, etc.) are found to be useful in this context.
+//
 
 #ifndef BASE_STRING_PIECE_H_
 #define BASE_STRING_PIECE_H_
@@ -23,6 +28,8 @@
 
 #include "base/base_export.h"
 #include "base/basictypes.h"
+#include "base/hash_tables.h"
+#include "base/string16.h"
 
 namespace base {
 
@@ -164,6 +171,84 @@ class BASE_EXPORT StringPiece {
   size_type     length_;
 };
 
+class BASE_EXPORT StringPiece16 {
+ public:
+  // standard STL container boilerplate
+  typedef size_t size_type;
+  typedef char16 value_type;
+  typedef const char16* pointer;
+  typedef const char16& reference;
+  typedef const char16& const_reference;
+  typedef ptrdiff_t difference_type;
+  typedef const char16* const_iterator;
+  typedef const char16* iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+
+ public:
+  // We provide non-explicit singleton constructors so users can pass
+  // in a "const char16*" or a "string16" wherever a "StringPiece16" is
+  // expected.
+  StringPiece16() : ptr_(NULL), length_(0) { }
+  StringPiece16(const char16* str)
+      : ptr_(str),
+        length_((str == NULL) ? 0 : string16::traits_type::length(str)) { }
+  StringPiece16(const string16& str)
+      : ptr_(str.data()), length_(str.size()) { }
+  StringPiece16(const char16* offset, size_type len)
+      : ptr_(offset), length_(len) { }
+
+  // data() may return a pointer to a buffer with embedded NULs, and the
+  // returned buffer may or may not be null terminated.  Therefore it is
+  // typically a mistake to pass data() to a routine that expects a NUL
+  // terminated string.
+  const char16* data() const { return ptr_; }
+  size_type size() const { return length_; }
+  size_type length() const { return length_; }
+  bool empty() const { return length_ == 0; }
+
+  void clear() {
+    ptr_ = NULL;
+    length_ = 0;
+  }
+  void set(const char16* data, size_type len) {
+    ptr_ = data;
+    length_ = len;
+  }
+  void set(const char16* str) {
+    ptr_ = str;
+    length_ = str ? string16::traits_type::length(str) : 0;
+  }
+
+  char16 operator[](size_type i) const { return ptr_[i]; }
+
+  string16 as_string16() const {
+    // StringPiece claims that this is bad when data() is NULL, but unittesting
+    // seems to say otherwise.
+    return string16(data(), size());
+  }
+
+  iterator begin() const { return ptr_; }
+  iterator end() const { return ptr_ + length_; }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(ptr_ + length_);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(ptr_);
+  }
+
+  size_type max_size() const { return length_; }
+  size_type capacity() const { return length_; }
+
+  static int wordmemcmp(const char16* p, const char16* p2, size_type N) {
+    return string16::traits_type::compare(p, p2, N);
+  }
+
+ private:
+  const char16*   ptr_;
+  size_type       length_;
+};
+
 BASE_EXPORT bool operator==(const StringPiece& x, const StringPiece& y);
 
 inline bool operator!=(const StringPiece& x, const StringPiece& y) {
@@ -188,6 +273,77 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
   return !(x < y);
 }
 
+inline bool operator==(const StringPiece16& x, const StringPiece16& y) {
+  if (x.size() != y.size())
+    return false;
+
+  return StringPiece16::wordmemcmp(x.data(), y.data(), x.size()) == 0;
+}
+
+inline bool operator!=(const StringPiece16& x, const StringPiece16& y) {
+  return !(x == y);
+}
+
+inline bool operator<(const StringPiece16& x, const StringPiece16& y) {
+  const int r = StringPiece16::wordmemcmp(
+      x.data(), y.data(), (x.size() < y.size() ? x.size() : y.size()));
+  return ((r < 0) || ((r == 0) && (x.size() < y.size())));
+}
+
+inline bool operator>(const StringPiece16& x, const StringPiece16& y) {
+  return y < x;
+}
+
+inline bool operator<=(const StringPiece16& x, const StringPiece16& y) {
+  return !(x > y);
+}
+
+inline bool operator>=(const StringPiece16& x, const StringPiece16& y) {
+  return !(x < y);
+}
+
 }  // namespace base
 
+// We provide appropriate hash functions so StringPiece and StringPiece16 can
+// be used as keys in hash sets and maps.
+
+// This hash function is copied from base/hash_tables.h. We don't use the
+// ones already defined for string and string16 directly because it would
+// require the string constructors to be called, which we don't want.
+#define HASH_STRING_PIECE(StringPieceType, string_piece)                \
+  std::size_t result = 0;                                               \
+  for (StringPieceType::const_iterator i = string_piece.begin();        \
+       i != string_piece.end(); ++i)                                    \
+    result = (result * 131) + *i;                                       \
+  return result;                                                        \
+
+namespace BASE_HASH_NAMESPACE {
+#if defined(COMPILER_GCC)
+
+template<>
+struct hash<base::StringPiece> {
+  std::size_t operator()(const base::StringPiece& sp) const {
+    HASH_STRING_PIECE(base::StringPiece, sp);
+  }
+};
+template<>
+struct hash<base::StringPiece16> {
+  std::size_t operator()(const base::StringPiece16& sp16) const {
+    HASH_STRING_PIECE(base::StringPiece16, sp16);
+  }
+};
+
+#elif defined(COMPILER_MSVC)
+
+inline size_t hash_value(const base::StringPiece& sp) {
+  HASH_STRING_PIECE(base::StringPiece, sp);
+}
+inline size_t hash_value(const base::StringPiece16& sp16) {
+  HASH_STRING_PIECE(base::StringPiece16, sp16);
+}
+
+#endif  // COMPILER
+
+}  // namespace BASE_HASH_NAMESPACE
+
 #endif  // BASE_STRING_PIECE_H_
diff --git a/base/string_piece_unittest.cc b/base/string_piece_unittest.cc
index c03b651..5339c09 100644
--- a/base/string_piece_unittest.cc
+++ b/base/string_piece_unittest.cc
@@ -5,6 +5,7 @@
 #include <string>
 
 #include "base/string_piece.h"
+#include "base/utf_string_conversions.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
 namespace base {
@@ -540,4 +541,234 @@ TEST(StringPieceTest, HeterogenousStringPieceEquals) {
   ASSERT_TRUE("hello" == StringPiece("hello"));
 }
 
+TEST(StringPiece16Test, CheckComparisonOperators) {
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("")) ==
+              StringPiece16(ASCIIToUTF16("")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) ==
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) ==
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) ==
+               StringPiece16(ASCIIToUTF16("")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("")) ==
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) ==
+               StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) ==
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) ==
+               StringPiece16(ASCIIToUTF16("a")));
+
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("")) !=
+               StringPiece16(ASCIIToUTF16("")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) !=
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) !=
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) !=
+              StringPiece16(ASCIIToUTF16("")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("")) !=
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) !=
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) !=
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) !=
+              StringPiece16(ASCIIToUTF16("a")));
+
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) <
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) <
+              StringPiece16(ASCIIToUTF16("bb")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) <
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("b")) <
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) <
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("b")) <
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("bb")) <
+               StringPiece16(ASCIIToUTF16("aa")));
+
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <=
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <=
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <=
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) <=
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) <=
+              StringPiece16(ASCIIToUTF16("bb")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("b")) <=
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) <=
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("b")) <=
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("bb")) <=
+               StringPiece16(ASCIIToUTF16("aa")));
+
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <=
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <=
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) <=
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) <=
+              StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) <=
+              StringPiece16(ASCIIToUTF16("bb")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("b")) <=
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) <=
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("b")) <=
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("bb")) <=
+               StringPiece16(ASCIIToUTF16("aa")));
+
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) >=
+               StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) >=
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) >=
+               StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) >=
+               StringPiece16(ASCIIToUTF16("bb")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("a")) >=
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("b")) >=
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) >=
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("b")) >=
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("bb")) >=
+              StringPiece16(ASCIIToUTF16("aa")));
+
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) >
+               StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) >
+               StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("a")) >
+               StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) >
+               StringPiece16(ASCIIToUTF16("b")));
+  ASSERT_FALSE(StringPiece16(ASCIIToUTF16("aa")) >
+               StringPiece16(ASCIIToUTF16("bb")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("b")) >
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("aa")) >
+              StringPiece16(ASCIIToUTF16("a")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("b")) >
+              StringPiece16(ASCIIToUTF16("aa")));
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("bb")) >
+              StringPiece16(ASCIIToUTF16("aa")));
+
+  string16 x;
+  for (int i = 0; i < 256; i++) {
+    x += 'a';
+    string16 y = x;
+    ASSERT_EQ(StringPiece16(x), StringPiece16(y));
+    for (int j = 0; j < i; j++) {
+      string16 z = x;
+      z[j] = 'b';       // Differs in position 'j'
+      ASSERT_NE(StringPiece16(x), StringPiece16(z));
+    }
+  }
+}
+
+TEST(StringPiece16Test, CheckSTL) {
+  string16 first = ASCIIToUTF16("abcdefghijklmnopqrstuvwxyz");
+  StringPiece16 a(first);
+  string16 second = ASCIIToUTF16("abc");
+  StringPiece16 b(second.c_str());
+  string16 third = ASCIIToUTF16("xyz");
+  StringPiece16 c(third.c_str(), third.size());
+  string16 fourth = ASCIIToUTF16("foobarrandomstuff");
+  StringPiece16 d(fourth.c_str(), 6);
+  StringPiece16 e;
+  // Check some non-ascii characters.
+  string16 fifth(ASCIIToUTF16("123"));
+  fifth.push_back(0x0000);
+  fifth.push_back(0xd8c5);
+  fifth.push_back(0xdffe);
+  StringPiece16 f(fifth);
+
+  ASSERT_EQ(a[6], 'g');
+  ASSERT_EQ(b[0], 'a');
+  ASSERT_EQ(c[2], 'z');
+  ASSERT_EQ(f[3], '\0');
+  ASSERT_EQ(f[5], static_cast<char16>(0xdffe));
+
+  ASSERT_EQ(*d.data(), 'f');
+  ASSERT_EQ(d.data()[5], 'r');
+  ASSERT_TRUE(e.data() == NULL);
+
+  ASSERT_EQ(*a.begin(), 'a');
+  ASSERT_EQ(*(b.begin() + 2), 'c');
+  ASSERT_EQ(*(c.end() - 1), 'z');
+
+  ASSERT_EQ(*a.rbegin(), 'z');
+  ASSERT_EQ(*(b.rbegin() + 2), 'a');
+  ASSERT_EQ(*(c.rend() - 1), 'x');
+  ASSERT_TRUE(a.rbegin() + 26 == a.rend());
+
+  ASSERT_EQ(a.size(), 26U);
+  ASSERT_EQ(b.size(), 3U);
+  ASSERT_EQ(c.size(), 3U);
+  ASSERT_EQ(d.size(), 6U);
+  ASSERT_EQ(e.size(), 0U);
+  ASSERT_EQ(f.size(), 6U);
+
+  ASSERT_TRUE(!d.empty());
+  ASSERT_TRUE(d.begin() != d.end());
+  ASSERT_TRUE(d.begin() + 6 == d.end());
+
+  ASSERT_TRUE(e.empty());
+  ASSERT_TRUE(e.begin() == e.end());
+
+  d.clear();
+  ASSERT_EQ(d.size(), 0U);
+  ASSERT_TRUE(d.empty());
+  ASSERT_TRUE(d.data() == NULL);
+  ASSERT_TRUE(d.begin() == d.end());
+
+  ASSERT_GE(a.max_size(), a.capacity());
+  ASSERT_GE(a.capacity(), a.size());
+}
+
+TEST(StringPiece16Test, CheckNULL) {
+  StringPiece16 s(NULL);
+  ASSERT_EQ(s.data(), (const char16*)NULL);
+  ASSERT_EQ(s.size(), 0U);
+
+  s.set(NULL);
+  ASSERT_EQ(s.data(), (const char16*)NULL);
+  ASSERT_EQ(s.size(), 0U);
+
+  string16 str = s.as_string16();
+  ASSERT_EQ(s.data(), (const char16*)NULL);
+  ASSERT_EQ(s.size(), 0U);
+}
+
+TEST(StringPiece16Test, HeterogenousStringPieceEquals) {
+  ASSERT_TRUE(StringPiece16(ASCIIToUTF16("hello")) == ASCIIToUTF16("hello"));
+}
+
+TEST(StringPiece16Test, CheckConversion) {
+  // Make sure that we can convert from UTF8 to UTF16 and back. We use a two
+  // byte character (G clef) to test this.
+  ASSERT_EQ(
+      UTF16ToUTF8(
+          StringPiece16(UTF8ToUTF16("\xf0\x9d\x84\x9e")).as_string16()),
+      "\xf0\x9d\x84\x9e");
+}
+
 }  // namespace base
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
index 73db09b..3b4ba85 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.cc
@@ -35,6 +35,9 @@ const int PhishingTermFeatureExtractor::kClockCheckGranularity = 10;
 // actual phishing page.
 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
 
+// The maximum size of the negative word cache.
+const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
+
 // All of the state pertaining to the current feature extraction.
 struct PhishingTermFeatureExtractor::ExtractionState {
   // Stores up to max_words_per_ngram_ previous words separated by spaces.
@@ -93,6 +96,7 @@ PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
     : page_term_hashes_(page_term_hashes),
       page_word_hashes_(page_word_hashes),
       max_words_per_term_(max_words_per_term),
+      negative_word_cache_(kMaxNegativeWordCacheSize),
       clock_(clock),
       ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
   Clear();
@@ -159,8 +163,8 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
        next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
     if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
       // next is now positioned at the end of a word.
-      HandleWord(string16(*page_text_, state_->position,
-                          next - state_->position));
+      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
+                                     next - state_->position));
       ++num_words;
     }
     state_->position = next;
@@ -196,10 +200,21 @@ void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
       // Otherwise, continue.
     }
   }
+  // We need to clear the cache because the data that it depends on (page_text_)
+  // is going away.
+  negative_word_cache_.Clear();
   RunCallback(true);
 }
 
-void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
+void PhishingTermFeatureExtractor::HandleWord(
+    const base::StringPiece16& word) {
+  // Quickest out if we have seen this word before and know that it's not
+  // part of any term. This avoids the SHA256, lowercasing, and UTF conversion,
+  // all of which are relatively expensive.
+  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
+    return;
+  }
+
   std::string word_lower = UTF16ToUTF8(base::i18n::ToLower(word));
   std::string word_hash = crypto::SHA256HashString(word_lower);
 
@@ -208,6 +223,8 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
     // Word doesn't exist in our terms so we can clear the n-gram state.
     state_->previous_words.clear();
     state_->previous_word_sizes.clear();
+    // Insert into negative cache so that we don't try this again.
+    negative_word_cache_.Put(word, true);
     return;
   }
 
@@ -221,16 +238,6 @@ void PhishingTermFeatureExtractor::HandleWord(const string16& word) {
   // Note that we don't yet add the new word length to previous_word_sizes,
   // since we don't want to compute the hash for the word by itself again.
   //
-  // TODO(bryner): Use UMA stats to determine whether this is too slow.
-  // If it is, there are a couple of cases that we could optimize:
-  //  - We could cache plaintext words that are not in page_word_hashes_, so
-  //    that we can avoid hashing these again.
-  //  - We could include positional information about words in the n-grams,
-  //    rather than just a list of all of the words.  For example, we could
-  //    change the term format so that each word is hashed separately, or
-  //    we could add extra data to the word list to indicate the position
-  //    at which the word appears in an n-gram, and skip checking the word if
-  //    it's not at that position.
   state_->previous_words.append(word_lower);
   std::string current_term = state_->previous_words;
   for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
diff --git a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
index 74c9b0b..d10b575 100644
--- a/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
+++ b/chrome/renderer/safe_browsing/phishing_term_feature_extractor.h
@@ -21,7 +21,9 @@
 #include "base/basictypes.h"
 #include "base/callback_old.h"
 #include "base/hash_tables.h"
+#include "base/memory/mru_cache.h"
 #include "base/memory/scoped_ptr.h"
+#include "base/string_piece.h"
 #include "base/string16.h"
 #include "base/task.h"
 
@@ -92,6 +94,10 @@ class PhishingTermFeatureExtractor {
   // before giving up on the current page.
   static const int kMaxTotalTimeMs;
 
+  // The size of the cache that we use to determine if we can avoid lower
+  // casing, hashing, and UTF conversion.
+  static const int kMaxNegativeWordCacheSize;
+
   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
   // until a predefined maximum amount of time has elapsed, then posts a task
   // to the current MessageLoop to continue extraction.  When extraction
@@ -99,7 +105,7 @@ class PhishingTermFeatureExtractor {
   void ExtractFeaturesWithTimeout();
 
   // Handles a single word in the page text.
-  void HandleWord(const string16& word);
+  void HandleWord(const base::StringPiece16& word);
 
   // Helper to verify that there is no pending feature extraction.  Dies in
   // debug builds if the state is not as expected.  This is a no-op in release
@@ -125,6 +131,13 @@ class PhishingTermFeatureExtractor {
   // The maximum number of words in an n-gram.
   size_t max_words_per_term_;
 
+  // This cache is used to see if we need to check the word at all, as
+  // converting to UTF8, lowercasing, and hashing are all relatively expensive
+  // operations. Though this is called an MRU cache, it seems to behave like
+  // an LRU cache (i.e. it evicts the oldest accesses first).
+  typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
+  WordCache negative_word_cache_;
+
   // Non-owned pointer to our clock.
   FeatureExtractorClock* clock_;
author	gcasto@chromium.org <gcasto@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-08-09 23:30:44 +0000
committer	gcasto@chromium.org <gcasto@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-08-09 23:30:44 +0000
commit	5cf71c5f13a3dbabae77e519d5762ac5295a0de9 (patch)
tree	bbe4c1ec6f2819ca8f3ee1329ae56be4ab152314
parent	460e571ba4d40edd7fe22d378d82f73bf555fc57 (diff)
download	chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.zip chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.tar.gz chromium_src-5cf71c5f13a3dbabae77e519d5762ac5295a0de9.tar.bz2