diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-12-01 20:51:43 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-12-01 20:51:43 +0000 |
commit | df3fb512f19d28c2803d4b1d6b6c66143581492b (patch) | |
tree | f5f323fe9996c1d54a4e71d62e8bf37e67e6ebea | |
parent | 39cbe0a7385fbd37d7b7332a9967c7408f08175d (diff) | |
download | chromium_src-df3fb512f19d28c2803d4b1d6b6c66143581492b.zip chromium_src-df3fb512f19d28c2803d4b1d6b6c66143581492b.tar.gz chromium_src-df3fb512f19d28c2803d4b1d6b6c66143581492b.tar.bz2 |
Improve Autocomplete Matches and Handling of Large Results Sets
Do not call FixupUserInput as it was prepending unexpected prefixes (such as file://) to the search string and bypassing valid results.
Move the search string decomposition operation from the HQP into the IMUI.
In the final substring filtering, use whitespace-delimited terms rather than words.
Instead of bailing out when we get a large results set (>500), filter it down to 500 by sorting on typed-count/visit-count/last-visit.
This means it is no longer necessary to bypass the HQP when there is only one character in the search term, so get rid of the ExpandedInMemoryURLIndexTest.ShortCircuit unit test.
BUG=101301,103575
TEST=Added unit tests.
Review URL: http://codereview.chromium.org/8526010
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@112527 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | chrome/browser/autocomplete/history_quick_provider.cc | 11 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.cc | 165 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.h | 55 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index_types.h | 1 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index_unittest.cc | 170 |
5 files changed, 223 insertions, 179 deletions
diff --git a/chrome/browser/autocomplete/history_quick_provider.cc b/chrome/browser/autocomplete/history_quick_provider.cc index 15c53b8..8c3fe80 100644 --- a/chrome/browser/autocomplete/history_quick_provider.cc +++ b/chrome/browser/autocomplete/history_quick_provider.cc @@ -59,11 +59,6 @@ void HistoryQuickProvider::Start(const AutocompleteInput& input, autocomplete_input_ = input; - // Do some fixup on the user input before matching against it, so we provide - // good results for local file paths, input with spaces, etc. - if (!FixupUserInput(&autocomplete_input_)) - return; - // TODO(pkasting): We should just block here until this loads. Any time // someone unloads the history backend, we'll get inconsistent inline // autocomplete behavior here. @@ -88,11 +83,7 @@ void HistoryQuickProvider::DeleteMatch(const AutocompleteMatch& match) {} void HistoryQuickProvider::DoAutocomplete() { // Get the matching URLs from the DB. string16 term_string = autocomplete_input_.text(); - term_string = net::UnescapeURLComponent(term_string, - net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); - history::String16Vector terms( - history::String16VectorFromString16(term_string, false)); - ScoredHistoryMatches matches = GetIndex()->HistoryItemsForTerms(terms); + ScoredHistoryMatches matches = GetIndex()->HistoryItemsForTerms(term_string); if (matches.empty()) return; diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc index 3dbbb79..cb61a57 100644 --- a/chrome/browser/history/in_memory_url_index.cc +++ b/chrome/browser/history/in_memory_url_index.cc @@ -400,63 +400,118 @@ void InMemoryURLIndex::DeleteURL(URLID row_id) { search_term_cache_.clear(); } -// Searching +// InMemoryURLIndex::AddHistoryMatch ------------------------------------------- + +InMemoryURLIndex::HistoryItemFactorGreater::HistoryItemFactorGreater( + const HistoryInfoMap& history_info_map) + : history_info_map_(history_info_map) { +} + 
+InMemoryURLIndex::HistoryItemFactorGreater::~HistoryItemFactorGreater() {} + +bool InMemoryURLIndex::HistoryItemFactorGreater::operator()( + const HistoryID h1, + const HistoryID h2) { + const URLRow& r1(history_info_map_.find(h1)->second); + const URLRow& r2(history_info_map_.find(h2)->second); + // First cut: typed count, visit count, recency. + // TODO(mrossetti): This is too simplistic. Consider an approach which ranks + // recently visited (within the last 12/24 hours) as highly important. Get + // input from mpearson. + if (r1.typed_count() != r2.typed_count()) + return (r1.typed_count() > r2.typed_count()); + if (r1.visit_count() != r2.visit_count()) + return (r1.visit_count() > r2.visit_count()); + return (r1.last_visit() > r2.last_visit()); +} + +// Searching ------------------------------------------------------------------- ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( - const String16Vector& terms) { + const string16& term_string) { + pre_filter_item_count = 0; + post_filter_item_count = 0; + post_scoring_item_count = 0; + string16 clean_string = net::UnescapeURLComponent(term_string, + net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); + string16 lower_string(base::i18n::ToLower(clean_string)); + String16Vector words( + history::String16VectorFromString16(lower_string, false)); ScoredHistoryMatches scored_items; // Do nothing if we have indexed no words (probably because we've not been - // initialized yet). - if (private_data_->word_list_.empty()) + // initialized yet) or the search string has no words. + if (private_data_->word_list_.empty() || words.empty()) { + search_term_cache_.clear(); // Invalidate the term cache. return scored_items; + } - if (!terms.empty()) { - // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep - // approach. - ResetSearchTermCache(); - - // Lowercase the terms. - // TODO(mrossetti): Another opportunity for a transform algorithm. 
- String16Vector lower_terms; - for (String16Vector::const_iterator term_iter = terms.begin(); - term_iter != terms.end(); ++term_iter) - lower_terms.push_back(base::i18n::ToLower(*term_iter)); - - string16 all_terms(JoinString(lower_terms, ' ')); - HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); - - // Don't perform any scoring (and don't return any matches) if the - // candidate pool is large. (See comments in header.) - const size_t kItemsToScoreLimit = 500; - if (history_id_set.size() <= kItemsToScoreLimit) { - // Pass over all of the candidates filtering out any without a proper - // substring match, inserting those which pass in order by score. - scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), - AddHistoryMatch(*this, lower_terms)).ScoredMatches(); - - // Select and sort only the top kMaxMatches results. - if (scored_items.size() > AutocompleteProvider::kMaxMatches) { - std::partial_sort(scored_items.begin(), - scored_items.begin() + - AutocompleteProvider::kMaxMatches, - scored_items.end(), - ScoredHistoryMatch::MatchScoreGreater); - scored_items.resize(AutocompleteProvider::kMaxMatches); - } else { - std::sort(scored_items.begin(), scored_items.end(), - ScoredHistoryMatch::MatchScoreGreater); - } - } + // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep + // approach. + ResetSearchTermCache(); + + HistoryIDSet history_id_set = HistoryIDSetFromWords(words); + + // Trim the candidate pool if it is large. Note that we do not filter out + // items that do not contain the search terms as proper substrings -- doing + // so is the performance-costly operation we are trying to avoid in order + // to maintain omnibox responsiveness. 
+ const size_t kItemsToScoreLimit = 500; + pre_filter_item_count = history_id_set.size(); + // If we trim the results set we do not want to cache the results for next + // time as the user's ultimately desired result could easily be eliminated + // in this early rough filter. + bool was_trimmed = (pre_filter_item_count > kItemsToScoreLimit); + if (was_trimmed) { + HistoryIDVector history_ids; + std::copy(history_id_set.begin(), history_id_set.end(), + std::back_inserter(history_ids)); + // Trim down the set by sorting by typed-count, visit-count, and last + // visit. + HistoryItemFactorGreater + item_factor_functor(private_data_->history_info_map_); + std::partial_sort(history_ids.begin(), + history_ids.begin() + kItemsToScoreLimit, + history_ids.end(), + item_factor_functor); + history_id_set.clear(); + std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, + std::inserter(history_id_set, history_id_set.end())); + post_filter_item_count = history_id_set.size(); } - // Remove any stale SearchTermCacheItems. - for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); - cache_iter != search_term_cache_.end(); ) { - if (!cache_iter->second.used_) - search_term_cache_.erase(cache_iter++); - else - ++cache_iter; + // Pass over all of the candidates filtering out any without a proper + // substring match, inserting those which pass in order by score. + history::String16Vector terms; + Tokenize(lower_string, kWhitespaceUTF16, &terms); + scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), + AddHistoryMatch(*this, terms)).ScoredMatches(); + + // Select and sort only the top kMaxMatches results. 
+ if (scored_items.size() > AutocompleteProvider::kMaxMatches) { + std::partial_sort(scored_items.begin(), + scored_items.begin() + + AutocompleteProvider::kMaxMatches, + scored_items.end(), + ScoredHistoryMatch::MatchScoreGreater); + scored_items.resize(AutocompleteProvider::kMaxMatches); + } else { + std::sort(scored_items.begin(), scored_items.end(), + ScoredHistoryMatch::MatchScoreGreater); + } + post_scoring_item_count = scored_items.size(); + + if (was_trimmed) { + search_term_cache_.clear(); // Invalidate the term cache. + } else { + // Remove any stale SearchTermCacheItems. + for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); + cache_iter != search_term_cache_.end(); ) { + if (!cache_iter->second.used_) + search_term_cache_.erase(cache_iter++); + else + ++cache_iter; + } } return scored_items; @@ -469,19 +524,19 @@ void InMemoryURLIndex::ResetSearchTermCache() { } HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( - const string16& uni_string) { + const String16Vector& unsorted_words) { // Break the terms down into individual terms (words), get the candidate // set for each term, and intersect each to get a final candidate list. // Note that a single 'term' from the user's perspective might be // a string like "http://www.somewebsite.com" which, from our perspective, // is four words: 'http', 'www', 'somewebsite', and 'com'. HistoryIDSet history_id_set; - String16Vector terms = String16VectorFromString16(uni_string, true); + String16Vector words(unsorted_words); // Sort the terms into the longest first as such are likely to narrow down // the results quicker. Also, single character terms are the most expensive // to process so save them for last. 
- std::sort(terms.begin(), terms.end(), LengthGreater); - for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); + std::sort(words.begin(), words.end(), LengthGreater); + for (String16Vector::iterator iter = words.begin(); iter != words.end(); ++iter) { string16 uni_word = *iter; HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); @@ -489,7 +544,7 @@ HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( history_id_set.clear(); break; } - if (iter == terms.begin()) { + if (iter == words.begin()) { history_id_set.swap(term_history_set); } else { HistoryIDSet new_history_id_set; @@ -821,6 +876,8 @@ int InMemoryURLIndex::ScoreComponentForMatches(const TermMatches& matches, return ScoreForValue(raw_score, kTermScoreLevel); } +// InMemoryURLIndex::AddHistoryMatch ------------------------------------------- + InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( const InMemoryURLIndex& index, const String16Vector& lower_terms) @@ -854,6 +911,8 @@ bool InMemoryURLIndex::URLSchemeIsWhitelisted(const GURL& gurl) const { return scheme_whitelist_.find(gurl.scheme()) != scheme_whitelist_.end(); } +// Cache Management ------------------------------------------------------------ + void InMemoryURLIndex::SavePrivateData(InMemoryURLIndexCacheItem* cache) const { DCHECK(cache); cache->set_timestamp(base::Time::Now().ToInternalValue()); diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h index bcbd676..b79f178 100644 --- a/chrome/browser/history/in_memory_url_index.h +++ b/chrome/browser/history/in_memory_url_index.h @@ -90,19 +90,22 @@ class InMemoryURLIndex { // directory. bool SaveToCacheFile(); - // Given a vector containing one or more words as string16s, scans the - // history index and return a vector with all scored, matching history items. 
- // Each term must occur somewhere in the history item's URL or page title for - // the item to qualify; however, the terms do not necessarily have to be - // adjacent. Results are sorted with higher scoring items first. Each term - // from |terms| may contain punctuation but should not contain spaces. - // A search request which results in more than |kItemsToScoreLimit| total - // candidate items returns no matches (though the results set will be - // retained and used for subsequent calls to this function) as the scoring - // of such a large number of candidates may cause perceptible typing response - // delays in the omnibox. This is likely to occur for short omnibox terms - // such as 'h' and 'w' which will be found in nearly all history candidates. - ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); + // Given a string16 in |term_string|, scans the history index and returns a + // vector with all scored, matching history items. The |term_string| is + // broken down into individual terms (words), each of which must occur in the + // candidate history item's URL or page title for the item to qualify; + // however, the terms do not necessarily have to be adjacent. Once we have + // a set of candidates, they are filtered to insure that all |term_string| + // terms, as separated by whitespace, occur within the candidate's URL + // or page title. Scores are then calculated on no more than + // |kItemsToScoreLimit| candidates, as the scoring of such a large number of + // candidates may cause perceptible typing response delays in the omnibox. + // This is likely to occur for short omnibox terms such as 'h' and 'w' which + // will be found in nearly all history candidates. Results are sorted by + // descending score. The full results set (i.e. beyond the + // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls + // to this function. 
+ ScoredHistoryMatches HistoryItemsForTerms(const string16& term_string); // Updates or adds an history item to the index if it meets the minimum // 'quick' criteria. @@ -119,7 +122,7 @@ class InMemoryURLIndex { FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); - FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities); + FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, NonUniqueTermCharacterSets); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions); @@ -184,6 +187,20 @@ class InMemoryURLIndex { const String16Vector& lower_terms_; }; + // A helper predicate class used to filter excess history items when the + // candidate results set is too large. + class HistoryItemFactorGreater + : public std::binary_function<HistoryID, HistoryID, void> { + public: + explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map); + ~HistoryItemFactorGreater(); + + bool operator()(const HistoryID h1, const HistoryID h2); + + private: + const history::HistoryInfoMap& history_info_map_; + }; + // Initializes all index data members in preparation for restoring the index // from the cache or a complete rebuild from the history database. void ClearPrivateData(); @@ -221,8 +238,8 @@ class InMemoryURLIndex { void ResetSearchTermCache(); // Composes a set of history item IDs by intersecting the set for each word - // in |uni_string|. - HistoryIDSet HistoryIDSetFromWords(const string16& uni_string); + // in |unsorted_words|. + HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words); // Helper function to HistoryIDSetFromWords which composes a set of history // ids for the given term given in |term|. 
@@ -306,6 +323,12 @@ class InMemoryURLIndex { // http://crbug.com/83659 bool cached_at_shutdown_; + // Used for unit testing only. Records the number of candidate history items + // at three stages in the index searching process. + size_t pre_filter_item_count; // After word index is queried. + size_t post_filter_item_count; // After trimming large result set. + size_t post_scoring_item_count; // After performing final filter and scoring. + DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); }; diff --git a/chrome/browser/history/in_memory_url_index_types.h b/chrome/browser/history/in_memory_url_index_types.h index 03f772c..b309c61 100644 --- a/chrome/browser/history/in_memory_url_index_types.h +++ b/chrome/browser/history/in_memory_url_index_types.h @@ -132,6 +132,7 @@ typedef std::map<char16, WordIDSet> CharWordIDMap; // A map from word (by word_id) to history items containing that word. typedef history::URLID HistoryID; typedef std::set<HistoryID> HistoryIDSet; +typedef std::vector<HistoryID> HistoryIDVector; typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap; diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc index c4420bc..d60cb46 100644 --- a/chrome/browser/history/in_memory_url_index_unittest.cc +++ b/chrome/browser/history/in_memory_url_index_unittest.cc @@ -10,6 +10,7 @@ #include "base/string16.h" #include "base/string_util.h" #include "base/utf_string_conversions.h" +#include "chrome/browser/autocomplete/autocomplete.h" #include "chrome/browser/history/in_memory_database.h" #include "chrome/browser/history/in_memory_url_index.h" #include "chrome/browser/history/in_memory_url_index_types.h" @@ -194,26 +195,6 @@ FilePath::StringType LimitedInMemoryURLIndexTest::TestDBName() const { return FILE_PATH_LITERAL("url_history_provider_test_limited.db.txt"); } -class ExpandedInMemoryURLIndexTest : public InMemoryURLIndexTest { - protected: 
- virtual void SetUp(); -}; - -void ExpandedInMemoryURLIndexTest::SetUp() { - InMemoryURLIndexTest::SetUp(); - // Add 600 more history items. - // NOTE: Keep the string length constant at least the length of the format - // string plus 5 to account for a 3 digit number and terminator. - char url_format[] = "http://www.google.com/%d"; - const size_t kMaxLen = arraysize(url_format) + 5; - char url_string[kMaxLen + 1]; - for (int i = 0; i < 600; ++i) { - base::snprintf(url_string, kMaxLen, url_format, i); - URLRow row(MakeURLRow(url_string, "Google Search", 20, 0, 20)); - AddURL(row); - } -} - TEST_F(InMemoryURLIndexTest, Construction) { url_index_.reset(new InMemoryURLIndex(FilePath())); EXPECT_TRUE(url_index_.get()); @@ -244,7 +225,7 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { // See if a very specific term gives a single result. ScoredHistoryMatches matches = - url_index_->HistoryItemsForTerms(Make1Term("DrudgeReport")); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("DrudgeReport")); ASSERT_EQ(1U, matches.size()); // Verify that we got back the result we expected. @@ -253,14 +234,14 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { EXPECT_EQ(ASCIIToUTF16("DRUDGE REPORT 2010"), matches[0].url_info.title()); // Search which should result in multiple results. - matches = url_index_->HistoryItemsForTerms(Make1Term("drudge")); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("drudge")); ASSERT_EQ(2U, matches.size()); // The results should be in descending score order. EXPECT_GE(matches[0].raw_score, matches[1].raw_score); // Search which should result in nearly perfect result. - matches = url_index_->HistoryItemsForTerms(Make2Terms("https", - "NearlyPerfectResult")); + matches = url_index_->HistoryItemsForTerms( + ASCIIToUTF16("https NearlyPerfectResult")); ASSERT_EQ(1U, matches.size()); // The results should have a very high score. 
EXPECT_GT(matches[0].raw_score, 900); @@ -271,11 +252,7 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { matches[0].url_info.title()); // Search which should result in very poor result. - String16Vector original_terms; - original_terms.push_back(ASCIIToUTF16("z")); - original_terms.push_back(ASCIIToUTF16("y")); - original_terms.push_back(ASCIIToUTF16("x")); - matches = url_index_->HistoryItemsForTerms(original_terms); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("z y x")); ASSERT_EQ(1U, matches.size()); // The results should have a poor score. EXPECT_LT(matches[0].raw_score, 500); @@ -286,22 +263,46 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { matches[0].url_info.title()); // Search which will match at the end of an URL with encoded characters. - matches = url_index_->HistoryItemsForTerms(Make1Term("ice")); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("ice")); ASSERT_EQ(1U, matches.size()); } -TEST_F(ExpandedInMemoryURLIndexTest, ShortCircuit) { +TEST_F(InMemoryURLIndexTest, ProperStringMatching) { url_index_.reset(new InMemoryURLIndex(FilePath())); url_index_->Init(this, "en,ja,hi,zh"); - // A search for 'w' should short-circuit and not return any matches. + // Search for the following with the expected results: + // "atdmt view" - found + // "atdmt.view" - not found + // "view.atdmt" - found ScoredHistoryMatches matches = - url_index_->HistoryItemsForTerms(Make1Term("w")); - EXPECT_TRUE(matches.empty()); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("atdmt view")); + ASSERT_EQ(1U, matches.size()); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("atdmt.view")); + ASSERT_EQ(0U, matches.size()); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("view.atdmt")); + ASSERT_EQ(1U, matches.size()); +} - // A search for 'working' should not short-circuit. 
- matches = url_index_->HistoryItemsForTerms(Make1Term("working")); - EXPECT_EQ(1U, matches.size()); +TEST_F(InMemoryURLIndexTest, HugeResultSet) { + url_index_.reset(new InMemoryURLIndex(FilePath())); + url_index_->Init(this, "en,ja,hi,zh"); + + // Create a huge set of qualifying history items. + for (URLID row_id = 5000; row_id < 6000; ++row_id) { + URLRow new_row(GURL("http://www.brokeandaloneinmanitoba.com/"), row_id); + new_row.set_last_visit(base::Time::Now()); + url_index_->UpdateURL(row_id, new_row); + } + + ScoredHistoryMatches matches = + url_index_->HistoryItemsForTerms(ASCIIToUTF16("b")); + ASSERT_EQ(AutocompleteProvider::kMaxMatches, matches.size()); + // There are 7 matches already in the database. + ASSERT_EQ(1007U, url_index_->pre_filter_item_count); + ASSERT_EQ(500U, url_index_->post_filter_item_count); + ASSERT_EQ(AutocompleteProvider::kMaxMatches, + url_index_->post_scoring_item_count); } TEST_F(InMemoryURLIndexTest, TitleSearch) { @@ -309,14 +310,10 @@ TEST_F(InMemoryURLIndexTest, TitleSearch) { url_index_->Init(this, "en,ja,hi,zh"); // Signal if someone has changed the test DB. EXPECT_EQ(27U, url_index_->private_data_->history_info_map_.size()); - String16Vector original_terms; // Ensure title is being searched. - original_terms.push_back(ASCIIToUTF16("MORTGAGE")); - original_terms.push_back(ASCIIToUTF16("RATE")); - original_terms.push_back(ASCIIToUTF16("DROPS")); ScoredHistoryMatches matches = - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("MORTGAGE RATE DROPS")); ASSERT_EQ(1U, matches.size()); // Verify that we got back the result we expected. @@ -333,12 +330,8 @@ TEST_F(InMemoryURLIndexTest, TitleChange) { url_index_->Init(this, "en,ja,hi,zh"); // Verify current title terms retrieves desired item. 
- String16Vector original_terms; - original_terms.push_back(ASCIIToUTF16("lebronomics")); - original_terms.push_back(ASCIIToUTF16("could")); - original_terms.push_back(ASCIIToUTF16("high")); - original_terms.push_back(ASCIIToUTF16("taxes")); - original_terms.push_back(ASCIIToUTF16("influence")); + string16 original_terms = + ASCIIToUTF16("lebronomics could high taxes influence"); ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(original_terms); ASSERT_EQ(1U, matches.size()); @@ -354,13 +347,7 @@ TEST_F(InMemoryURLIndexTest, TitleChange) { URLRow old_row(matches[0].url_info); // Verify new title terms retrieves nothing. - String16Vector new_terms; - new_terms.push_back(ASCIIToUTF16("does")); - new_terms.push_back(ASCIIToUTF16("eat")); - new_terms.push_back(ASCIIToUTF16("oats")); - new_terms.push_back(ASCIIToUTF16("little")); - new_terms.push_back(ASCIIToUTF16("lambs")); - new_terms.push_back(ASCIIToUTF16("ivy")); + string16 new_terms = ASCIIToUTF16("does eat oats little lambs ivy"); matches = url_index_->HistoryItemsForTerms(new_terms); ASSERT_EQ(0U, matches.size()); @@ -383,25 +370,25 @@ TEST_F(InMemoryURLIndexTest, NonUniqueTermCharacterSets) { // The presence of duplicate characters should succeed. Exercise by cycling // through a string with several duplicate characters. 
ScoredHistoryMatches matches = - url_index_->HistoryItemsForTerms(Make1Term("ABRA")); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("ABRA")); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(28, matches[0].url_info.id()); EXPECT_EQ("http://www.ddj.com/windows/184416623", matches[0].url_info.url().spec()); - matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACAD")); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("ABRACAD")); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(28, matches[0].url_info.id()); - matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACADABRA")); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("ABRACADABRA")); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(28, matches[0].url_info.id()); - matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACADABR")); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("ABRACADABR")); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(28, matches[0].url_info.id()); - matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACA")); + matches = url_index_->HistoryItemsForTerms(ASCIIToUTF16("ABRACA")); ASSERT_EQ(1U, matches.size()); EXPECT_EQ(28, matches[0].url_info.id()); } @@ -426,58 +413,42 @@ TEST_F(InMemoryURLIndexTest, TypedCharacterCaching) { // Simulate typing "r" giving "r" in the simulated omnibox. The results for // 'r' will be not cached because it is only 1 character long. - String16Vector original_terms; - string16 term_r = ASCIIToUTF16("r"); - original_terms.push_back(term_r); - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("r")); EXPECT_EQ(0U, cache.size()); // Simulate typing "re" giving "r re" in the simulated omnibox. - string16 term_re = ASCIIToUTF16("re"); - original_terms.push_back(term_re); // 're' should be cached at this point but not 'r' as it is a single // character. 
- ASSERT_EQ(2U, original_terms.size()); - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("r re")); ASSERT_EQ(1U, cache.size()); - CheckTerm(cache, term_re); + CheckTerm(cache, ASCIIToUTF16("re")); // Simulate typing "reco" giving "r re reco" in the simulated omnibox. - string16 term_reco = ASCIIToUTF16("reco"); - original_terms.push_back(term_reco); // 're' and 'reco' should be cached at this point but not 'r' as it is a // single character. - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("r re reco")); ASSERT_EQ(2U, cache.size()); - CheckTerm(cache, term_re); - CheckTerm(cache, term_reco); - - original_terms.clear(); // Simulate pressing <ESC>. + CheckTerm(cache, ASCIIToUTF16("re")); + CheckTerm(cache, ASCIIToUTF16("reco")); // Simulate typing "mort". - string16 term_mort = ASCIIToUTF16("mort"); - original_terms.push_back(term_mort); // Since we now have only one search term, the cached results for 're' and // 'reco' should be purged, giving us only 1 item in the cache (for 'mort'). - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("mort")); ASSERT_EQ(1U, cache.size()); - CheckTerm(cache, term_mort); + CheckTerm(cache, ASCIIToUTF16("mort")); // Simulate typing "reco" giving "mort reco" in the simulated omnibox. - original_terms.push_back(term_reco); - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("mort reco")); ASSERT_EQ(2U, cache.size()); - CheckTerm(cache, term_mort); - CheckTerm(cache, term_reco); + CheckTerm(cache, ASCIIToUTF16("mort")); + CheckTerm(cache, ASCIIToUTF16("reco")); // Simulate a <DELETE> by removing the 'reco' and adding back the 'rec'. 
- original_terms.resize(original_terms.size() - 1); - string16 term_rec = ASCIIToUTF16("rec"); - original_terms.push_back(term_rec); - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("mort rec")); ASSERT_EQ(2U, cache.size()); - CheckTerm(cache, term_mort); - CheckTerm(cache, term_rec); + CheckTerm(cache, ASCIIToUTF16("mort")); + CheckTerm(cache, ASCIIToUTF16("rec")); } TEST_F(InMemoryURLIndexTest, Scoring) { @@ -518,14 +489,13 @@ TEST_F(InMemoryURLIndexTest, Scoring) { TEST_F(InMemoryURLIndexTest, AddNewRows) { url_index_.reset(new InMemoryURLIndex(FilePath())); url_index_->Init(this, "en,ja,hi,zh"); - String16Vector original_terms; // Verify that the row we're going to add does not already exist. URLID new_row_id = 87654321; // Newly created URLRows get a last_visit time of 'right now' so it should // qualify as a quick result candidate. - original_terms.push_back(ASCIIToUTF16("brokeandalone")); - EXPECT_TRUE(url_index_->HistoryItemsForTerms(original_terms).empty()); + EXPECT_TRUE(url_index_->HistoryItemsForTerms( + ASCIIToUTF16("brokeandalone")).empty()); // Add a new row. URLRow new_row(GURL("http://www.brokeandaloneinmanitoba.com/"), new_row_id); @@ -533,27 +503,27 @@ TEST_F(InMemoryURLIndexTest, AddNewRows) { url_index_->UpdateURL(new_row_id, new_row); // Verify that we can retrieve it. - EXPECT_EQ(1U, url_index_->HistoryItemsForTerms(original_terms).size()); + EXPECT_EQ(1U, url_index_->HistoryItemsForTerms( + ASCIIToUTF16("brokeandalone")).size()); // Add it again just to be sure that is harmless. 
url_index_->UpdateURL(new_row_id, new_row); - EXPECT_EQ(1U, url_index_->HistoryItemsForTerms(original_terms).size()); + EXPECT_EQ(1U, url_index_->HistoryItemsForTerms( + ASCIIToUTF16("brokeandalone")).size()); } TEST_F(InMemoryURLIndexTest, DeleteRows) { url_index_.reset(new InMemoryURLIndex(FilePath())); url_index_->Init(this, "en,ja,hi,zh"); - String16Vector original_terms; - // Make sure we actually get an existing result. - original_terms.push_back(ASCIIToUTF16("DrudgeReport")); ScoredHistoryMatches matches = - url_index_->HistoryItemsForTerms(original_terms); + url_index_->HistoryItemsForTerms(ASCIIToUTF16("DrudgeReport")); ASSERT_EQ(1U, matches.size()); // Determine the row id for that result, delete that id, then search again. url_index_->DeleteURL(matches[0].url_info.id()); - EXPECT_TRUE(url_index_->HistoryItemsForTerms(original_terms).empty()); + EXPECT_TRUE(url_index_->HistoryItemsForTerms( + ASCIIToUTF16("DrudgeReport")).empty()); } TEST_F(InMemoryURLIndexTest, WhitelistedURLs) { |