Step 3 integrating the HistoryQuickProvider: Implement searching and production of search results in the InMemoryURLIndex. Unit test the production of results. Minor change in URLHistoryProvuder unit test setup.

BUG=None TEST=None (unit tests added) Review URL: http://codereview.chromium.org/3364004 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@58955 0039d316-1c4b-4281-b951-d872f2087c98
author: mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-09-09 17:24:30 +0000
committer: mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-09-09 17:24:30 +0000
commit: 422a0b90cbcd884221d3fabac3507dd964e5a064 (patch)
tree: 43b636c7e063afe2ab3152c724de4688bbb28fc1 /chrome/browser/history
parent: 829a0c65bbded9a3262d5164e612ebdbc961d3c5 (diff)
download: chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.zip
chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.gz
chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.bz2
3 files changed, 466 insertions, 19 deletions
diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc
index 5715d39..95afa57 100644
--- a/chrome/browser/history/in_memory_url_index.cc
+++ b/chrome/browser/history/in_memory_url_index.cc
@@ -12,6 +12,7 @@
 #include "base/string_util.h"
 #include "base/time.h"
 #include "base/utf_string_conversions.h"
+#include "chrome/browser/autocomplete/history_provider_util.h"
 #include "chrome/browser/history/url_database.h"
 #include "net/base/escape.h"
 #include "net/base/net_util.h"
@@ -21,10 +22,23 @@ using base::TimeDelta;
 
 namespace history {
 
+ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
+
+ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
+                                       size_t input_location,
+                                       bool match_in_scheme,
+                                       bool innermost_match,
+                                       int score)
+    : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
+      raw_score(score) {
+}
+
 InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {}
 
 InMemoryURLIndex::~InMemoryURLIndex() {}
 
+static const int32_t kURLToLowerBufferSize = 2048;
+
 // Indexing
 
 bool InMemoryURLIndex::Init(history::URLDatabase* history_db,
@@ -61,9 +75,6 @@ bool InMemoryURLIndex::IndexRow(URLRow row) {
       UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
       NULL, NULL, NULL));
 
-  // TODO(mrossetti): Find or implement a ConvertPercentEncoding and use it
-  // on the url.
-
   // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
   HistoryID history_id = static_cast<HistoryID>(row.id());
 
@@ -77,7 +88,7 @@ bool InMemoryURLIndex::IndexRow(URLRow row) {
   history_info_map_.insert(std::make_pair(history_id, new_row));
 
   // Split into individual, unique words.
-  String16Set words = WordsFromString16(url);
+  String16Set words = WordSetFromString16(url);
 
   // For each word, add a new entry into the word index referring to the
   // associated history item.
@@ -90,26 +101,134 @@ bool InMemoryURLIndex::IndexRow(URLRow row) {
   return true;
 }
 
+// Searching
+
+ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
+    const String16Vector& terms) {
+  ScoredHistoryMatches scored_items;
+  if (!terms.empty()) {
+    // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
+    // and-sweep approach.
+    ResetTermCharWordSetCache();
+
+    // Lowercase the terms.
+    // TODO(mrossetti): Another opportunity for a transform algorithm.
+    String16Vector lower_terms;
+    for (String16Vector::const_iterator term_iter = terms.begin();
+         term_iter != terms.end(); ++term_iter) {
+      String16Vector::value_type lower_string(*term_iter);
+      std::transform(lower_string.begin(),
+                     lower_string.end(),
+                     lower_string.begin(),
+                     tolower);
+      lower_terms.push_back(lower_string);
+    }
+
+    String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
+    HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
+
+    // Pass over all of the candidates filtering out any without a proper
+    // substring match, inserting those which pass in order by score.
+    scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
+                                 AddHistoryMatch(*this,
+                                                 lower_terms)).ScoredMatches();
+  }
+
+  // Remove any stale TermCharWordSet's.
+  term_char_word_set_cache_.erase(
+      std::remove_if(term_char_word_set_cache_.begin(),
+                     term_char_word_set_cache_.end(),
+                     std::mem_fun_ref(&TermCharWordSet::IsNotUsed)),
+      term_char_word_set_cache_.end());
+  return scored_items;
+}
+
+void InMemoryURLIndex::ResetTermCharWordSetCache() {
+  // TODO(mrossetti): Consider keeping more of the cache around for possible
+  // repeat searches, except a 'shortcuts' approach might be better for that.
+  // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
+  for (TermCharWordSetVector::iterator iter =
+       term_char_word_set_cache_.begin();
+       iter != term_char_word_set_cache_.end(); ++iter)
+    iter->used_ = false;
+}
+
+InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
+    const string16& uni_string) {
+  // Break the terms down into individual terms (words), get the candidate
+  // set for each term, and intersect each to get a final candidate list.
+  // Note that a single 'term' from the user's perspective might be
+  // a string like "http://www.somewebsite.com" which, from our perspective,
+  // is four words: 'http', 'www', 'somewebsite', and 'com'.
+  HistoryIDSet history_id_set;
+  String16Set words = WordSetFromString16(uni_string);
+  bool first_word = true;
+  for (String16Set::iterator iter = words.begin();
+       iter != words.end(); ++iter) {
+    String16Set::value_type uni_word = *iter;
+    HistoryIDSet term_history_id_set =
+        InMemoryURLIndex::HistoryIDsForTerm(uni_word);
+    if (first_word) {
+      history_id_set.swap(term_history_id_set);
+      first_word = false;
+    } else {
+      HistoryIDSet old_history_id_set(history_id_set);
+      history_id_set.clear();
+      std::set_intersection(old_history_id_set.begin(),
+                            old_history_id_set.end(),
+                            term_history_id_set.begin(),
+                            term_history_id_set.end(),
+                            std::inserter(history_id_set,
+                                          history_id_set.begin()));
+    }
+  }
+  return history_id_set;
+}
+
+InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
+    const string16& uni_word) {
+  InMemoryURLIndex::HistoryIDSet history_id_set;
+
+  // For each character in the word, get the char/word_id map entry and
+  // intersect with the set in an incremental manner.
+  Char16Set uni_chars = CharactersFromString16(uni_word);
+  WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
+
+  // If any words resulted then we can compose a set of history IDs by unioning
+  // the sets from each word.
+  if (!word_id_set.empty()) {
+    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
+         word_id_iter != word_id_set.end(); ++word_id_iter) {
+      WordID word_id = *word_id_iter;
+      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
+      if (word_iter != word_id_history_map_.end()) {
+        HistoryIDSet& word_history_id_set(word_iter->second);
+        history_id_set.insert(word_history_id_set.begin(),
+                              word_history_id_set.end());
+      }
+    }
+  }
+
+  return history_id_set;
+}
+
 // Utility Functions
 
-InMemoryURLIndex::String16Set InMemoryURLIndex::WordsFromString16(
+// static
+InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
     const string16& uni_string) {
   String16Set words;
-
-  // TODO(mrossetti): Replace all | and _'s with a space, all % quoted
-  // characters with real characters, and break into words, using
-  // appropriate string16 functions.
   WordIterator iter(&uni_string, WordIterator::BREAK_WORD);
   if (iter.Init()) {
     while (iter.Advance()) {
-      if (iter.IsWord()) {
+      if (iter.IsWord())
         words.insert(iter.GetWord());
-      }
     }
   }
   return words;
 }
 
+// static
 InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16(
     const string16& uni_word) {
   Char16Set characters;
@@ -165,9 +284,215 @@ void InMemoryURLIndex::AddWordHistory(const string16& uni_word,
   }
 }
 
+InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
+    InMemoryURLIndex::Char16Set const& uni_chars) {
+  TermCharWordSet* best_term_char_word_set = NULL;
+  bool set_found = false;
+  size_t outstanding_count = 0;
+  Char16Set outstanding_chars;
+  for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
+       iter != term_char_word_set_cache_.end(); ++iter) {
+    TermCharWordSetVector::value_type& term_char_word_set(*iter);
+    Char16Set& char_set(term_char_word_set.char_set_);
+    Char16Set exclusions;
+    std::set_difference(char_set.begin(), char_set.end(),
+                        uni_chars.begin(), uni_chars.end(),
+                        std::inserter(exclusions, exclusions.begin()));
+    if (exclusions.empty()) {
+      // Do a reverse difference so that we know which characters remain
+      // to be indexed. Then decide if this is a better match than any
+      // previous cached set.
+      std::set_difference(uni_chars.begin(), uni_chars.end(),
+                          char_set.begin(), char_set.end(),
+                          std::inserter(exclusions, exclusions.begin()));
+      if (!set_found || exclusions.size() < outstanding_count) {
+        outstanding_chars = exclusions;
+        best_term_char_word_set = &*iter;
+        outstanding_count = exclusions.size();
+        set_found = true;
+      }
+    }
+  }
+
+  WordIDSet word_id_set;
+  if (set_found && outstanding_count == 0) {
+    // If there were no oustanding characters then we can use the cached one.
+    best_term_char_word_set->used_ = true;
+    word_id_set = best_term_char_word_set->word_id_set_;
+  } else {
+    // Some or all of the characters remain to be indexed.
+    bool first_character = true;
+    if (set_found) {
+      first_character = false;
+      word_id_set = best_term_char_word_set->word_id_set_;
+    } else {
+      outstanding_chars = uni_chars;
+    }
+
+    for (Char16Set::iterator uni_char_iter = outstanding_chars.begin();
+         uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
+      Char16Set::value_type uni_char = *uni_char_iter;
+      CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
+      if (char_iter == char_word_map_.end()) {
+        // The character was not found so bail.
+        word_id_set.clear();
+        break;
+      }
+      WordIDSet& char_word_id_set(char_iter->second);
+      if (first_character) {
+        word_id_set = char_word_id_set;
+        first_character = false;
+      } else {
+        WordIDSet old_word_id_set(word_id_set);
+        word_id_set.clear();
+        std::set_intersection(old_word_id_set.begin(),
+                              old_word_id_set.end(),
+                              char_word_id_set.begin(),
+                              char_word_id_set.end(),
+                              std::inserter(word_id_set,
+                              word_id_set.begin()));
+      }
+    }
+    // Update the cache.
+    if (!set_found || outstanding_count) {
+      term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars,
+          word_id_set, true));
+    }
+  }
+  return word_id_set;
+}
+
+// static
+int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
+                                     const String16Vector& terms,
+                                     size_t* first_term_location) {
+  GURL gurl = row.url();
+  if (!gurl.is_valid())
+    return 0;
+
+  string16 url = UTF8ToUTF16(gurl.spec());
+
+  // Collect all term start positions so we can see if they appear in order.
+  std::vector<size_t> term_locations;
+  int out_of_order = 0;  // Count the terms which are out of order.
+  size_t start_location_total = 0;
+  size_t term_length_total = 0;
+  for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
+       ++iter) {
+    string16 term = *iter;
+    size_t term_location = url.find(term);
+    if (term_location == string16::npos)
+      return 0;  // A term was not found.  This should not happen.
+    if (iter != terms.begin()) {
+      // See if this term is out-of-order.
+      for (std::vector<size_t>::const_iterator order_iter =
+               term_locations.begin(); order_iter != term_locations.end();
+               ++order_iter) {
+        if (term_location <= *order_iter)
+          ++out_of_order;
+      }
+    } else {
+      *first_term_location = term_location;
+    }
+    term_locations.push_back(term_location);
+    start_location_total += term_location;
+    term_length_total += term.size();
+  }
+
+  // Calculate a raw score.
+  // TODO(mrossetti): This is good enough for now but must be fine-tuned.
+  const float kOrderMaxValue = 10.0;
+  float order_value = 10.0;
+  if (terms.size() > 1) {
+    int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
+    order_value =
+        (static_cast<float>(max_possible_out_of_order - out_of_order) /
+         max_possible_out_of_order) * kOrderMaxValue;
+  }
+
+  const float kStartMaxValue = 10.0;
+  const size_t kMaxSignificantStart = 20;
+  float start_value =
+      (static_cast<float>(kMaxSignificantStart -
+       std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
+      static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
+
+  const float kCompleteMaxValue = 10.0;
+  float complete_value =
+      (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
+      kStartMaxValue;
+
+  const float kLastVisitMaxValue = 10.0;
+  const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
+  int64 delta_time = (kMaxSignificantDay -
+      std::min((base::Time::Now() - row.last_visit()),
+               kMaxSignificantDay)).ToInternalValue();
+  float last_visit_value =
+      (static_cast<float>(delta_time) /
+       static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
+      kLastVisitMaxValue;
+
+  const float kVisitCountMaxValue = 10.0;
+  const int kMaxSignificantVisits = 10;
+  float visit_count_value =
+      (static_cast<float>(std::min(row.visit_count(),
+       kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
+      kVisitCountMaxValue;
+  float raw_score = order_value + start_value + complete_value +
+      last_visit_value + visit_count_value;
+
+  // Normalize the score.
+  const float kMaxNormalizedRawScore = 1000.0;
+  raw_score =
+      (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
+                    kLastVisitMaxValue + kVisitCountMaxValue)) *
+      kMaxNormalizedRawScore;
+  return static_cast<int>(raw_score);
+}
+
 // static
 Time InMemoryURLIndex::RecentThreshold() {
   return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
 }
 
+void InMemoryURLIndex::AddHistoryMatch::operator()(
+    const InMemoryURLIndex::HistoryID history_id) {
+  HistoryInfoMap::const_iterator hist_pos =
+      index_.history_info_map_.find(history_id);
+  if (hist_pos != index_.history_info_map_.end()) {
+    const URLRow& hist_item = hist_pos->second;
+    // TODO(mrossetti): Accommodate multiple term highlighting.
+    size_t input_location = 0;
+    int score = InMemoryURLIndex::RawScoreForURL(
+        hist_item, lower_terms_, &input_location);
+    if (score != 0) {
+      // We only retain the top 10 highest scoring results so
+      // see if this one fits into the top 10 and, if so, where.
+      ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
+      while (scored_iter != scored_matches_.end() &&
+             (*scored_iter).raw_score > score)
+        ++scored_iter;
+      if ((scored_matches_.size() < 10) ||
+          (scored_iter != scored_matches_.end())) {
+        // Create and insert the new item.
+        // TODO(mrossetti): Properly set |match_in_scheme| and
+        // |innermost_match|.
+        bool match_in_scheme = false;
+        bool innermost_match = true;
+        ScoredHistoryMatch match(hist_item, input_location,
+            match_in_scheme, innermost_match, score);
+        if (!scored_matches_.empty())
+          scored_matches_.insert(scored_iter, match);
+        else
+          scored_matches_.push_back(match);
+        // Trim any entries beyond 10.
+        if (scored_matches_.size() > 10) {
+          scored_matches_.erase(scored_matches_.begin() + 10,
+                                scored_matches_.end());
+        }
+      }
+    }
+  }
+}
+
 }  // namespace history
diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h
index 5f7ee3a..b5b3373 100644
--- a/chrome/browser/history/in_memory_url_index.h
+++ b/chrome/browser/history/in_memory_url_index.h
@@ -6,6 +6,7 @@
 #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
 #pragma once
 
+#include <functional>
 #include <map>
 #include <set>
 #include <string>
@@ -28,6 +29,23 @@ namespace history {
 
 class URLDatabase;
 
+// Used for intermediate history result operations.
+struct ScoredHistoryMatch : public HistoryMatch {
+  // Required for STL, we don't use this directly.
+  ScoredHistoryMatch();
+
+  ScoredHistoryMatch(const URLRow& url_info,
+                     size_t input_location,
+                     bool match_in_scheme,
+                     bool innermost_match,
+                     int score);
+
+  // An interim score taking into consideration location and completeness
+  // of the match.
+  int raw_score;
+};
+typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches;
+
 // The URL history source.
 // Holds portions of the URL database in memory in an indexed form.  Used to
 // quickly look up matching URLs for a given query string.  Used by
@@ -65,16 +83,18 @@ class InMemoryURLIndex {
   // history index and return a vector with all scored, matching history items.
   // Each term must occur somewhere in the history item for the item to
   // qualify; however, the terms do not necessarily have to be adjacent.
-  HistoryMatches HistoryItemsForTerms(const String16Vector& terms);
+  // Results are sorted with higher scoring items first.
+  ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms);
 
   // Returns the date threshold for considering an history item as significant.
   static base::Time RecentThreshold();
 
  private:
+  friend class AddHistoryMatch;
   friend class InMemoryURLIndexTest;
   FRIEND_TEST(InMemoryURLIndexTest, Initialization);
 
-  // Convenience types
+  // Convenience types.
   typedef std::set<string16> String16Set;
   typedef std::set<char16> Char16Set;
 
@@ -103,6 +123,8 @@ class InMemoryURLIndex {
           word_id_set_(word_id_set),
           used_(used) {}
 
+    bool IsNotUsed() const { return !used_; }
+
     Char16Set char_set_;
     WordIDSet word_id_set_;
     bool used_;  // true if this set has been used for the current term search.
@@ -115,8 +137,33 @@ class InMemoryURLIndex {
   // A map from history_id to the history's URL and title.
   typedef std::map<HistoryID, URLRow> HistoryInfoMap;
 
+  // A helper class which performs the final filter on each candidate
+  // history URL match, inserting accepted matches into |scored_matches_|
+  // and trimming the maximum number of matches to 10.
+  class AddHistoryMatch : std::unary_function<HistoryID, void> {
+   public:
+    AddHistoryMatch(const InMemoryURLIndex& index,
+                    const String16Vector& lower_terms)
+        : index_(index),
+          lower_terms_(lower_terms) {}
+
+    void operator()(const HistoryID history_id);
+
+    ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
+
+   private:
+    const InMemoryURLIndex& index_;
+    ScoredHistoryMatches scored_matches_;
+    const String16Vector& lower_terms_;
+  };
+
   // Break a string down into individual words.
-  String16Set WordsFromString16(const string16& uni_string);
+  static String16Set WordSetFromString16(const string16& uni_string);
+
+  // Given a set of Char16s, find words containing those characters. If any
+  // existing, cached set is a proper subset then start with that cached
+  // set. Update the cache.
+  WordIDSet WordIDSetForTermChars(const Char16Set& uni_chars);
 
   // URL History indexing support functions.
 
@@ -145,6 +192,33 @@ class InMemoryURLIndex {
   // |history_id| as the initial element of the word's set.
   void AddWordHistory(const string16& uni_word, HistoryID history_id);
 
+  // Clear the search term cache. This cache holds on to the intermediate
+  // word results for each previously typed character to eliminate the need
+  // to re-perform set operations for previously typed characters.
+  void ResetTermCharWordSetCache();
+
+  // Compose a set of history item IDs by intersecting the set for each word
+  // in |uni_string|.
+  HistoryIDSet HistoryIDSetFromWords(const string16& uni_string);
+
+  // Helper function to HistoryIDSetFromWords which composes a set of history
+  // ids for the given term given in |uni_word|.
+  HistoryIDSet HistoryIDsForTerm(const string16& uni_word);
+
+  // Calculate a raw score for this history item by first determining
+  // if all of the terms in |terms_vector| occur in |row| and, if so,
+  // calculating a raw score based on 1) starting position of each term
+  // in the user input, 2) completeness of each term's match, 3) ordering
+  // of the occurrence of each term (i.e. they appear in order), 4) last
+  // visit time, and 5) number of visits.
+  // This raw score allows the results to be ordered and can be used
+  // to influence the final score calculated by the client of this
+  // index. Return the starting location of the first term in
+  // |first_term_location|.
+  static int RawScoreForURL(const URLRow& row,
+                            const String16Vector& terms_vector,
+                            size_t* first_term_location);
+
   // A list of all of indexed words. The index of a word in this list is the
   // ID of the word in the word_map_. It reduces the memory overhead by
   // replacing a potentially long and repeated string with a simple index.
diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc
index f399496..9306ec5 100644
--- a/chrome/browser/history/in_memory_url_index_unittest.cc
+++ b/chrome/browser/history/in_memory_url_index_unittest.cc
@@ -109,6 +109,7 @@ TEST_F(InMemoryURLIndexTest, Construction) {
   EXPECT_TRUE(url_index_.get());
 }
 
+// TODO(mrossetti): Write python script to calculate the validation criteria.
 TEST_F(InMemoryURLIndexTest, Initialization) {
   // Verify that the database contains the expected number of items, which
   // is the pre-filtered count, i.e. all of the items.
@@ -116,21 +117,68 @@ TEST_F(InMemoryURLIndexTest, Initialization) {
   EXPECT_TRUE(statement);
   uint64 row_count = 0;
   while (statement.Step()) ++row_count;
-  EXPECT_EQ(29U, row_count);
+  EXPECT_EQ(33U, row_count);
   url_index_.reset(new InMemoryURLIndex);
   url_index_->Init(this, "en,ja,hi,zh");
 
-  // There should have been 25 of the 29 urls accepted during filtering.
-  EXPECT_EQ(25U, url_index_->history_item_count_);
+  // There should have been 27 of the 31 urls accepted during filtering.
+  EXPECT_EQ(28U, url_index_->history_item_count_);
 
   // history_info_map_ should have the same number of items as were filtered.
-  EXPECT_EQ(25U, url_index_->history_info_map_.size());
+  EXPECT_EQ(28U, url_index_->history_info_map_.size());
 
   // The resulting indexes should account for:
   //    37 characters
   //    88 words
   EXPECT_EQ(37U, url_index_->char_word_map_.size());
-  EXPECT_EQ(88U, url_index_->word_map_.size());
+  EXPECT_EQ(91U, url_index_->word_map_.size());
+}
+
+TEST_F(InMemoryURLIndexTest, Retrieval) {
+  url_index_.reset(new InMemoryURLIndex);
+  std::string languages("en,ja,hi,zh");
+  url_index_->Init(this, languages);
+  InMemoryURLIndex::String16Vector terms;
+  // The term will be lowercased by the search.
+
+  // See if a very specific term gives a single result.
+  string16 term = ASCIIToUTF16("DrudgeReport");
+  terms.push_back(term);
+  ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(terms);
+  EXPECT_EQ(1U, matches.size());
+
+  // Search which should result in multiple results.
+  terms.clear();
+  term = ASCIIToUTF16("drudge");
+  terms.push_back(term);
+  matches = url_index_->HistoryItemsForTerms(terms);
+  ASSERT_EQ(2U, matches.size());
+  // The results should be in descending score order.
+  EXPECT_GT(matches[0].raw_score, matches[1].raw_score);
+
+  // Search which should result in nearly perfect result.
+  terms.clear();
+  term = ASCIIToUTF16("http");
+  terms.push_back(term);
+  term = ASCIIToUTF16("NearlyPerfectResult");
+  terms.push_back(term);
+  matches = url_index_->HistoryItemsForTerms(terms);
+  ASSERT_EQ(1U, matches.size());
+  // The results should have a very high score.
+  EXPECT_GT(matches[0].raw_score, 900);
+
+  // Search which should result in very poor result.
+  terms.clear();
+  term = ASCIIToUTF16("z");
+  terms.push_back(term);
+  term = ASCIIToUTF16("y");
+  terms.push_back(term);
+  term = ASCIIToUTF16("x");
+  terms.push_back(term);
+  matches = url_index_->HistoryItemsForTerms(terms);
+  ASSERT_EQ(1U, matches.size());
+  // The results should have a poor score.
+  EXPECT_LT(matches[0].raw_score, 200);
 }
 
 }  // namespace history
author	mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-09-09 17:24:30 +0000
committer	mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-09-09 17:24:30 +0000
commit	422a0b90cbcd884221d3fabac3507dd964e5a064 (patch)
tree	43b636c7e063afe2ab3152c724de4688bbb28fc1 /chrome/browser/history
parent	829a0c65bbded9a3262d5164e612ebdbc961d3c5 (diff)
download	chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.zip chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.gz chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.bz2