summaryrefslogtreecommitdiffstats
path: root/chrome/browser/history
diff options
context:
space:
mode:
authormrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-09-09 17:24:30 +0000
committermrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-09-09 17:24:30 +0000
commit422a0b90cbcd884221d3fabac3507dd964e5a064 (patch)
tree43b636c7e063afe2ab3152c724de4688bbb28fc1 /chrome/browser/history
parent829a0c65bbded9a3262d5164e612ebdbc961d3c5 (diff)
downloadchromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.zip
chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.gz
chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.bz2
Step 3 integrating the HistoryQuickProvider: Implement searching and production of search results in the InMemoryURLIndex. Unit test the production of results. Minor change in URLHistoryProvuder unit test setup.
BUG=None TEST=None (unit tests added) Review URL: http://codereview.chromium.org/3364004 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@58955 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history')
-rw-r--r--chrome/browser/history/in_memory_url_index.cc347
-rw-r--r--chrome/browser/history/in_memory_url_index.h80
-rw-r--r--chrome/browser/history/in_memory_url_index_unittest.cc58
3 files changed, 466 insertions, 19 deletions
diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc
index 5715d39..95afa57 100644
--- a/chrome/browser/history/in_memory_url_index.cc
+++ b/chrome/browser/history/in_memory_url_index.cc
@@ -12,6 +12,7 @@
#include "base/string_util.h"
#include "base/time.h"
#include "base/utf_string_conversions.h"
+#include "chrome/browser/autocomplete/history_provider_util.h"
#include "chrome/browser/history/url_database.h"
#include "net/base/escape.h"
#include "net/base/net_util.h"
@@ -21,10 +22,23 @@ using base::TimeDelta;
namespace history {
+ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {}
+
+ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info,
+ size_t input_location,
+ bool match_in_scheme,
+ bool innermost_match,
+ int score)
+ : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match),
+ raw_score(score) {
+}
+
InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {}
InMemoryURLIndex::~InMemoryURLIndex() {}
+static const int32_t kURLToLowerBufferSize = 2048;
+
// Indexing
bool InMemoryURLIndex::Init(history::URLDatabase* history_db,
@@ -61,9 +75,6 @@ bool InMemoryURLIndex::IndexRow(URLRow row) {
UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
NULL, NULL, NULL));
- // TODO(mrossetti): Find or implement a ConvertPercentEncoding and use it
- // on the url.
-
// TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max().
HistoryID history_id = static_cast<HistoryID>(row.id());
@@ -77,7 +88,7 @@ bool InMemoryURLIndex::IndexRow(URLRow row) {
history_info_map_.insert(std::make_pair(history_id, new_row));
// Split into individual, unique words.
- String16Set words = WordsFromString16(url);
+ String16Set words = WordSetFromString16(url);
// For each word, add a new entry into the word index referring to the
// associated history item.
@@ -90,26 +101,134 @@ bool InMemoryURLIndex::IndexRow(URLRow row) {
return true;
}
+// Searching
+
+ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms(
+ const String16Vector& terms) {
+ ScoredHistoryMatches scored_items;
+ if (!terms.empty()) {
+ // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
+ // and-sweep approach.
+ ResetTermCharWordSetCache();
+
+ // Lowercase the terms.
+ // TODO(mrossetti): Another opportunity for a transform algorithm.
+ String16Vector lower_terms;
+ for (String16Vector::const_iterator term_iter = terms.begin();
+ term_iter != terms.end(); ++term_iter) {
+ String16Vector::value_type lower_string(*term_iter);
+ std::transform(lower_string.begin(),
+ lower_string.end(),
+ lower_string.begin(),
+ tolower);
+ lower_terms.push_back(lower_string);
+ }
+
+ String16Vector::value_type all_terms(JoinString(lower_terms, ' '));
+ HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms);
+
+ // Pass over all of the candidates filtering out any without a proper
+ // substring match, inserting those which pass in order by score.
+ scored_items = std::for_each(history_id_set.begin(), history_id_set.end(),
+ AddHistoryMatch(*this,
+ lower_terms)).ScoredMatches();
+ }
+
+ // Remove any stale TermCharWordSet's.
+ term_char_word_set_cache_.erase(
+ std::remove_if(term_char_word_set_cache_.begin(),
+ term_char_word_set_cache_.end(),
+ std::mem_fun_ref(&TermCharWordSet::IsNotUsed)),
+ term_char_word_set_cache_.end());
+ return scored_items;
+}
+
+void InMemoryURLIndex::ResetTermCharWordSetCache() {
+ // TODO(mrossetti): Consider keeping more of the cache around for possible
+ // repeat searches, except a 'shortcuts' approach might be better for that.
+ // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
+ for (TermCharWordSetVector::iterator iter =
+ term_char_word_set_cache_.begin();
+ iter != term_char_word_set_cache_.end(); ++iter)
+ iter->used_ = false;
+}
+
+InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(
+ const string16& uni_string) {
+ // Break the terms down into individual terms (words), get the candidate
+ // set for each term, and intersect each to get a final candidate list.
+ // Note that a single 'term' from the user's perspective might be
+ // a string like "http://www.somewebsite.com" which, from our perspective,
+ // is four words: 'http', 'www', 'somewebsite', and 'com'.
+ HistoryIDSet history_id_set;
+ String16Set words = WordSetFromString16(uni_string);
+ bool first_word = true;
+ for (String16Set::iterator iter = words.begin();
+ iter != words.end(); ++iter) {
+ String16Set::value_type uni_word = *iter;
+ HistoryIDSet term_history_id_set =
+ InMemoryURLIndex::HistoryIDsForTerm(uni_word);
+ if (first_word) {
+ history_id_set.swap(term_history_id_set);
+ first_word = false;
+ } else {
+ HistoryIDSet old_history_id_set(history_id_set);
+ history_id_set.clear();
+ std::set_intersection(old_history_id_set.begin(),
+ old_history_id_set.end(),
+ term_history_id_set.begin(),
+ term_history_id_set.end(),
+ std::inserter(history_id_set,
+ history_id_set.begin()));
+ }
+ }
+ return history_id_set;
+}
+
+InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(
+ const string16& uni_word) {
+ InMemoryURLIndex::HistoryIDSet history_id_set;
+
+ // For each character in the word, get the char/word_id map entry and
+ // intersect with the set in an incremental manner.
+ Char16Set uni_chars = CharactersFromString16(uni_word);
+ WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
+
+ // If any words resulted then we can compose a set of history IDs by unioning
+ // the sets from each word.
+ if (!word_id_set.empty()) {
+ for (WordIDSet::iterator word_id_iter = word_id_set.begin();
+ word_id_iter != word_id_set.end(); ++word_id_iter) {
+ WordID word_id = *word_id_iter;
+ WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
+ if (word_iter != word_id_history_map_.end()) {
+ HistoryIDSet& word_history_id_set(word_iter->second);
+ history_id_set.insert(word_history_id_set.begin(),
+ word_history_id_set.end());
+ }
+ }
+ }
+
+ return history_id_set;
+}
+
// Utility Functions
-InMemoryURLIndex::String16Set InMemoryURLIndex::WordsFromString16(
+// static
+InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16(
const string16& uni_string) {
String16Set words;
-
- // TODO(mrossetti): Replace all | and _'s with a space, all % quoted
- // characters with real characters, and break into words, using
- // appropriate string16 functions.
WordIterator iter(&uni_string, WordIterator::BREAK_WORD);
if (iter.Init()) {
while (iter.Advance()) {
- if (iter.IsWord()) {
+ if (iter.IsWord())
words.insert(iter.GetWord());
- }
}
}
return words;
}
+// static
InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16(
const string16& uni_word) {
Char16Set characters;
@@ -165,9 +284,215 @@ void InMemoryURLIndex::AddWordHistory(const string16& uni_word,
}
}
+InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars(
+ InMemoryURLIndex::Char16Set const& uni_chars) {
+ TermCharWordSet* best_term_char_word_set = NULL;
+ bool set_found = false;
+ size_t outstanding_count = 0;
+ Char16Set outstanding_chars;
+ for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
+ iter != term_char_word_set_cache_.end(); ++iter) {
+ TermCharWordSetVector::value_type& term_char_word_set(*iter);
+ Char16Set& char_set(term_char_word_set.char_set_);
+ Char16Set exclusions;
+ std::set_difference(char_set.begin(), char_set.end(),
+ uni_chars.begin(), uni_chars.end(),
+ std::inserter(exclusions, exclusions.begin()));
+ if (exclusions.empty()) {
+ // Do a reverse difference so that we know which characters remain
+ // to be indexed. Then decide if this is a better match than any
+ // previous cached set.
+ std::set_difference(uni_chars.begin(), uni_chars.end(),
+ char_set.begin(), char_set.end(),
+ std::inserter(exclusions, exclusions.begin()));
+ if (!set_found || exclusions.size() < outstanding_count) {
+ outstanding_chars = exclusions;
+ best_term_char_word_set = &*iter;
+ outstanding_count = exclusions.size();
+ set_found = true;
+ }
+ }
+ }
+
+ WordIDSet word_id_set;
+ if (set_found && outstanding_count == 0) {
+ // If there were no oustanding characters then we can use the cached one.
+ best_term_char_word_set->used_ = true;
+ word_id_set = best_term_char_word_set->word_id_set_;
+ } else {
+ // Some or all of the characters remain to be indexed.
+ bool first_character = true;
+ if (set_found) {
+ first_character = false;
+ word_id_set = best_term_char_word_set->word_id_set_;
+ } else {
+ outstanding_chars = uni_chars;
+ }
+
+ for (Char16Set::iterator uni_char_iter = outstanding_chars.begin();
+ uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
+ Char16Set::value_type uni_char = *uni_char_iter;
+ CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
+ if (char_iter == char_word_map_.end()) {
+ // The character was not found so bail.
+ word_id_set.clear();
+ break;
+ }
+ WordIDSet& char_word_id_set(char_iter->second);
+ if (first_character) {
+ word_id_set = char_word_id_set;
+ first_character = false;
+ } else {
+ WordIDSet old_word_id_set(word_id_set);
+ word_id_set.clear();
+ std::set_intersection(old_word_id_set.begin(),
+ old_word_id_set.end(),
+ char_word_id_set.begin(),
+ char_word_id_set.end(),
+ std::inserter(word_id_set,
+ word_id_set.begin()));
+ }
+ }
+ // Update the cache.
+ if (!set_found || outstanding_count) {
+ term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars,
+ word_id_set, true));
+ }
+ }
+ return word_id_set;
+}
+
+// static
+int InMemoryURLIndex::RawScoreForURL(const URLRow& row,
+ const String16Vector& terms,
+ size_t* first_term_location) {
+ GURL gurl = row.url();
+ if (!gurl.is_valid())
+ return 0;
+
+ string16 url = UTF8ToUTF16(gurl.spec());
+
+ // Collect all term start positions so we can see if they appear in order.
+ std::vector<size_t> term_locations;
+ int out_of_order = 0; // Count the terms which are out of order.
+ size_t start_location_total = 0;
+ size_t term_length_total = 0;
+ for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end();
+ ++iter) {
+ string16 term = *iter;
+ size_t term_location = url.find(term);
+ if (term_location == string16::npos)
+ return 0; // A term was not found. This should not happen.
+ if (iter != terms.begin()) {
+ // See if this term is out-of-order.
+ for (std::vector<size_t>::const_iterator order_iter =
+ term_locations.begin(); order_iter != term_locations.end();
+ ++order_iter) {
+ if (term_location <= *order_iter)
+ ++out_of_order;
+ }
+ } else {
+ *first_term_location = term_location;
+ }
+ term_locations.push_back(term_location);
+ start_location_total += term_location;
+ term_length_total += term.size();
+ }
+
+ // Calculate a raw score.
+ // TODO(mrossetti): This is good enough for now but must be fine-tuned.
+ const float kOrderMaxValue = 10.0;
+ float order_value = 10.0;
+ if (terms.size() > 1) {
+ int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2;
+ order_value =
+ (static_cast<float>(max_possible_out_of_order - out_of_order) /
+ max_possible_out_of_order) * kOrderMaxValue;
+ }
+
+ const float kStartMaxValue = 10.0;
+ const size_t kMaxSignificantStart = 20;
+ float start_value =
+ (static_cast<float>(kMaxSignificantStart -
+ std::min(kMaxSignificantStart, start_location_total / terms.size()))) /
+ static_cast<float>(kMaxSignificantStart) * kStartMaxValue;
+
+ const float kCompleteMaxValue = 10.0;
+ float complete_value =
+ (static_cast<float>(term_length_total) / static_cast<float>(url.size())) *
+ kStartMaxValue;
+
+ const float kLastVisitMaxValue = 10.0;
+ const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30);
+ int64 delta_time = (kMaxSignificantDay -
+ std::min((base::Time::Now() - row.last_visit()),
+ kMaxSignificantDay)).ToInternalValue();
+ float last_visit_value =
+ (static_cast<float>(delta_time) /
+ static_cast<float>(kMaxSignificantDay.ToInternalValue())) *
+ kLastVisitMaxValue;
+
+ const float kVisitCountMaxValue = 10.0;
+ const int kMaxSignificantVisits = 10;
+ float visit_count_value =
+ (static_cast<float>(std::min(row.visit_count(),
+ kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) *
+ kVisitCountMaxValue;
+ float raw_score = order_value + start_value + complete_value +
+ last_visit_value + visit_count_value;
+
+ // Normalize the score.
+ const float kMaxNormalizedRawScore = 1000.0;
+ raw_score =
+ (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue +
+ kLastVisitMaxValue + kVisitCountMaxValue)) *
+ kMaxNormalizedRawScore;
+ return static_cast<int>(raw_score);
+}
+
// static
Time InMemoryURLIndex::RecentThreshold() {
return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays);
}
+void InMemoryURLIndex::AddHistoryMatch::operator()(
+ const InMemoryURLIndex::HistoryID history_id) {
+ HistoryInfoMap::const_iterator hist_pos =
+ index_.history_info_map_.find(history_id);
+ if (hist_pos != index_.history_info_map_.end()) {
+ const URLRow& hist_item = hist_pos->second;
+ // TODO(mrossetti): Accommodate multiple term highlighting.
+ size_t input_location = 0;
+ int score = InMemoryURLIndex::RawScoreForURL(
+ hist_item, lower_terms_, &input_location);
+ if (score != 0) {
+ // We only retain the top 10 highest scoring results so
+ // see if this one fits into the top 10 and, if so, where.
+ ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin();
+ while (scored_iter != scored_matches_.end() &&
+ (*scored_iter).raw_score > score)
+ ++scored_iter;
+ if ((scored_matches_.size() < 10) ||
+ (scored_iter != scored_matches_.end())) {
+ // Create and insert the new item.
+ // TODO(mrossetti): Properly set |match_in_scheme| and
+ // |innermost_match|.
+ bool match_in_scheme = false;
+ bool innermost_match = true;
+ ScoredHistoryMatch match(hist_item, input_location,
+ match_in_scheme, innermost_match, score);
+ if (!scored_matches_.empty())
+ scored_matches_.insert(scored_iter, match);
+ else
+ scored_matches_.push_back(match);
+ // Trim any entries beyond 10.
+ if (scored_matches_.size() > 10) {
+ scored_matches_.erase(scored_matches_.begin() + 10,
+ scored_matches_.end());
+ }
+ }
+ }
+ }
+}
+
} // namespace history
diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h
index 5f7ee3a..b5b3373 100644
--- a/chrome/browser/history/in_memory_url_index.h
+++ b/chrome/browser/history/in_memory_url_index.h
@@ -6,6 +6,7 @@
#define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
#pragma once
+#include <functional>
#include <map>
#include <set>
#include <string>
@@ -28,6 +29,23 @@ namespace history {
class URLDatabase;
+// Used for intermediate history result operations.
+struct ScoredHistoryMatch : public HistoryMatch {
+ // Required for STL, we don't use this directly.
+ ScoredHistoryMatch();
+
+ ScoredHistoryMatch(const URLRow& url_info,
+ size_t input_location,
+ bool match_in_scheme,
+ bool innermost_match,
+ int score);
+
+ // An interim score taking into consideration location and completeness
+ // of the match.
+ int raw_score;
+};
+typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches;
+
// The URL history source.
// Holds portions of the URL database in memory in an indexed form. Used to
// quickly look up matching URLs for a given query string. Used by
@@ -65,16 +83,18 @@ class InMemoryURLIndex {
// history index and return a vector with all scored, matching history items.
// Each term must occur somewhere in the history item for the item to
// qualify; however, the terms do not necessarily have to be adjacent.
- HistoryMatches HistoryItemsForTerms(const String16Vector& terms);
+ // Results are sorted with higher scoring items first.
+ ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms);
// Returns the date threshold for considering an history item as significant.
static base::Time RecentThreshold();
private:
+ friend class AddHistoryMatch;
friend class InMemoryURLIndexTest;
FRIEND_TEST(InMemoryURLIndexTest, Initialization);
- // Convenience types
+ // Convenience types.
typedef std::set<string16> String16Set;
typedef std::set<char16> Char16Set;
@@ -103,6 +123,8 @@ class InMemoryURLIndex {
word_id_set_(word_id_set),
used_(used) {}
+ bool IsNotUsed() const { return !used_; }
+
Char16Set char_set_;
WordIDSet word_id_set_;
bool used_; // true if this set has been used for the current term search.
@@ -115,8 +137,33 @@ class InMemoryURLIndex {
// A map from history_id to the history's URL and title.
typedef std::map<HistoryID, URLRow> HistoryInfoMap;
+ // A helper class which performs the final filter on each candidate
+ // history URL match, inserting accepted matches into |scored_matches_|
+ // and trimming the maximum number of matches to 10.
+ class AddHistoryMatch : std::unary_function<HistoryID, void> {
+ public:
+ AddHistoryMatch(const InMemoryURLIndex& index,
+ const String16Vector& lower_terms)
+ : index_(index),
+ lower_terms_(lower_terms) {}
+
+ void operator()(const HistoryID history_id);
+
+ ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
+
+ private:
+ const InMemoryURLIndex& index_;
+ ScoredHistoryMatches scored_matches_;
+ const String16Vector& lower_terms_;
+ };
+
// Break a string down into individual words.
- String16Set WordsFromString16(const string16& uni_string);
+ static String16Set WordSetFromString16(const string16& uni_string);
+
+ // Given a set of Char16s, find words containing those characters. If any
+ // existing, cached set is a proper subset then start with that cached
+ // set. Update the cache.
+ WordIDSet WordIDSetForTermChars(const Char16Set& uni_chars);
// URL History indexing support functions.
@@ -145,6 +192,33 @@ class InMemoryURLIndex {
// |history_id| as the initial element of the word's set.
void AddWordHistory(const string16& uni_word, HistoryID history_id);
+ // Clear the search term cache. This cache holds on to the intermediate
+ // word results for each previously typed character to eliminate the need
+ // to re-perform set operations for previously typed characters.
+ void ResetTermCharWordSetCache();
+
+ // Compose a set of history item IDs by intersecting the set for each word
+ // in |uni_string|.
+ HistoryIDSet HistoryIDSetFromWords(const string16& uni_string);
+
+ // Helper function to HistoryIDSetFromWords which composes a set of history
+ // ids for the given term given in |uni_word|.
+ HistoryIDSet HistoryIDsForTerm(const string16& uni_word);
+
+ // Calculate a raw score for this history item by first determining
+ // if all of the terms in |terms_vector| occur in |row| and, if so,
+ // calculating a raw score based on 1) starting position of each term
+ // in the user input, 2) completeness of each term's match, 3) ordering
+ // of the occurrence of each term (i.e. they appear in order), 4) last
+ // visit time, and 5) number of visits.
+ // This raw score allows the results to be ordered and can be used
+ // to influence the final score calculated by the client of this
+ // index. Return the starting location of the first term in
+ // |first_term_location|.
+ static int RawScoreForURL(const URLRow& row,
+ const String16Vector& terms_vector,
+ size_t* first_term_location);
+
// A list of all of indexed words. The index of a word in this list is the
// ID of the word in the word_map_. It reduces the memory overhead by
// replacing a potentially long and repeated string with a simple index.
diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc
index f399496..9306ec5 100644
--- a/chrome/browser/history/in_memory_url_index_unittest.cc
+++ b/chrome/browser/history/in_memory_url_index_unittest.cc
@@ -109,6 +109,7 @@ TEST_F(InMemoryURLIndexTest, Construction) {
EXPECT_TRUE(url_index_.get());
}
+// TODO(mrossetti): Write python script to calculate the validation criteria.
TEST_F(InMemoryURLIndexTest, Initialization) {
// Verify that the database contains the expected number of items, which
// is the pre-filtered count, i.e. all of the items.
@@ -116,21 +117,68 @@ TEST_F(InMemoryURLIndexTest, Initialization) {
EXPECT_TRUE(statement);
uint64 row_count = 0;
while (statement.Step()) ++row_count;
- EXPECT_EQ(29U, row_count);
+ EXPECT_EQ(33U, row_count);
url_index_.reset(new InMemoryURLIndex);
url_index_->Init(this, "en,ja,hi,zh");
- // There should have been 25 of the 29 urls accepted during filtering.
- EXPECT_EQ(25U, url_index_->history_item_count_);
+ // There should have been 27 of the 31 urls accepted during filtering.
+ EXPECT_EQ(28U, url_index_->history_item_count_);
// history_info_map_ should have the same number of items as were filtered.
- EXPECT_EQ(25U, url_index_->history_info_map_.size());
+ EXPECT_EQ(28U, url_index_->history_info_map_.size());
// The resulting indexes should account for:
// 37 characters
// 88 words
EXPECT_EQ(37U, url_index_->char_word_map_.size());
- EXPECT_EQ(88U, url_index_->word_map_.size());
+ EXPECT_EQ(91U, url_index_->word_map_.size());
+}
+
+TEST_F(InMemoryURLIndexTest, Retrieval) {
+ url_index_.reset(new InMemoryURLIndex);
+ std::string languages("en,ja,hi,zh");
+ url_index_->Init(this, languages);
+ InMemoryURLIndex::String16Vector terms;
+ // The term will be lowercased by the search.
+
+ // See if a very specific term gives a single result.
+ string16 term = ASCIIToUTF16("DrudgeReport");
+ terms.push_back(term);
+ ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(terms);
+ EXPECT_EQ(1U, matches.size());
+
+ // Search which should result in multiple results.
+ terms.clear();
+ term = ASCIIToUTF16("drudge");
+ terms.push_back(term);
+ matches = url_index_->HistoryItemsForTerms(terms);
+ ASSERT_EQ(2U, matches.size());
+ // The results should be in descending score order.
+ EXPECT_GT(matches[0].raw_score, matches[1].raw_score);
+
+ // Search which should result in nearly perfect result.
+ terms.clear();
+ term = ASCIIToUTF16("http");
+ terms.push_back(term);
+ term = ASCIIToUTF16("NearlyPerfectResult");
+ terms.push_back(term);
+ matches = url_index_->HistoryItemsForTerms(terms);
+ ASSERT_EQ(1U, matches.size());
+ // The results should have a very high score.
+ EXPECT_GT(matches[0].raw_score, 900);
+
+ // Search which should result in very poor result.
+ terms.clear();
+ term = ASCIIToUTF16("z");
+ terms.push_back(term);
+ term = ASCIIToUTF16("y");
+ terms.push_back(term);
+ term = ASCIIToUTF16("x");
+ terms.push_back(term);
+ matches = url_index_->HistoryItemsForTerms(terms);
+ ASSERT_EQ(1U, matches.size());
+ // The results should have a poor score.
+ EXPECT_LT(matches[0].raw_score, 200);
}
} // namespace history