diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-09 17:24:30 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-09 17:24:30 +0000 |
commit | 422a0b90cbcd884221d3fabac3507dd964e5a064 (patch) | |
tree | 43b636c7e063afe2ab3152c724de4688bbb28fc1 /chrome/browser/history | |
parent | 829a0c65bbded9a3262d5164e612ebdbc961d3c5 (diff) | |
download | chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.zip chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.gz chromium_src-422a0b90cbcd884221d3fabac3507dd964e5a064.tar.bz2 |
Step 3 integrating the HistoryQuickProvider: Implement searching and production of search results in the InMemoryURLIndex. Unit test the production of results. Minor change in URLHistoryProvuder unit test setup.
BUG=None
TEST=None (unit tests added)
Review URL: http://codereview.chromium.org/3364004
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@58955 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history')
-rw-r--r-- | chrome/browser/history/in_memory_url_index.cc | 347 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.h | 80 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index_unittest.cc | 58 |
3 files changed, 466 insertions, 19 deletions
diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc index 5715d39..95afa57 100644 --- a/chrome/browser/history/in_memory_url_index.cc +++ b/chrome/browser/history/in_memory_url_index.cc @@ -12,6 +12,7 @@ #include "base/string_util.h" #include "base/time.h" #include "base/utf_string_conversions.h" +#include "chrome/browser/autocomplete/history_provider_util.h" #include "chrome/browser/history/url_database.h" #include "net/base/escape.h" #include "net/base/net_util.h" @@ -21,10 +22,23 @@ using base::TimeDelta; namespace history { +ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} + +ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, + size_t input_location, + bool match_in_scheme, + bool innermost_match, + int score) + : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), + raw_score(score) { +} + InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} InMemoryURLIndex::~InMemoryURLIndex() {} +static const int32_t kURLToLowerBufferSize = 2048; + // Indexing bool InMemoryURLIndex::Init(history::URLDatabase* history_db, @@ -61,9 +75,6 @@ bool InMemoryURLIndex::IndexRow(URLRow row) { UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, NULL, NULL, NULL)); - // TODO(mrossetti): Find or implement a ConvertPercentEncoding and use it - // on the url. - // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). HistoryID history_id = static_cast<HistoryID>(row.id()); @@ -77,7 +88,7 @@ bool InMemoryURLIndex::IndexRow(URLRow row) { history_info_map_.insert(std::make_pair(history_id, new_row)); // Split into individual, unique words. - String16Set words = WordsFromString16(url); + String16Set words = WordSetFromString16(url); // For each word, add a new entry into the word index referring to the // associated history item. @@ -90,26 +101,134 @@ bool InMemoryURLIndex::IndexRow(URLRow row) { return true; } +// Searching + +ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( + const String16Vector& terms) { + ScoredHistoryMatches scored_items; + if (!terms.empty()) { + // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- + // and-sweep approach. + ResetTermCharWordSetCache(); + + // Lowercase the terms. + // TODO(mrossetti): Another opportunity for a transform algorithm. + String16Vector lower_terms; + for (String16Vector::const_iterator term_iter = terms.begin(); + term_iter != terms.end(); ++term_iter) { + String16Vector::value_type lower_string(*term_iter); + std::transform(lower_string.begin(), + lower_string.end(), + lower_string.begin(), + tolower); + lower_terms.push_back(lower_string); + } + + String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); + HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); + + // Pass over all of the candidates filtering out any without a proper + // substring match, inserting those which pass in order by score. + scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), + AddHistoryMatch(*this, + lower_terms)).ScoredMatches(); + } + + // Remove any stale TermCharWordSet's. + term_char_word_set_cache_.erase( + std::remove_if(term_char_word_set_cache_.begin(), + term_char_word_set_cache_.end(), + std::mem_fun_ref(&TermCharWordSet::IsNotUsed)), + term_char_word_set_cache_.end()); + return scored_items; +} + +void InMemoryURLIndex::ResetTermCharWordSetCache() { + // TODO(mrossetti): Consider keeping more of the cache around for possible + // repeat searches, except a 'shortcuts' approach might be better for that. + // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. + for (TermCharWordSetVector::iterator iter = + term_char_word_set_cache_.begin(); + iter != term_char_word_set_cache_.end(); ++iter) + iter->used_ = false; +} + +InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( + const string16& uni_string) { + // Break the terms down into individual terms (words), get the candidate + // set for each term, and intersect each to get a final candidate list. + // Note that a single 'term' from the user's perspective might be + // a string like "http://www.somewebsite.com" which, from our perspective, + // is four words: 'http', 'www', 'somewebsite', and 'com'. + HistoryIDSet history_id_set; + String16Set words = WordSetFromString16(uni_string); + bool first_word = true; + for (String16Set::iterator iter = words.begin(); + iter != words.end(); ++iter) { + String16Set::value_type uni_word = *iter; + HistoryIDSet term_history_id_set = + InMemoryURLIndex::HistoryIDsForTerm(uni_word); + if (first_word) { + history_id_set.swap(term_history_id_set); + first_word = false; + } else { + HistoryIDSet old_history_id_set(history_id_set); + history_id_set.clear(); + std::set_intersection(old_history_id_set.begin(), + old_history_id_set.end(), + term_history_id_set.begin(), + term_history_id_set.end(), + std::inserter(history_id_set, + history_id_set.begin())); + } + } + return history_id_set; +} + +InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( + const string16& uni_word) { + InMemoryURLIndex::HistoryIDSet history_id_set; + + // For each character in the word, get the char/word_id map entry and + // intersect with the set in an incremental manner. + Char16Set uni_chars = CharactersFromString16(uni_word); + WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); + + // If any words resulted then we can compose a set of history IDs by unioning + // the sets from each word. + if (!word_id_set.empty()) { + for (WordIDSet::iterator word_id_iter = word_id_set.begin(); + word_id_iter != word_id_set.end(); ++word_id_iter) { + WordID word_id = *word_id_iter; + WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); + if (word_iter != word_id_history_map_.end()) { + HistoryIDSet& word_history_id_set(word_iter->second); + history_id_set.insert(word_history_id_set.begin(), + word_history_id_set.end()); + } + } + } + + return history_id_set; +} + // Utility Functions -InMemoryURLIndex::String16Set InMemoryURLIndex::WordsFromString16( +// static +InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( const string16& uni_string) { String16Set words; - - // TODO(mrossetti): Replace all | and _'s with a space, all % quoted - // characters with real characters, and break into words, using - // appropriate string16 functions. WordIterator iter(&uni_string, WordIterator::BREAK_WORD); if (iter.Init()) { while (iter.Advance()) { - if (iter.IsWord()) { + if (iter.IsWord()) words.insert(iter.GetWord()); - } } } return words; } +// static InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( const string16& uni_word) { Char16Set characters; @@ -165,9 +284,215 @@ void InMemoryURLIndex::AddWordHistory(const string16& uni_word, } } +InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( + InMemoryURLIndex::Char16Set const& uni_chars) { + TermCharWordSet* best_term_char_word_set = NULL; + bool set_found = false; + size_t outstanding_count = 0; + Char16Set outstanding_chars; + for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); + iter != term_char_word_set_cache_.end(); ++iter) { + TermCharWordSetVector::value_type& term_char_word_set(*iter); + Char16Set& char_set(term_char_word_set.char_set_); + Char16Set exclusions; + std::set_difference(char_set.begin(), char_set.end(), + uni_chars.begin(), uni_chars.end(), + std::inserter(exclusions, exclusions.begin())); + if (exclusions.empty()) { + // Do a reverse difference so that we know which characters remain + // to be indexed. Then decide if this is a better match than any + // previous cached set. + std::set_difference(uni_chars.begin(), uni_chars.end(), + char_set.begin(), char_set.end(), + std::inserter(exclusions, exclusions.begin())); + if (!set_found || exclusions.size() < outstanding_count) { + outstanding_chars = exclusions; + best_term_char_word_set = &*iter; + outstanding_count = exclusions.size(); + set_found = true; + } + } + } + + WordIDSet word_id_set; + if (set_found && outstanding_count == 0) { + // If there were no oustanding characters then we can use the cached one. + best_term_char_word_set->used_ = true; + word_id_set = best_term_char_word_set->word_id_set_; + } else { + // Some or all of the characters remain to be indexed. + bool first_character = true; + if (set_found) { + first_character = false; + word_id_set = best_term_char_word_set->word_id_set_; + } else { + outstanding_chars = uni_chars; + } + + for (Char16Set::iterator uni_char_iter = outstanding_chars.begin(); + uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { + Char16Set::value_type uni_char = *uni_char_iter; + CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); + if (char_iter == char_word_map_.end()) { + // The character was not found so bail. + word_id_set.clear(); + break; + } + WordIDSet& char_word_id_set(char_iter->second); + if (first_character) { + word_id_set = char_word_id_set; + first_character = false; + } else { + WordIDSet old_word_id_set(word_id_set); + word_id_set.clear(); + std::set_intersection(old_word_id_set.begin(), + old_word_id_set.end(), + char_word_id_set.begin(), + char_word_id_set.end(), + std::inserter(word_id_set, + word_id_set.begin())); + } + } + // Update the cache. + if (!set_found || outstanding_count) { + term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars, + word_id_set, true)); + } + } + return word_id_set; +} + +// static +int InMemoryURLIndex::RawScoreForURL(const URLRow& row, + const String16Vector& terms, + size_t* first_term_location) { + GURL gurl = row.url(); + if (!gurl.is_valid()) + return 0; + + string16 url = UTF8ToUTF16(gurl.spec()); + + // Collect all term start positions so we can see if they appear in order. + std::vector<size_t> term_locations; + int out_of_order = 0; // Count the terms which are out of order. + size_t start_location_total = 0; + size_t term_length_total = 0; + for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); + ++iter) { + string16 term = *iter; + size_t term_location = url.find(term); + if (term_location == string16::npos) + return 0; // A term was not found. This should not happen. + if (iter != terms.begin()) { + // See if this term is out-of-order. + for (std::vector<size_t>::const_iterator order_iter = + term_locations.begin(); order_iter != term_locations.end(); + ++order_iter) { + if (term_location <= *order_iter) + ++out_of_order; + } + } else { + *first_term_location = term_location; + } + term_locations.push_back(term_location); + start_location_total += term_location; + term_length_total += term.size(); + } + + // Calculate a raw score. + // TODO(mrossetti): This is good enough for now but must be fine-tuned. + const float kOrderMaxValue = 10.0; + float order_value = 10.0; + if (terms.size() > 1) { + int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; + order_value = + (static_cast<float>(max_possible_out_of_order - out_of_order) / + max_possible_out_of_order) * kOrderMaxValue; + } + + const float kStartMaxValue = 10.0; + const size_t kMaxSignificantStart = 20; + float start_value = + (static_cast<float>(kMaxSignificantStart - + std::min(kMaxSignificantStart, start_location_total / terms.size()))) / + static_cast<float>(kMaxSignificantStart) * kStartMaxValue; + + const float kCompleteMaxValue = 10.0; + float complete_value = + (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * + kStartMaxValue; + + const float kLastVisitMaxValue = 10.0; + const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); + int64 delta_time = (kMaxSignificantDay - + std::min((base::Time::Now() - row.last_visit()), + kMaxSignificantDay)).ToInternalValue(); + float last_visit_value = + (static_cast<float>(delta_time) / + static_cast<float>(kMaxSignificantDay.ToInternalValue())) * + kLastVisitMaxValue; + + const float kVisitCountMaxValue = 10.0; + const int kMaxSignificantVisits = 10; + float visit_count_value = + (static_cast<float>(std::min(row.visit_count(), + kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * + kVisitCountMaxValue; + float raw_score = order_value + start_value + complete_value + + last_visit_value + visit_count_value; + + // Normalize the score. + const float kMaxNormalizedRawScore = 1000.0; + raw_score = + (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + + kLastVisitMaxValue + kVisitCountMaxValue)) * + kMaxNormalizedRawScore; + return static_cast<int>(raw_score); +} + // static Time InMemoryURLIndex::RecentThreshold() { return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); } +void InMemoryURLIndex::AddHistoryMatch::operator()( + const InMemoryURLIndex::HistoryID history_id) { + HistoryInfoMap::const_iterator hist_pos = + index_.history_info_map_.find(history_id); + if (hist_pos != index_.history_info_map_.end()) { + const URLRow& hist_item = hist_pos->second; + // TODO(mrossetti): Accommodate multiple term highlighting. + size_t input_location = 0; + int score = InMemoryURLIndex::RawScoreForURL( + hist_item, lower_terms_, &input_location); + if (score != 0) { + // We only retain the top 10 highest scoring results so + // see if this one fits into the top 10 and, if so, where. + ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); + while (scored_iter != scored_matches_.end() && + (*scored_iter).raw_score > score) + ++scored_iter; + if ((scored_matches_.size() < 10) || + (scored_iter != scored_matches_.end())) { + // Create and insert the new item. + // TODO(mrossetti): Properly set |match_in_scheme| and + // |innermost_match|. + bool match_in_scheme = false; + bool innermost_match = true; + ScoredHistoryMatch match(hist_item, input_location, + match_in_scheme, innermost_match, score); + if (!scored_matches_.empty()) + scored_matches_.insert(scored_iter, match); + else + scored_matches_.push_back(match); + // Trim any entries beyond 10. + if (scored_matches_.size() > 10) { + scored_matches_.erase(scored_matches_.begin() + 10, + scored_matches_.end()); + } + } + } + } +} + } // namespace history diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h index 5f7ee3a..b5b3373 100644 --- a/chrome/browser/history/in_memory_url_index.h +++ b/chrome/browser/history/in_memory_url_index.h @@ -6,6 +6,7 @@ #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ #pragma once +#include <functional> #include <map> #include <set> #include <string> @@ -28,6 +29,23 @@ namespace history { class URLDatabase; +// Used for intermediate history result operations. +struct ScoredHistoryMatch : public HistoryMatch { + // Required for STL, we don't use this directly. + ScoredHistoryMatch(); + + ScoredHistoryMatch(const URLRow& url_info, + size_t input_location, + bool match_in_scheme, + bool innermost_match, + int score); + + // An interim score taking into consideration location and completeness + // of the match. + int raw_score; +}; +typedef std::vector<ScoredHistoryMatch> ScoredHistoryMatches; + // The URL history source. // Holds portions of the URL database in memory in an indexed form. Used to // quickly look up matching URLs for a given query string. Used by @@ -65,16 +83,18 @@ class InMemoryURLIndex { // history index and return a vector with all scored, matching history items. // Each term must occur somewhere in the history item for the item to // qualify; however, the terms do not necessarily have to be adjacent. - HistoryMatches HistoryItemsForTerms(const String16Vector& terms); + // Results are sorted with higher scoring items first. + ScoredHistoryMatches HistoryItemsForTerms(const String16Vector& terms); // Returns the date threshold for considering an history item as significant. static base::Time RecentThreshold(); private: + friend class AddHistoryMatch; friend class InMemoryURLIndexTest; FRIEND_TEST(InMemoryURLIndexTest, Initialization); - // Convenience types + // Convenience types. typedef std::set<string16> String16Set; typedef std::set<char16> Char16Set; @@ -103,6 +123,8 @@ class InMemoryURLIndex { word_id_set_(word_id_set), used_(used) {} + bool IsNotUsed() const { return !used_; } + Char16Set char_set_; WordIDSet word_id_set_; bool used_; // true if this set has been used for the current term search. @@ -115,8 +137,33 @@ class InMemoryURLIndex { // A map from history_id to the history's URL and title. typedef std::map<HistoryID, URLRow> HistoryInfoMap; + // A helper class which performs the final filter on each candidate + // history URL match, inserting accepted matches into |scored_matches_| + // and trimming the maximum number of matches to 10. + class AddHistoryMatch : std::unary_function<HistoryID, void> { + public: + AddHistoryMatch(const InMemoryURLIndex& index, + const String16Vector& lower_terms) + : index_(index), + lower_terms_(lower_terms) {} + + void operator()(const HistoryID history_id); + + ScoredHistoryMatches ScoredMatches() const { return scored_matches_; } + + private: + const InMemoryURLIndex& index_; + ScoredHistoryMatches scored_matches_; + const String16Vector& lower_terms_; + }; + // Break a string down into individual words. - String16Set WordsFromString16(const string16& uni_string); + static String16Set WordSetFromString16(const string16& uni_string); + + // Given a set of Char16s, find words containing those characters. If any + // existing, cached set is a proper subset then start with that cached + // set. Update the cache. + WordIDSet WordIDSetForTermChars(const Char16Set& uni_chars); // URL History indexing support functions. @@ -145,6 +192,33 @@ class InMemoryURLIndex { // |history_id| as the initial element of the word's set. void AddWordHistory(const string16& uni_word, HistoryID history_id); + // Clear the search term cache. This cache holds on to the intermediate + // word results for each previously typed character to eliminate the need + // to re-perform set operations for previously typed characters. + void ResetTermCharWordSetCache(); + + // Compose a set of history item IDs by intersecting the set for each word + // in |uni_string|. + HistoryIDSet HistoryIDSetFromWords(const string16& uni_string); + + // Helper function to HistoryIDSetFromWords which composes a set of history + // ids for the given term given in |uni_word|. + HistoryIDSet HistoryIDsForTerm(const string16& uni_word); + + // Calculate a raw score for this history item by first determining + // if all of the terms in |terms_vector| occur in |row| and, if so, + // calculating a raw score based on 1) starting position of each term + // in the user input, 2) completeness of each term's match, 3) ordering + // of the occurrence of each term (i.e. they appear in order), 4) last + // visit time, and 5) number of visits. + // This raw score allows the results to be ordered and can be used + // to influence the final score calculated by the client of this + // index. Return the starting location of the first term in + // |first_term_location|. + static int RawScoreForURL(const URLRow& row, + const String16Vector& terms_vector, + size_t* first_term_location); + // A list of all of indexed words. The index of a word in this list is the // ID of the word in the word_map_. It reduces the memory overhead by // replacing a potentially long and repeated string with a simple index. diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc index f399496..9306ec5 100644 --- a/chrome/browser/history/in_memory_url_index_unittest.cc +++ b/chrome/browser/history/in_memory_url_index_unittest.cc @@ -109,6 +109,7 @@ TEST_F(InMemoryURLIndexTest, Construction) { EXPECT_TRUE(url_index_.get()); } +// TODO(mrossetti): Write python script to calculate the validation criteria. TEST_F(InMemoryURLIndexTest, Initialization) { // Verify that the database contains the expected number of items, which // is the pre-filtered count, i.e. all of the items. @@ -116,21 +117,68 @@ TEST_F(InMemoryURLIndexTest, Initialization) { EXPECT_TRUE(statement); uint64 row_count = 0; while (statement.Step()) ++row_count; - EXPECT_EQ(29U, row_count); + EXPECT_EQ(33U, row_count); url_index_.reset(new InMemoryURLIndex); url_index_->Init(this, "en,ja,hi,zh"); - // There should have been 25 of the 29 urls accepted during filtering. - EXPECT_EQ(25U, url_index_->history_item_count_); + // There should have been 27 of the 31 urls accepted during filtering. + EXPECT_EQ(28U, url_index_->history_item_count_); // history_info_map_ should have the same number of items as were filtered. - EXPECT_EQ(25U, url_index_->history_info_map_.size()); + EXPECT_EQ(28U, url_index_->history_info_map_.size()); // The resulting indexes should account for: // 37 characters // 88 words EXPECT_EQ(37U, url_index_->char_word_map_.size()); - EXPECT_EQ(88U, url_index_->word_map_.size()); + EXPECT_EQ(91U, url_index_->word_map_.size()); +} + +TEST_F(InMemoryURLIndexTest, Retrieval) { + url_index_.reset(new InMemoryURLIndex); + std::string languages("en,ja,hi,zh"); + url_index_->Init(this, languages); + InMemoryURLIndex::String16Vector terms; + // The term will be lowercased by the search. + + // See if a very specific term gives a single result. + string16 term = ASCIIToUTF16("DrudgeReport"); + terms.push_back(term); + ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(terms); + EXPECT_EQ(1U, matches.size()); + + // Search which should result in multiple results. + terms.clear(); + term = ASCIIToUTF16("drudge"); + terms.push_back(term); + matches = url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(2U, matches.size()); + // The results should be in descending score order. + EXPECT_GT(matches[0].raw_score, matches[1].raw_score); + + // Search which should result in nearly perfect result. + terms.clear(); + term = ASCIIToUTF16("http"); + terms.push_back(term); + term = ASCIIToUTF16("NearlyPerfectResult"); + terms.push_back(term); + matches = url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(1U, matches.size()); + // The results should have a very high score. + EXPECT_GT(matches[0].raw_score, 900); + + // Search which should result in very poor result. + terms.clear(); + term = ASCIIToUTF16("z"); + terms.push_back(term); + term = ASCIIToUTF16("y"); + terms.push_back(term); + term = ASCIIToUTF16("x"); + terms.push_back(term); + matches = url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(1U, matches.size()); + // The results should have a poor score. + EXPECT_LT(matches[0].raw_score, 200); } } // namespace history |