diff options
6 files changed, 519 insertions, 404 deletions
diff --git a/chrome/browser/autocomplete/history_quick_provider.cc b/chrome/browser/autocomplete/history_quick_provider.cc index d678ae45..57c9d11 100644 --- a/chrome/browser/autocomplete/history_quick_provider.cc +++ b/chrome/browser/autocomplete/history_quick_provider.cc @@ -93,10 +93,6 @@ void HistoryQuickProvider::DeleteMatch(const AutocompleteMatch& match) {} void HistoryQuickProvider::DoAutocomplete() { // Get the matching URLs from the DB. string16 term_string = autocomplete_input_.text(); - // TODO(mrossetti): Temporary workaround for http://crbug.com/88498. - // Just give up after 50 characters. - if (term_string.size() > 50) - return; term_string = UnescapeURLComponent(term_string, UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); history::InMemoryURLIndex::String16Vector terms( diff --git a/chrome/browser/autocomplete/history_quick_provider_unittest.cc b/chrome/browser/autocomplete/history_quick_provider_unittest.cc index 6143481..7a8be3c 100644 --- a/chrome/browser/autocomplete/history_quick_provider_unittest.cc +++ b/chrome/browser/autocomplete/history_quick_provider_unittest.cc @@ -69,7 +69,7 @@ struct TestURLInfo { {"http://abcxyzdefghijklmnopqrstuvw.com/a", "", 3, 1, 0}, {"http://xyzabcdefghijklmnopqrstuvw.com/a", "", 3, 1, 0}, {"http://cda.com/Dogs%20Cats%20Gorillas%20Sea%20Slugs%20and%20Mice", - "Dogs & Cats & Mice", 1, 1, 0}, + "Dogs & Cats & Mice & Other Animals", 1, 1, 0}, }; class HistoryQuickProviderTest : public TestingBrowserProcessTest, @@ -83,16 +83,20 @@ class HistoryQuickProviderTest : public TestingBrowserProcessTest, virtual void OnProviderUpdate(bool updated_matches); protected: - void SetUp() { - profile_.reset(new TestingProfile()); - profile_->CreateHistoryService(true, false); - profile_->CreateBookmarkModel(true); - profile_->BlockUntilBookmarkModelLoaded(); - history_service_ = profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); - EXPECT_TRUE(history_service_); - provider_ = new HistoryQuickProvider(this, profile_.get()); - FillData(); - } + class SetShouldContain : public std::unary_function<const std::string&, + std::set<std::string> > { + public: + explicit SetShouldContain(const ACMatches& matched_urls); + + void operator()(const std::string& expected); + + std::set<std::string> LeftOvers() const { return matches_; } + + private: + std::set<std::string> matches_; + }; + + void SetUp(); void TearDown() { provider_ = NULL; @@ -121,6 +125,17 @@ class HistoryQuickProviderTest : public TestingBrowserProcessTest, scoped_refptr<HistoryQuickProvider> provider_; }; +void HistoryQuickProviderTest::SetUp() { + profile_.reset(new TestingProfile()); + profile_->CreateHistoryService(true, false); + profile_->CreateBookmarkModel(true); + profile_->BlockUntilBookmarkModelLoaded(); + history_service_ = profile_->GetHistoryService(Profile::EXPLICIT_ACCESS); + EXPECT_TRUE(history_service_); + provider_ = new HistoryQuickProvider(this, profile_.get()); + FillData(); +} + void HistoryQuickProviderTest::OnProviderUpdate(bool updated_matches) { MessageLoop::current()->Quit(); } @@ -155,29 +170,24 @@ void HistoryQuickProviderTest::FillData() { provider_->SetIndexForTesting(index); } -class SetShouldContain : public std::unary_function<const std::string&, - std::set<std::string> > { - public: - explicit SetShouldContain(const ACMatches& matched_urls) { - for (ACMatches::const_iterator iter = matched_urls.begin(); - iter != matched_urls.end(); ++iter) - matches_.insert(iter->destination_url.spec()); - } - - void operator()(const std::string& expected) { - EXPECT_EQ(1U, matches_.erase(expected)) << "Results did not contain '" - << expected << "' but should have."; - } +HistoryQuickProviderTest::SetShouldContain::SetShouldContain( + const ACMatches& matched_urls) { + for (ACMatches::const_iterator iter = matched_urls.begin(); + iter != matched_urls.end(); ++iter) + matches_.insert(iter->destination_url.spec()); +} - std::set<std::string> LeftOvers() const { return matches_; } +void HistoryQuickProviderTest::SetShouldContain::operator()( + const std::string& expected) { + EXPECT_EQ(1U, matches_.erase(expected)) + << "Results did not contain '" << expected << "' but should have."; +} - private: - std::set<std::string> matches_; -}; void HistoryQuickProviderTest::RunTest(const string16 text, std::vector<std::string> expected_urls, std::string expected_top_result) { + SCOPED_TRACE(text); // Minimal hint to query being run. std::sort(expected_urls.begin(), expected_urls.end()); MessageLoop::current()->RunAllPending(); @@ -215,23 +225,28 @@ void HistoryQuickProviderTest::RunTest(const string16 text, } TEST_F(HistoryQuickProviderTest, SimpleSingleMatch) { - string16 text(ASCIIToUTF16("slashdot")); std::string expected_url("http://slashdot.org/favorite_page.html"); std::vector<std::string> expected_urls; expected_urls.push_back(expected_url); - RunTest(text, expected_urls, expected_url); + RunTest(ASCIIToUTF16("slashdot"), expected_urls, expected_url); +} + +TEST_F(HistoryQuickProviderTest, MultiTermTitleMatch) { + std::string expected_url( + "http://cda.com/Dogs%20Cats%20Gorillas%20Sea%20Slugs%20and%20Mice"); + std::vector<std::string> expected_urls; + expected_urls.push_back(expected_url); + RunTest(ASCIIToUTF16("mice other animals"), expected_urls, expected_url); } TEST_F(HistoryQuickProviderTest, NonWordLastCharacterMatch) { - string16 text(ASCIIToUTF16("slashdot.org/")); std::string expected_url("http://slashdot.org/favorite_page.html"); std::vector<std::string> expected_urls; expected_urls.push_back(expected_url); - RunTest(text, expected_urls, expected_url); + RunTest(ASCIIToUTF16("slashdot.org/"), expected_urls, expected_url); } TEST_F(HistoryQuickProviderTest, MultiMatch) { - string16 text(ASCIIToUTF16("foo")); std::vector<std::string> expected_urls; // Scores high because of typed_count. expected_urls.push_back("http://foo.com/"); @@ -239,52 +254,51 @@ TEST_F(HistoryQuickProviderTest, MultiMatch) { expected_urls.push_back("http://foo.com/dir/another/"); // Scores high because of high visit count. expected_urls.push_back("http://foo.com/dir/another/again/myfile.html"); - RunTest(text, expected_urls, "http://foo.com/"); + RunTest(ASCIIToUTF16("foo"), expected_urls, "http://foo.com/"); } TEST_F(HistoryQuickProviderTest, StartRelativeMatch) { - string16 text(ASCIIToUTF16("xyz")); std::vector<std::string> expected_urls; expected_urls.push_back("http://xyzabcdefghijklmnopqrstuvw.com/a"); expected_urls.push_back("http://abcxyzdefghijklmnopqrstuvw.com/a"); expected_urls.push_back("http://abcdefxyzghijklmnopqrstuvw.com/a"); - RunTest(text, expected_urls, "http://xyzabcdefghijklmnopqrstuvw.com/a"); + RunTest(ASCIIToUTF16("xyz"), expected_urls, + "http://xyzabcdefghijklmnopqrstuvw.com/a"); } TEST_F(HistoryQuickProviderTest, VisitCountMatches) { - string16 text(ASCIIToUTF16("visitedest")); std::vector<std::string> expected_urls; expected_urls.push_back("http://visitedest.com/y/a"); expected_urls.push_back("http://visitedest.com/y/b"); expected_urls.push_back("http://visitedest.com/x/c"); - RunTest(text, expected_urls, "http://visitedest.com/y/a"); + RunTest(ASCIIToUTF16("visitedest"), expected_urls, + "http://visitedest.com/y/a"); } TEST_F(HistoryQuickProviderTest, TypedCountMatches) { - string16 text(ASCIIToUTF16("typeredest")); std::vector<std::string> expected_urls; expected_urls.push_back("http://typeredest.com/y/a"); expected_urls.push_back("http://typeredest.com/y/b"); expected_urls.push_back("http://typeredest.com/x/c"); - RunTest(text, expected_urls, "http://typeredest.com/y/a"); + RunTest(ASCIIToUTF16("typeredest"), expected_urls, + "http://typeredest.com/y/a"); } TEST_F(HistoryQuickProviderTest, DaysAgoMatches) { - string16 text(ASCIIToUTF16("daysagoest")); std::vector<std::string> expected_urls; expected_urls.push_back("http://daysagoest.com/y/a"); expected_urls.push_back("http://daysagoest.com/y/b"); expected_urls.push_back("http://daysagoest.com/x/c"); - RunTest(text, expected_urls, "http://daysagoest.com/y/a"); + RunTest(ASCIIToUTF16("daysagoest"), expected_urls, + "http://daysagoest.com/y/a"); } TEST_F(HistoryQuickProviderTest, EncodingLimitMatch) { - string16 text(ASCIIToUTF16("ice")); std::vector<std::string> expected_urls; std::string url( "http://cda.com/Dogs%20Cats%20Gorillas%20Sea%20Slugs%20and%20Mice"); expected_urls.push_back(url); - RunTest(text, expected_urls, url); + RunTest(ASCIIToUTF16("ice"), expected_urls, url); // Verify that the matches' ACMatchClassifications offsets are in range. ACMatchClassifications content(ac_matches_[0].contents_class); // The max offset accounts for 6 occurrences of '%20' plus the 'http://'. @@ -293,7 +307,7 @@ TEST_F(HistoryQuickProviderTest, EncodingLimitMatch) { citer != content.end(); ++citer) EXPECT_LT(citer->offset, max_offset); ACMatchClassifications description(ac_matches_[0].description_class); - std::string page_title("Dogs & Cats & Mice"); + std::string page_title("Dogs & Cats & Mice & Other Animals"); for (ACMatchClassifications::const_iterator diter = description.begin(); diter != description.end(); ++diter) EXPECT_LT(diter->offset, page_title.length()); diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc index 658dac3..37db2b4 100644 --- a/chrome/browser/history/in_memory_url_index.cc +++ b/chrome/browser/history/in_memory_url_index.cc @@ -76,31 +76,26 @@ bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1, return m1.raw_score >= m2.raw_score; } -struct InMemoryURLIndex::TermCharWordSet { - TermCharWordSet() // Required for STL resize(). - : char_(0), - word_id_set_(), - used_(false) {} - TermCharWordSet(const char16& uni_char, - const WordIDSet& word_id_set, - bool used) - : char_(uni_char), - word_id_set_(word_id_set), - used_(used) {} - - // Predicate for STL algorithm use. - bool is_not_used() const { return !used_; } - - char16 char_; - WordIDSet word_id_set_; - bool used_; // true if this set has been used for the current term search. -}; +InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem( + const WordIDSet& word_id_set, + const HistoryIDSet& history_id_set) + : word_id_set_(word_id_set), + history_id_set_(history_id_set), + used_(true) {} + +InMemoryURLIndex::SearchTermCacheItem::SearchTermCacheItem() + : used_(true) {} // Comparison function for sorting TermMatches by their offsets. bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) { return m1.offset < m2.offset; } +// Comparison function for sorting search terms by descending length. +bool LengthGreater(const string16& string_a, const string16& string_b) { + return string_a.length() > string_b.length(); +} + // std::accumulate helper function to add up TermMatches' lengths. int AccumulateMatchLength(int total, const TermMatch& match) { return total + match.length; @@ -249,8 +244,8 @@ void InMemoryURLIndex::ClearPrivateData() { word_map_.clear(); char_word_map_.clear(); word_id_history_map_.clear(); - term_char_word_set_cache_.clear(); history_info_map_.clear(); + search_term_cache_.clear(); } bool InMemoryURLIndex::RestoreFromCacheFile() { @@ -349,7 +344,7 @@ void InMemoryURLIndex::UpdateURL(URLID row_id, const URLRow& row) { history_info_map_.erase(row_id); } // This invalidates the cache. - term_char_word_set_cache_.clear(); + search_term_cache_.clear(); // TODO(mrossetti): Record this transaction in the cache. } @@ -361,7 +356,7 @@ void InMemoryURLIndex::DeleteURL(URLID row_id) { // will propagate to the user. history_info_map_.erase(row_id); // This invalidates the word cache. - term_char_word_set_cache_.clear(); + search_term_cache_.clear(); // TODO(mrossetti): Record this transaction in the cache. } @@ -371,9 +366,9 @@ ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( const String16Vector& terms) { ScoredHistoryMatches scored_items; if (!terms.empty()) { - // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- - // and-sweep approach. - ResetTermCharWordSetCache(); + // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep + // approach. + ResetSearchTermCache(); // Lowercase the terms. // TODO(mrossetti): Another opportunity for a transform algorithm. @@ -382,7 +377,7 @@ ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( term_iter != terms.end(); ++term_iter) lower_terms.push_back(base::i18n::ToLower(*term_iter)); - String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); + string16 all_terms(JoinString(lower_terms, ' ')); HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); // Don't perform any scoring (and don't return any matches) if the @@ -409,23 +404,22 @@ ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( } } - // Remove any stale TermCharWordSet's. - term_char_word_set_cache_.erase( - std::remove_if(term_char_word_set_cache_.begin(), - term_char_word_set_cache_.end(), - std::mem_fun_ref(&TermCharWordSet::is_not_used)), - term_char_word_set_cache_.end()); + // Remove any stale SearchTermCacheItems. + for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); + cache_iter != search_term_cache_.end(); ) { + if (!cache_iter->second.used_) + search_term_cache_.erase(cache_iter++); + else + ++cache_iter; + } + return scored_items; } -void InMemoryURLIndex::ResetTermCharWordSetCache() { - // TODO(mrossetti): Consider keeping more of the cache around for possible - // repeat searches, except a 'shortcuts' approach might be better for that. - // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. - for (TermCharWordSetVector::iterator iter = - term_char_word_set_cache_.begin(); - iter != term_char_word_set_cache_.end(); ++iter) - iter->used_ = false; +void InMemoryURLIndex::ResetSearchTermCache() { + for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); + iter != search_term_cache_.end(); ++iter) + iter->second.used_ = false; } InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( @@ -436,49 +430,124 @@ InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( // a string like "http://www.somewebsite.com" which, from our perspective, // is four words: 'http', 'www', 'somewebsite', and 'com'. HistoryIDSet history_id_set; - String16Set words = WordSetFromString16(uni_string); - bool first_word = true; - for (String16Set::iterator iter = words.begin(); - iter != words.end(); ++iter) { - String16Set::value_type uni_word = *iter; - HistoryIDSet term_history_id_set = - InMemoryURLIndex::HistoryIDsForTerm(uni_word); - if (first_word) { - history_id_set.swap(term_history_id_set); - first_word = false; - } else { - HistoryIDSet old_history_id_set(history_id_set); + String16Vector terms = WordVectorFromString16(uni_string, true); + // Sort the terms into the longest first as such are likely to narrow down + // the results quicker. Also, single character terms are the most expensive + // to process so save them for last. + std::sort(terms.begin(), terms.end(), LengthGreater); + for (String16Vector::iterator iter = terms.begin(); iter != terms.end(); + ++iter) { + string16 uni_word = *iter; + HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); + if (term_history_set.empty()) { history_id_set.clear(); - std::set_intersection(old_history_id_set.begin(), - old_history_id_set.end(), - term_history_id_set.begin(), - term_history_id_set.end(), - std::inserter(history_id_set, - history_id_set.begin())); + break; + } + if (iter == terms.begin()) { + history_id_set.swap(term_history_set); + } else { + HistoryIDSet new_history_id_set; + std::set_intersection(history_id_set.begin(), history_id_set.end(), + term_history_set.begin(), term_history_set.end(), + std::inserter(new_history_id_set, + new_history_id_set.begin())); + history_id_set.swap(new_history_id_set); } } return history_id_set; } InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( - const string16& uni_word) { - InMemoryURLIndex::HistoryIDSet history_id_set; - - // For each unique character in the word, in order of first appearance, get - // the char/word_id map entry and intersect with the set in an incremental - // manner. - Char16Vector uni_chars = Char16VectorFromString16(uni_word); - WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); - - // TODO(mrossetti): At this point, as a possible optimization, we could - // scan through all candidate words and make sure the |uni_word| is a - // substring within the candidate words, eliminating those which aren't. - // I'm not sure it would be worth the effort. And remember, we've got to do - // a final substring match in order to get the highlighting ranges later - // in the process in any case. + const string16& term) { + if (term.empty()) + return HistoryIDSet(); + + // TODO(mrossetti): Consider optimizing for very common terms such as + // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently + // occuring words in the user's searches. + + size_t term_length = term.length(); + InMemoryURLIndex::WordIDSet word_id_set; + if (term_length > 1) { + // See if this term or a prefix thereof is present in the cache. + SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); + for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); + cache_iter != search_term_cache_.end(); ++cache_iter) { + if (StartsWith(term, cache_iter->first, false) && + (best_prefix == search_term_cache_.end() || + cache_iter->first.length() > best_prefix->first.length())) + best_prefix = cache_iter; + } + + // If a prefix was found then determine the leftover characters to be used + // for further refining the results from that prefix. + Char16Set prefix_chars; + string16 leftovers(term); + if (best_prefix != search_term_cache_.end()) { + // If the prefix is an exact match for the term then grab the cached + // results and we're done. + size_t prefix_length = best_prefix->first.length(); + if (prefix_length == term_length) { + best_prefix->second.used_ = true; + return best_prefix->second.history_id_set_; + } + + // Otherwise we have a handy starting point. + // If there are no history results for this prefix then we can bail early + // as there will be no history results for the full term. + if (best_prefix->second.history_id_set_.empty()) { + search_term_cache_[term] = SearchTermCacheItem(); + return HistoryIDSet(); + } + word_id_set = best_prefix->second.word_id_set_; + prefix_chars = Char16SetFromString16(best_prefix->first); + leftovers = term.substr(prefix_length); + } + + // Filter for each remaining, unique character in the term. + Char16Set leftover_chars = Char16SetFromString16(leftovers); + Char16Set unique_chars; + std::set_difference(leftover_chars.begin(), leftover_chars.end(), + prefix_chars.begin(), prefix_chars.end(), + std::inserter(unique_chars, unique_chars.begin())); + + // Reduce the word set with any leftover, unprocessed characters. + if (!unique_chars.empty()) { + WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); + // We might come up empty on the leftovers. + if (leftover_set.empty()) { + search_term_cache_[term] = SearchTermCacheItem(); + return HistoryIDSet(); + } + // Or there may not have been a prefix from which to start. + if (prefix_chars.empty()) { + word_id_set.swap(leftover_set); + } else { + WordIDSet new_word_id_set; + std::set_intersection(word_id_set.begin(), word_id_set.end(), + leftover_set.begin(), leftover_set.end(), + std::inserter(new_word_id_set, + new_word_id_set.begin())); + word_id_set.swap(new_word_id_set); + } + } + + // We must filter the word list because the resulting word set surely + // contains words which do not have the search term as a proper subset. + for (WordIDSet::iterator word_set_iter = word_id_set.begin(); + word_set_iter != word_id_set.end(); ) { + if (word_list_[*word_set_iter].find(term) == string16::npos) + word_id_set.erase(word_set_iter++); + else + ++word_set_iter; + } + } else { + word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); + } // If any words resulted then we can compose a set of history IDs by unioning // the sets from each word. + HistoryIDSet history_id_set; if (!word_id_set.empty()) { for (WordIDSet::iterator word_id_iter = word_id_set.begin(); word_id_iter != word_id_set.end(); ++word_id_iter) { @@ -492,6 +561,11 @@ InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( } } + // Record a new cache entry for this word if the term is longer than + // a single character. + if (term_length > 1) + search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); + return history_id_set; } @@ -533,37 +607,22 @@ InMemoryURLIndex::String16Vector InMemoryURLIndex::WordVectorFromString16( } // static -InMemoryURLIndex::Char16Vector InMemoryURLIndex::Char16VectorFromString16( - const string16& uni_word) { - InMemoryURLIndex::Char16Vector characters; - InMemoryURLIndex::Char16Set unique_characters; - for (string16::const_iterator iter = uni_word.begin(); - iter != uni_word.end(); ++iter) { - if (!unique_characters.count(*iter)) { - unique_characters.insert(*iter); - characters.push_back(*iter); - } - } - return characters; -} - -// static InMemoryURLIndex::Char16Set InMemoryURLIndex::Char16SetFromString16( - const string16& uni_word) { + const string16& term) { Char16Set characters; - for (string16::const_iterator iter = uni_word.begin(); - iter != uni_word.end(); ++iter) + for (string16::const_iterator iter = term.begin(); iter != term.end(); + ++iter) characters.insert(*iter); return characters; } -void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, +void InMemoryURLIndex::AddWordToIndex(const string16& term, HistoryID history_id) { - WordMap::iterator word_pos = word_map_.find(uni_word); + WordMap::iterator word_pos = word_map_.find(term); if (word_pos != word_map_.end()) UpdateWordHistory(word_pos->second, history_id); else - AddWordHistory(uni_word, history_id); + AddWordHistory(term, history_id); } void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { @@ -575,20 +634,20 @@ void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { // Add a new word to the word list and the word map, and then create a // new entry in the word/history map. -void InMemoryURLIndex::AddWordHistory(const string16& uni_word, +void InMemoryURLIndex::AddWordHistory(const string16& term, HistoryID history_id) { - word_list_.push_back(uni_word); + word_list_.push_back(term); WordID word_id = word_list_.size() - 1; - word_map_[uni_word] = word_id; + word_map_[term] = word_id; HistoryIDSet history_id_set; history_id_set.insert(history_id); word_id_history_map_[word_id] = history_id_set; // For each character in the newly added word (i.e. a word that is not // already in the word index), add the word to the character index. - Char16Set characters = Char16SetFromString16(uni_word); + Char16Set characters = Char16SetFromString16(term); for (Char16Set::iterator uni_char_iter = characters.begin(); uni_char_iter != characters.end(); ++uni_char_iter) { - Char16Set::value_type uni_char = *uni_char_iter; + char16 uni_char = *uni_char_iter; CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); if (char_iter != char_word_map_.end()) { // Update existing entry in the char/word index. @@ -604,29 +663,11 @@ void InMemoryURLIndex::AddWordHistory(const string16& uni_word, } InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( - const InMemoryURLIndex::Char16Vector& uni_chars) { - size_t index = CachedResultsIndexForTerm(uni_chars); - - // If there were no unprocessed characters in the search term |uni_chars| - // then we can use the cached one as-is as the results with no further - // filtering. - if (index != kNoCachedResultForTerm && index == uni_chars.size() - 1) - return term_char_word_set_cache_[index].word_id_set_; - - // Some or all of the characters remain to be indexed so trim the cache. - if (index + 1 < term_char_word_set_cache_.size()) - term_char_word_set_cache_.resize(index + 1); + const Char16Set& term_chars) { WordIDSet word_id_set; - // Take advantage of our cached starting point, if any. - Char16Vector::const_iterator c_iter = uni_chars.begin(); - if (index != kNoCachedResultForTerm) { - word_id_set = term_char_word_set_cache_[index].word_id_set_; - c_iter += index + 1; - } - // Now process the remaining characters in the search term. - for (; c_iter != uni_chars.end(); ++c_iter) { - Char16Vector::value_type uni_char = *c_iter; - CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); + for (Char16Set::const_iterator c_iter = term_chars.begin(); + c_iter != term_chars.end(); ++c_iter) { + CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); if (char_iter == char_word_map_.end()) { // A character was not found so there are no matching results: bail. word_id_set.clear(); @@ -640,37 +681,22 @@ InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( break; } - if (word_id_set.empty()) { + if (c_iter == term_chars.begin()) { + // First character results becomes base set of results. word_id_set = char_word_id_set; } else { - WordIDSet old_word_id_set(word_id_set); - word_id_set.clear(); - std::set_intersection(old_word_id_set.begin(), - old_word_id_set.end(), - char_word_id_set.begin(), - char_word_id_set.end(), - std::inserter(word_id_set, - word_id_set.begin())); + // Subsequent character results get intersected in. + WordIDSet new_word_id_set; + std::set_intersection(word_id_set.begin(), word_id_set.end(), + char_word_id_set.begin(), char_word_id_set.end(), + std::inserter(new_word_id_set, + new_word_id_set.begin())); + word_id_set.swap(new_word_id_set); } - // Add this new char/set instance to the cache. - term_char_word_set_cache_.push_back(TermCharWordSet( - uni_char, word_id_set, true)); } return word_id_set; } -size_t InMemoryURLIndex::CachedResultsIndexForTerm( - const InMemoryURLIndex::Char16Vector& uni_chars) { - TermCharWordSetVector::iterator t_iter = term_char_word_set_cache_.begin(); - for (Char16Vector::const_iterator c_iter = uni_chars.begin(); - (c_iter != uni_chars.end()) && - (t_iter != term_char_word_set_cache_.end()) && - (*c_iter == t_iter->char_); - ++c_iter, ++t_iter) - t_iter->used_ = true; // Update the cache. - return t_iter - term_char_word_set_cache_.begin() - 1; -} - // static TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, const string16& string, @@ -681,7 +707,7 @@ TermMatches InMemoryURLIndex::MatchTermInString(const string16& term, TermMatches matches; for (size_t location = short_string.find(term); location != string16::npos; location = short_string.find(term, location + 1)) { - matches.push_back(TermMatch(term_num, location, term.size())); + matches.push_back(TermMatch(term_num, location, term.length())); } return matches; } @@ -691,8 +717,7 @@ TermMatches InMemoryURLIndex::SortAndDeoverlap(const TermMatches& matches) { if (matches.empty()) return matches; TermMatches sorted_matches = matches; - std::sort(sorted_matches.begin(), sorted_matches.end(), - MatchOffsetLess); + std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess); TermMatches clean_matches; TermMatch last_match = sorted_matches[0]; clean_matches.push_back(last_match); @@ -775,10 +800,10 @@ ScoredHistoryMatch InMemoryURLIndex::ScoredMatchForURL( // Get partial scores based on term matching. Note that the score for // each of the URL and title are adjusted by the fraction of the // terms appearing in each. - int url_score = ScoreComponentForMatches(match.url_matches, url.size()) * + int url_score = ScoreComponentForMatches(match.url_matches, url.length()) * match.url_matches.size() / terms.size(); int title_score = - ScoreComponentForMatches(match.title_matches, title.size()) * + ScoreComponentForMatches(match.title_matches, title.length()) * match.title_matches.size() / terms.size(); // Arbitrarily pick the best. // TODO(mrossetti): It might make sense that a term which appears in both the @@ -1086,7 +1111,7 @@ void InMemoryURLIndex::SaveHistoryInfoMap( map_entry->set_history_id(iter->first); const URLRow& url_row(iter->second); // Note: We only save information that contributes to the index so there - // is no need to save term_char_word_set_cache_ (not persistent), + // is no need to save search_term_cache_ (not persistent), // languages_, etc. map_entry->set_visit_count(url_row.visit_count()); map_entry->set_typed_count(url_row.typed_count()); diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h index 44ad146..b7ff33d 100644 --- a/chrome/browser/history/in_memory_url_index.h +++ b/chrome/browser/history/in_memory_url_index.h @@ -176,10 +176,12 @@ class InMemoryURLIndex { private: friend class AddHistoryMatch; + friend class InMemoryURLIndexTest; FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheFilePath); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Char16Utilities); + FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, NonUniqueTermCharacterSets); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, StaticFunctions); FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); @@ -214,18 +216,35 @@ class InMemoryURLIndex { typedef std::set<HistoryID> HistoryIDSet; typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; - // Support caching of term character results so that we can optimize - // searches which build upon a previous search. Each entry in this vector - // represents a progressive reduction of the result set for each unique - // character found in the search term, with each character being taken as - // initially encountered. For example, once the results for the search - // term of "mextexarkana" have been fully determined, this vector will - // contain one entry for the characters: m, e, x, t, a, r, k, & n, in - // that order. The result sets will naturally shrink in size for each - // subsequent character as the sets intersections are taken in an - // incremental manner. - struct TermCharWordSet; - typedef std::vector<TermCharWordSet> TermCharWordSetVector; + + // Support caching of term results so that we can optimize searches which + // build upon a previous search. Each entry in this map represents one + // search term from the most recent search. For example, if the user had + // typed "google blog trans" and then typed an additional 'l' (at the end, + // of course) then there would be four items in the cache: 'blog', 'google', + // 'trans', and 'transl'. All would be marked as being in use except for the + // 'trans' item; its cached data would have been used when optimizing the + // construction of the search results candidates for 'transl' but then would + // no longer needed. + // + // Items stored in the search term cache. If a search term exactly matches one + // in the cache then we can quickly supply the proper |history_id_set_| (and + // marking the cache item as being |used_|. If we find a prefix for a search + // term in the cache (which is very likely to occur as the user types each + // term into the omnibox) then we can short-circuit the index search for those + // characters in the prefix by returning the |word_id_set|. In that case we do + // not mark the item as being |used_|. + struct SearchTermCacheItem { + SearchTermCacheItem(const WordIDSet& word_id_set, + const HistoryIDSet& history_id_set); + // Creates a cache item for a term which has no results. + SearchTermCacheItem(); + + WordIDSet word_id_set_; + HistoryIDSet history_id_set_; + bool used_; // True if this item has been used for the current term search. + }; + typedef std::map<string16, SearchTermCacheItem> SearchTermCacheMap; // TODO(rohitrao): Probably replace this with QueryResults. typedef std::vector<URLRow> URLRowVector; @@ -262,19 +281,8 @@ class InMemoryURLIndex { // Breaks a string down into individual words. static String16Set WordSetFromString16(const string16& uni_string); - // Given a vector of Char16s, representing the characters the user has typed - // into the omnibox, finds words containing those characters. If any - // existing, cached set is a proper subset then starts with that cached - // set. Updates the previously-typed-character cache. - WordIDSet WordIDSetForTermChars(const Char16Vector& uni_chars); - - // Given a vector of Char16s in |uni_chars|, compare those characters, in - // order, with the previously searched term, returning the index of the - // cached results in the term_char_word_set_cache_ of the set with best - // matching number of characters. Returns kNoCachedResultForTerm if there - // was no match at all (i.e. the first character of |uni-chars| is not the - // same as the character of the first entry in term_char_word_set_cache_). - size_t CachedResultsIndexForTerm(const Char16Vector& uni_chars); + // Given a set of Char16s, finds words containing those characters. + WordIDSet WordIDSetForTermChars(const Char16Set& term_chars); // Creates a TermMatches which has an entry for each occurrence of the string // |term| found in the string |string|. Mark each match with |term_num| so @@ -289,10 +297,6 @@ class InMemoryURLIndex { // Indexes one URL history item. bool IndexRow(const URLRow& row); - // Breaks a string down into unique, individual characters in the order - // in which the characters are first encountered in the |uni_word| string. - static Char16Vector Char16VectorFromString16(const string16& uni_word); - // Breaks the |uni_word| string down into its individual characters. // Note that this is temporarily intended to work on a single word, but // _will_ work on a string of words, perhaps with unexpected results. @@ -315,18 +319,16 @@ class InMemoryURLIndex { // |history_id| as the initial element of the word's set. void AddWordHistory(const string16& uni_word, HistoryID history_id); - // Clears the search term cache. This cache holds on to the intermediate - // word results for each previously typed character to eliminate the need - // to re-perform set operations for previously typed characters. - void ResetTermCharWordSetCache(); + // Clears |used_| for each item in the search term cache. + void ResetSearchTermCache(); // Composes a set of history item IDs by intersecting the set for each word // in |uni_string|. HistoryIDSet HistoryIDSetFromWords(const string16& uni_string); // Helper function to HistoryIDSetFromWords which composes a set of history - // ids for the given term given in |uni_word|. - HistoryIDSet HistoryIDsForTerm(const string16& uni_word); + // ids for the given term given in |term|. + HistoryIDSet HistoryIDsForTerm(const string16& term); // Calculates a raw score for this history item by first determining // if all of the terms in |terms_vector| occur in |row| and, if so, @@ -405,8 +407,12 @@ class InMemoryURLIndex { WordMap word_map_; CharWordIDMap char_word_map_; WordIDHistoryMap word_id_history_map_; - TermCharWordSetVector term_char_word_set_cache_; HistoryInfoMap history_info_map_; + + // Cache of search terms. + SearchTermCacheMap search_term_cache_; + + // Languages used during the word-breaking process during indexing. std::string languages_; // Only URLs with a whitelisted scheme are indexed. diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc index 776e18f..cc10215 100644 --- a/chrome/browser/history/in_memory_url_index_unittest.cc +++ b/chrome/browser/history/in_memory_url_index_unittest.cc @@ -43,122 +43,165 @@ class InMemoryURLIndexTest : public testing::Test, protected: // Test setup. - virtual void SetUp() { - // Create and populate a working copy of the URL history database. - FilePath history_proto_path; - PathService::Get(chrome::DIR_TEST_DATA, &history_proto_path); - history_proto_path = history_proto_path.Append( - FILE_PATH_LITERAL("History")); - history_proto_path = history_proto_path.Append(TestDBName()); - EXPECT_TRUE(file_util::PathExists(history_proto_path)); - - std::ifstream proto_file(history_proto_path.value().c_str()); - static const size_t kCommandBufferMaxSize = 2048; - char sql_cmd_line[kCommandBufferMaxSize]; - - sql::Connection& db(GetDB()); - { - sql::Transaction transaction(&db); - transaction.Begin(); - while (!proto_file.eof()) { - proto_file.getline(sql_cmd_line, kCommandBufferMaxSize); - if (!proto_file.eof()) { - // We only process lines which begin with a upper-case letter. - // TODO(mrossetti): Can iswupper() be used here? - if (sql_cmd_line[0] >= 'A' && sql_cmd_line[0] <= 'Z') { - std::string sql_cmd(sql_cmd_line); - sql::Statement sql_stmt(db.GetUniqueStatement(sql_cmd_line)); - EXPECT_TRUE(sql_stmt.Run()); - } - } - } - transaction.Commit(); - } - proto_file.close(); - - // Update the last_visit_time table column - // such that it represents a time relative to 'now'. - sql::Statement statement(db.GetUniqueStatement( - "SELECT" HISTORY_URL_ROW_FIELDS "FROM urls;")); - EXPECT_TRUE(statement); - base::Time time_right_now = base::Time::NowFromSystemTime(); - base::TimeDelta day_delta = base::TimeDelta::FromDays(1); - { - sql::Transaction transaction(&db); - transaction.Begin(); - while (statement.Step()) { - URLRow row; - FillURLRow(statement, &row); - base::Time last_visit = time_right_now; - for (int64 i = row.last_visit().ToInternalValue(); i > 0; --i) - last_visit -= day_delta; - row.set_last_visit(last_visit); - UpdateURLRow(row.id(), row); - } - transaction.Commit(); - } - } + virtual void SetUp(); - virtual FilePath::StringType TestDBName() const { - return FILE_PATH_LITERAL("url_history_provider_test.db.txt"); - } + // Allows the database containing the test data to be customized by + // subclasses. + virtual FilePath::StringType TestDBName() const; + // Convenience function to create a URLRow with basic data for |url|, |title|, + // |visit_count|, and |typed_count|. |last_visit_ago| gives the number of + // days from now to set the URL's last_visit. URLRow MakeURLRow(const char* url, const char* title, int visit_count, int last_visit_ago, - int typed_count) { - URLRow row(GURL(url), 0); - row.set_title(UTF8ToUTF16(title)); - row.set_visit_count(visit_count); - row.set_typed_count(typed_count); - row.set_last_visit(base::Time::NowFromSystemTime() - - base::TimeDelta::FromDays(last_visit_ago)); - return row; - } - - InMemoryURLIndex::String16Vector Make1Term(const char* term) { - InMemoryURLIndex::String16Vector terms; - terms.push_back(UTF8ToUTF16(term)); - return terms; - } + int typed_count); + // Convenience functions for easily creating vectors of search terms. + InMemoryURLIndex::String16Vector Make1Term(const char* term) const; InMemoryURLIndex::String16Vector Make2Terms(const char* term_1, - const char* term_2) { - InMemoryURLIndex::String16Vector terms; - terms.push_back(UTF8ToUTF16(term_1)); - terms.push_back(UTF8ToUTF16(term_2)); - return terms; - } + const char* term_2) const; + + // Validates that the given |term| is contained in |cache| and that it is + // marked as in-use. + void CheckTerm(const InMemoryURLIndex::SearchTermCacheMap& cache, + string16 term) const; scoped_ptr<InMemoryURLIndex> url_index_; }; +void InMemoryURLIndexTest::SetUp() { + // Create and populate a working copy of the URL history database. + FilePath history_proto_path; + PathService::Get(chrome::DIR_TEST_DATA, &history_proto_path); + history_proto_path = history_proto_path.Append( + FILE_PATH_LITERAL("History")); + history_proto_path = history_proto_path.Append(TestDBName()); + EXPECT_TRUE(file_util::PathExists(history_proto_path)); + + std::ifstream proto_file(history_proto_path.value().c_str()); + static const size_t kCommandBufferMaxSize = 2048; + char sql_cmd_line[kCommandBufferMaxSize]; + + sql::Connection& db(GetDB()); + { + sql::Transaction transaction(&db); + transaction.Begin(); + while (!proto_file.eof()) { + proto_file.getline(sql_cmd_line, kCommandBufferMaxSize); + if (!proto_file.eof()) { + // We only process lines which begin with a upper-case letter. + // TODO(mrossetti): Can iswupper() be used here? + if (sql_cmd_line[0] >= 'A' && sql_cmd_line[0] <= 'Z') { + std::string sql_cmd(sql_cmd_line); + sql::Statement sql_stmt(db.GetUniqueStatement(sql_cmd_line)); + EXPECT_TRUE(sql_stmt.Run()); + } + } + } + transaction.Commit(); + } + proto_file.close(); + + // Update the last_visit_time table column + // such that it represents a time relative to 'now'. + sql::Statement statement(db.GetUniqueStatement( + "SELECT" HISTORY_URL_ROW_FIELDS "FROM urls;")); + EXPECT_TRUE(statement); + base::Time time_right_now = base::Time::NowFromSystemTime(); + base::TimeDelta day_delta = base::TimeDelta::FromDays(1); + { + sql::Transaction transaction(&db); + transaction.Begin(); + while (statement.Step()) { + URLRow row; + FillURLRow(statement, &row); + base::Time last_visit = time_right_now; + for (int64 i = row.last_visit().ToInternalValue(); i > 0; --i) + last_visit -= day_delta; + row.set_last_visit(last_visit); + UpdateURLRow(row.id(), row); + } + transaction.Commit(); + } +} + +FilePath::StringType InMemoryURLIndexTest::TestDBName() const { + return FILE_PATH_LITERAL("url_history_provider_test.db.txt"); +} + +URLRow InMemoryURLIndexTest::MakeURLRow(const char* url, + const char* title, + int visit_count, + int last_visit_ago, + int typed_count) { + URLRow row(GURL(url), 0); + row.set_title(UTF8ToUTF16(title)); + row.set_visit_count(visit_count); + row.set_typed_count(typed_count); + row.set_last_visit(base::Time::NowFromSystemTime() - + base::TimeDelta::FromDays(last_visit_ago)); + return row; +} + +InMemoryURLIndex::String16Vector InMemoryURLIndexTest::Make1Term( + const char* term) const { + InMemoryURLIndex::String16Vector terms; + terms.push_back(UTF8ToUTF16(term)); + return terms; +} + +InMemoryURLIndex::String16Vector InMemoryURLIndexTest::Make2Terms( + const char* term_1, + const char* term_2) const { + InMemoryURLIndex::String16Vector terms; + terms.push_back(UTF8ToUTF16(term_1)); + terms.push_back(UTF8ToUTF16(term_2)); + return terms; +} + +void InMemoryURLIndexTest::CheckTerm( + const InMemoryURLIndex::SearchTermCacheMap& cache, + string16 term) const { + InMemoryURLIndex::SearchTermCacheMap::const_iterator cache_iter( + cache.find(term)); + ASSERT_NE(cache.end(), cache_iter) + << "Cache does not contain '" << term << "' but should."; + InMemoryURLIndex::SearchTermCacheItem cache_item = cache_iter->second; + EXPECT_TRUE(cache_item.used_) + << "Cache item '" << term << "' should be marked as being in use."; +} + class LimitedInMemoryURLIndexTest : public InMemoryURLIndexTest { protected: - FilePath::StringType TestDBName() const { - return FILE_PATH_LITERAL("url_history_provider_test_limited.db.txt"); - } + FilePath::StringType TestDBName() const; }; +FilePath::StringType LimitedInMemoryURLIndexTest::TestDBName() const { + return FILE_PATH_LITERAL("url_history_provider_test_limited.db.txt"); +} + class ExpandedInMemoryURLIndexTest : public InMemoryURLIndexTest { protected: - virtual void SetUp() { - InMemoryURLIndexTest::SetUp(); - // Add 600 more history items. - // NOTE: Keep the string length constant at least the length of the format - // string plus 5 to account for a 3 digit number and terminator. - char url_format[] = "http://www.google.com/%d"; - const size_t kMaxLen = arraysize(url_format) + 5; - char url_string[kMaxLen + 1]; - for (int i = 0; i < 600; ++i) { - base::snprintf(url_string, kMaxLen, url_format, i); - URLRow row(MakeURLRow(url_string, "Google Search", 20, 0, 20)); - AddURL(row); - } - } + virtual void SetUp(); }; +void ExpandedInMemoryURLIndexTest::SetUp() { + InMemoryURLIndexTest::SetUp(); + // Add 600 more history items. + // NOTE: Keep the string length constant at least the length of the format + // string plus 5 to account for a 3 digit number and terminator. + char url_format[] = "http://www.google.com/%d"; + const size_t kMaxLen = arraysize(url_format) + 5; + char url_string[kMaxLen + 1]; + for (int i = 0; i < 600; ++i) { + base::snprintf(url_string, kMaxLen, url_format, i); + URLRow row(MakeURLRow(url_string, "Google Search", 20, 0, 20)); + AddURL(row); + } +} + TEST_F(InMemoryURLIndexTest, Construction) { url_index_.reset(new InMemoryURLIndex(FilePath(FILE_PATH_LITERAL("/dummy")))); EXPECT_TRUE(url_index_.get()); @@ -185,13 +228,12 @@ TEST_F(LimitedInMemoryURLIndexTest, Initialization) { TEST_F(InMemoryURLIndexTest, Retrieval) { url_index_.reset(new InMemoryURLIndex(FilePath(FILE_PATH_LITERAL("/dummy")))); url_index_->Init(this, "en,ja,hi,zh"); - InMemoryURLIndex::String16Vector terms; // The term will be lowercased by the search. // See if a very specific term gives a single result. - terms.push_back(ASCIIToUTF16("DrudgeReport")); - ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(terms); - EXPECT_EQ(1U, matches.size()); + ScoredHistoryMatches matches = + url_index_->HistoryItemsForTerms(Make1Term("DrudgeReport")); + ASSERT_EQ(1U, matches.size()); // Verify that we got back the result we expected. EXPECT_EQ(5, matches[0].url_info.id()); @@ -199,18 +241,14 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { EXPECT_EQ(ASCIIToUTF16("DRUDGE REPORT 2010"), matches[0].url_info.title()); // Search which should result in multiple results. - terms.clear(); - terms.push_back(ASCIIToUTF16("drudge")); - matches = url_index_->HistoryItemsForTerms(terms); + matches = url_index_->HistoryItemsForTerms(Make1Term("drudge")); ASSERT_EQ(2U, matches.size()); // The results should be in descending score order. EXPECT_GE(matches[0].raw_score, matches[1].raw_score); // Search which should result in nearly perfect result. - terms.clear(); - terms.push_back(ASCIIToUTF16("https")); - terms.push_back(ASCIIToUTF16("NearlyPerfectResult")); - matches = url_index_->HistoryItemsForTerms(terms); + matches = url_index_->HistoryItemsForTerms(Make2Terms("https", + "NearlyPerfectResult")); ASSERT_EQ(1U, matches.size()); // The results should have a very high score. EXPECT_GT(matches[0].raw_score, 900); @@ -221,7 +259,7 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { matches[0].url_info.title()); // Search which should result in very poor result. - terms.clear(); + InMemoryURLIndex::String16Vector terms; terms.push_back(ASCIIToUTF16("z")); terms.push_back(ASCIIToUTF16("y")); terms.push_back(ASCIIToUTF16("x")); @@ -236,26 +274,21 @@ TEST_F(InMemoryURLIndexTest, Retrieval) { matches[0].url_info.title()); // Search which will match at the end of an URL with encoded characters. - terms.clear(); - terms.push_back(ASCIIToUTF16("ice")); - matches = url_index_->HistoryItemsForTerms(terms); + matches = url_index_->HistoryItemsForTerms(Make1Term("ice")); ASSERT_EQ(1U, matches.size()); } TEST_F(ExpandedInMemoryURLIndexTest, ShortCircuit) { url_index_.reset(new InMemoryURLIndex(FilePath(FILE_PATH_LITERAL("/dummy")))); url_index_->Init(this, "en,ja,hi,zh"); - InMemoryURLIndex::String16Vector terms; // A search for 'w' should short-circuit and not return any matches. - terms.push_back(ASCIIToUTF16("w")); - ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(terms); + ScoredHistoryMatches matches = + url_index_->HistoryItemsForTerms(Make1Term("w")); EXPECT_TRUE(matches.empty()); // A search for 'working' should not short-circuit. - terms.clear(); - terms.push_back(ASCIIToUTF16("working")); - matches = url_index_->HistoryItemsForTerms(terms); + matches = url_index_->HistoryItemsForTerms(Make1Term("working")); EXPECT_EQ(1U, matches.size()); } @@ -271,7 +304,7 @@ TEST_F(InMemoryURLIndexTest, TitleSearch) { terms.push_back(ASCIIToUTF16("RATE")); terms.push_back(ASCIIToUTF16("DROPS")); ScoredHistoryMatches matches = url_index_->HistoryItemsForTerms(terms); - EXPECT_EQ(1U, matches.size()); + ASSERT_EQ(1U, matches.size()); // Verify that we got back the result we expected. EXPECT_EQ(1, matches[0].url_info.id()); @@ -282,19 +315,34 @@ TEST_F(InMemoryURLIndexTest, TitleSearch) { matches[0].url_info.title()); } -TEST_F(InMemoryURLIndexTest, Char16Utilities) { - string16 term = ASCIIToUTF16("drudgereport"); - string16 expected = ASCIIToUTF16("drugepot"); - EXPECT_EQ(expected.size(), - InMemoryURLIndex::Char16SetFromString16(term).size()); - InMemoryURLIndex::Char16Vector c_vector = - InMemoryURLIndex::Char16VectorFromString16(term); - ASSERT_EQ(expected.size(), c_vector.size()); - - InMemoryURLIndex::Char16Vector::const_iterator c_iter = c_vector.begin(); - for (string16::const_iterator s_iter = expected.begin(); - s_iter != expected.end(); ++s_iter, ++c_iter) - EXPECT_EQ(*s_iter, *c_iter); +TEST_F(InMemoryURLIndexTest, NonUniqueTermCharacterSets) { + url_index_.reset(new InMemoryURLIndex()); + url_index_->Init(this, "en,ja,hi,zh"); + + // The presence of duplicate characters should succeed. Exercise by cycling + // through a string with several duplicate characters. + ScoredHistoryMatches matches = + url_index_->HistoryItemsForTerms(Make1Term("ABRA")); + ASSERT_EQ(1U, matches.size()); + EXPECT_EQ(28, matches[0].url_info.id()); + EXPECT_EQ("http://www.ddj.com/windows/184416623", + matches[0].url_info.url().spec()); + + matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACAD")); + ASSERT_EQ(1U, matches.size()); + EXPECT_EQ(28, matches[0].url_info.id()); + + matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACADABRA")); + ASSERT_EQ(1U, matches.size()); + EXPECT_EQ(28, matches[0].url_info.id()); + + matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACADABR")); + ASSERT_EQ(1U, matches.size()); + EXPECT_EQ(28, matches[0].url_info.id()); + + matches = url_index_->HistoryItemsForTerms(Make1Term("ABRACA")); + ASSERT_EQ(1U, matches.size()); + EXPECT_EQ(28, matches[0].url_info.id()); } TEST_F(InMemoryURLIndexTest, StaticFunctions) { @@ -396,48 +444,74 @@ TEST_F(InMemoryURLIndexTest, TypedCharacterCaching) { // Verify that match results for previously typed characters are retained // (in the term_char_word_set_cache_) and reused, if possible, in future // autocompletes. + typedef InMemoryURLIndex::SearchTermCacheMap::iterator CacheIter; + typedef InMemoryURLIndex::SearchTermCacheItem CacheItem; + url_index_.reset(new InMemoryURLIndex(FilePath(FILE_PATH_LITERAL("/dummy")))); url_index_->Init(this, "en,ja,hi,zh"); - // Verify that we can find something that already exists. - InMemoryURLIndex::String16Vector terms; - string16 term = ASCIIToUTF16("drudgerepo"); - terms.push_back(term); - EXPECT_EQ(1U, url_index_->HistoryItemsForTerms(terms).size()); + InMemoryURLIndex::SearchTermCacheMap& cache(url_index_->search_term_cache_); - { - // Exercise the term matching cache with the same term. - InMemoryURLIndex::Char16Vector uni_chars = - InMemoryURLIndex::Char16VectorFromString16(term); - EXPECT_EQ(uni_chars.size(), 7U); // Equivalent to 'degopru' - EXPECT_EQ(6U, url_index_->CachedResultsIndexForTerm(uni_chars)); - } - - { - // Back off a character. - InMemoryURLIndex::Char16Vector uni_chars = - InMemoryURLIndex::Char16VectorFromString16(ASCIIToUTF16("drudgerep")); - EXPECT_EQ(6U, uni_chars.size()); // Equivalent to 'degpru' - EXPECT_EQ(5U, url_index_->CachedResultsIndexForTerm(uni_chars)); - } + // The cache should be empty at this point. + EXPECT_EQ(0U, cache.size()); - { - // Add a couple of characters. - InMemoryURLIndex::Char16Vector uni_chars = - InMemoryURLIndex::Char16VectorFromString16( - ASCIIToUTF16("drudgereporta")); - EXPECT_EQ(9U, uni_chars.size()); // Equivalent to 'adegoprtu' - EXPECT_EQ(6U, url_index_->CachedResultsIndexForTerm(uni_chars)); - } + // Now simulate typing search terms into the omnibox and check the state of + // the cache as each item is 'typed'. - { - // Use different string. - InMemoryURLIndex::Char16Vector uni_chars = - InMemoryURLIndex::Char16VectorFromString16(ASCIIToUTF16("abcde")); - EXPECT_EQ(5U, uni_chars.size()); - EXPECT_EQ(static_cast<size_t>(-1), - url_index_->CachedResultsIndexForTerm(uni_chars)); - } + // Simulate typing "r" giving "r" in the simulated omnibox. The results for + // 'r' will be not cached because it is only 1 character long. + InMemoryURLIndex::String16Vector terms; + string16 term_r = ASCIIToUTF16("r"); + terms.push_back(term_r); + url_index_->HistoryItemsForTerms(terms); + EXPECT_EQ(0U, cache.size()); + + // Simulate typing "re" giving "r re" in the simulated omnibox. + string16 term_re = ASCIIToUTF16("re"); + terms.push_back(term_re); + // 're' should be cached at this point but not 'r' as it is a single + // character. + ASSERT_EQ(2U, terms.size()); + url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(1U, cache.size()); + CheckTerm(cache, term_re); + + // Simulate typing "reco" giving "r re reco" in the simulated omnibox. + string16 term_reco = ASCIIToUTF16("reco"); + terms.push_back(term_reco); + // 're' and 'reco' should be cached at this point but not 'r' as it is a + // single character. + url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(2U, cache.size()); + CheckTerm(cache, term_re); + CheckTerm(cache, term_reco); + + terms.clear(); // Simulate pressing <ESC>. + + // Simulate typing "mort". + string16 term_mort = ASCIIToUTF16("mort"); + terms.push_back(term_mort); + // Since we now have only one search term, the cached results for 're' and + // 'reco' should be purged, giving us only 1 item in the cache (for 'mort'). + url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(1U, cache.size()); + CheckTerm(cache, term_mort); + + // Simulate typing "reco" giving "mort reco" in the simulated omnibox. + terms.push_back(term_reco); + url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(2U, cache.size()); + CheckTerm(cache, term_mort); + CheckTerm(cache, term_reco); + + // Simulate a <DELETE> by removing the 'reco' and adding back the 'rec'. + terms.resize(terms.size() - 1); + string16 term_rec = ASCIIToUTF16("rec"); + terms.push_back(term_rec); + url_index_->HistoryItemsForTerms(terms); + ASSERT_EQ(2U, cache.size()); + CheckTerm(cache, term_mort); + CheckTerm(cache, term_rec); } TEST_F(InMemoryURLIndexTest, Scoring) { diff --git a/chrome/test/data/History/url_history_provider_test.db.txt b/chrome/test/data/History/url_history_provider_test.db.txt index 5fd7821..9db0ca9 100644 --- a/chrome/test/data/History/url_history_provider_test.db.txt +++ b/chrome/test/data/History/url_history_provider_test.db.txt @@ -50,7 +50,7 @@ INSERT INTO "urls" VALUES(24,'http://www.danilatos.com/event-test/ExperimentTest INSERT INTO "urls" VALUES(25,'http://www.codeguru.com/','CodeGuru : codeguru',6,6,0,0,110); // Qualifies INSERT INTO "urls" VALUES(26,'http://www.codeproject.com/','Your Development Resource - CodeProject',6,6,0,0,369); // Qualifies INSERT INTO "urls" VALUES(27,'http://www.tomshardware.com/us/#redir','Tom''s Hardware: Hardware News, Tests and Reviews',6,6,0,0,65); // Qualifies -INSERT INTO "urls" VALUES(28,'http://www.ddj.com/windows/184416623','Dr. Dobb''s | Avoiding the Visual C++ Runtime Library | 2 1, 2003',6,6,0,0,0); // Qualifies +INSERT INTO "urls" VALUES(28,'http://www.ddj.com/windows/184416623','Dr. ABRACADABRA''s | Avoiding the Visual C++ Runtime Library | 2 1, 2003',6,6,0,0,0); // Qualifies INSERT INTO "urls" VALUES(29,'http://svcs.cnn.com/weather/getForecast?time=34&mode=json_html&zipCode=336736767676&locCode=EGLL&celcius=true&csiID=csi2','',6,6,0,1,0); // Qualifies INSERT INTO "urls" VALUES(30,'http://www.drudgery.com/Dogs%20and%20Mice','Life in the Slow Lane',8,2,2,0,0); // Qualifies INSERT INTO "urls" VALUES(31,'http://www.redrudgerydo.com/','Music of the Wild Landscape',0,0,6,0,0); |