// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/omnibox/browser/url_index_private_data.h" #include #include #include #include #include #include #include "base/basictypes.h" #include "base/files/file_util.h" #include "base/i18n/break_iterator.h" #include "base/i18n/case_conversion.h" #include "base/metrics/histogram.h" #include "base/strings/string_split.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" #include "base/time/time.h" #include "components/bookmarks/browser/bookmark_model.h" #include "components/bookmarks/browser/bookmark_utils.h" #include "components/history/core/browser/history_database.h" #include "components/history/core/browser/history_db_task.h" #include "components/history/core/browser/history_service.h" #include "components/omnibox/browser/in_memory_url_index.h" #include "components/url_formatter/url_formatter.h" #if defined(USE_SYSTEM_PROTOBUF) #include #else #include "third_party/protobuf/src/google/protobuf/repeated_field.h" #endif using google::protobuf::RepeatedField; using google::protobuf::RepeatedPtrField; using in_memory_url_index::InMemoryURLIndexCacheItem; namespace { static const size_t kMaxVisitsToStoreInCache = 10u; } typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem WordListItem; typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry WordMapEntry; typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem; typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem CharWordMapItem; typedef in_memory_url_index:: InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry; typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem WordIDHistoryMapItem; typedef in_memory_url_index:: InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry WordIDHistoryMapEntry; typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem HistoryInfoMapItem; typedef in_memory_url_index:: InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry HistoryInfoMapEntry; typedef in_memory_url_index:: InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo HistoryInfoMapEntry_VisitInfo; typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem WordStartsMapItem; typedef in_memory_url_index:: InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry WordStartsMapEntry; // Algorithm Functions --------------------------------------------------------- // Comparison function for sorting search terms by descending length. bool LengthGreater(const base::string16& string_a, const base::string16& string_b) { return string_a.length() > string_b.length(); } // UpdateRecentVisitsFromHistoryDBTask ----------------------------------------- // HistoryDBTask used to update the recent visit data for a particular // row from the history database. class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask { public: explicit UpdateRecentVisitsFromHistoryDBTask( URLIndexPrivateData* private_data, history::URLID url_id); bool RunOnDBThread(history::HistoryBackend* backend, history::HistoryDatabase* db) override; void DoneRunOnMainThread() override; private: ~UpdateRecentVisitsFromHistoryDBTask() override; // The URLIndexPrivateData that gets updated after the historyDB // task returns. URLIndexPrivateData* private_data_; // The ID of the URL to get visits for and then update. history::URLID url_id_; // Whether fetching the recent visits for the URL succeeded. bool succeeded_; // The awaited data that's shown to private_data_ for it to copy and // store. history::VisitVector recent_visits_; DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask); }; UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask( URLIndexPrivateData* private_data, history::URLID url_id) : private_data_(private_data), url_id_(url_id), succeeded_(false) { } bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread( history::HistoryBackend* backend, history::HistoryDatabase* db) { // Make sure the private data is going to get as many recent visits as // ScoredHistoryMatch::GetFrequency() hopes to use. DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); succeeded_ = db->GetMostRecentVisitsForURL(url_id_, kMaxVisitsToStoreInCache, &recent_visits_); if (!succeeded_) recent_visits_.clear(); return true; // Always claim to be done; do not retry failures. } void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() { if (succeeded_) private_data_->UpdateRecentVisits(url_id_, recent_visits_); } UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() { } // URLIndexPrivateData --------------------------------------------------------- URLIndexPrivateData::URLIndexPrivateData() : restored_cache_version_(0), saved_cache_version_(kCurrentCacheFileVersion), pre_filter_item_count_(0), post_filter_item_count_(0), post_scoring_item_count_(0) { } ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms( base::string16 search_string, size_t cursor_position, size_t max_matches, const std::string& languages, bookmarks::BookmarkModel* bookmark_model) { // If cursor position is set and useful (not at either end of the // string), allow the search string to be broken at cursor position. // We do this by pretending there's a space where the cursor is. if ((cursor_position != base::string16::npos) && (cursor_position < search_string.length()) && (cursor_position > 0)) { search_string.insert(cursor_position, base::ASCIIToUTF16(" ")); } pre_filter_item_count_ = 0; post_filter_item_count_ = 0; post_scoring_item_count_ = 0; // The search string we receive may contain escaped characters. For reducing // the index we need individual, lower-cased words, ignoring escapings. For // the final filtering we need whitespace separated substrings possibly // containing escaped characters. base::string16 lower_raw_string(base::i18n::ToLower(search_string)); base::string16 lower_unescaped_string = net::UnescapeURLComponent(lower_raw_string, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); // Extract individual 'words' (as opposed to 'terms'; see below) from the // search string. When the user types "colspec=ID%20Mstone Release" we get // four 'words': "colspec", "id", "mstone" and "release". String16Vector lower_words( String16VectorFromString16(lower_unescaped_string, false, NULL)); ScoredHistoryMatches scored_items; // Do nothing if we have indexed no words (probably because we've not been // initialized yet) or the search string has no words. if (word_list_.empty() || lower_words.empty()) { search_term_cache_.clear(); // Invalidate the term cache. return scored_items; } // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep // approach. ResetSearchTermCache(); HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words); // Trim the candidate pool if it is large. Note that we do not filter out // items that do not contain the search terms as proper substrings -- doing // so is the performance-costly operation we are trying to avoid in order // to maintain omnibox responsiveness. const size_t kItemsToScoreLimit = 500; pre_filter_item_count_ = history_id_set.size(); // If we trim the results set we do not want to cache the results for next // time as the user's ultimately desired result could easily be eliminated // in this early rough filter. bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit); if (was_trimmed) { HistoryIDVector history_ids; std::copy(history_id_set.begin(), history_id_set.end(), std::back_inserter(history_ids)); // Trim down the set by sorting by typed-count, visit-count, and last // visit. HistoryItemFactorGreater item_factor_functor(history_info_map_); std::partial_sort(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, history_ids.end(), item_factor_functor); history_id_set.clear(); std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit, std::inserter(history_id_set, history_id_set.end())); post_filter_item_count_ = history_id_set.size(); } // Pass over all of the candidates filtering out any without a proper // substring match, inserting those which pass in order by score. Note that // in this step we are using the raw search string complete with escaped // URL elements. When the user has specifically typed something akin to // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that // specific substring appears in the URL or page title. // We call these 'terms' (as opposed to 'words'; see above) as in this case // we only want to break up the search string on 'true' whitespace rather than // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we // get two 'terms': "colspec=id%20mstone" and "release". String16Vector lower_raw_terms = base::SplitString( lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE, base::SPLIT_WANT_NONEMPTY); if (lower_raw_terms.empty()) { // Don't score matches when there are no terms to score against. (It's // possible that the word break iterater that extracts words to search // for in the database allows some whitespace "words" whereas SplitString // excludes a long list of whitespace.) One could write a scoring // function that gives a reasonable order to matches when there // are no terms (i.e., all the words are some form of whitespace), // but this is such a rare edge case that it's not worth the time. return scored_items; } scored_items = std::for_each( history_id_set.begin(), history_id_set.end(), AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string, lower_raw_terms, base::Time::Now())).ScoredMatches(); // Select and sort only the top |max_matches| results. if (scored_items.size() > max_matches) { std::partial_sort(scored_items.begin(), scored_items.begin() + max_matches, scored_items.end(), ScoredHistoryMatch::MatchScoreGreater); scored_items.resize(max_matches); } else { std::sort(scored_items.begin(), scored_items.end(), ScoredHistoryMatch::MatchScoreGreater); } post_scoring_item_count_ = scored_items.size(); if (was_trimmed) { search_term_cache_.clear(); // Invalidate the term cache. } else { // Remove any stale SearchTermCacheItems. for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); cache_iter != search_term_cache_.end(); ) { if (!cache_iter->second.used_) search_term_cache_.erase(cache_iter++); else ++cache_iter; } } return scored_items; } bool URLIndexPrivateData::UpdateURL( history::HistoryService* history_service, const history::URLRow& row, const std::string& languages, const std::set& scheme_whitelist, base::CancelableTaskTracker* tracker) { // The row may or may not already be in our index. If it is not already // indexed and it qualifies then it gets indexed. If it is already // indexed and still qualifies then it gets updated, otherwise it // is deleted from the index. bool row_was_updated = false; history::URLID row_id = row.id(); HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id); if (row_pos == history_info_map_.end()) { // This new row should be indexed if it qualifies. history::URLRow new_row(row); new_row.set_id(row_id); row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) && IndexRow(NULL, history_service, new_row, languages, scheme_whitelist, tracker); } else if (RowQualifiesAsSignificant(row, base::Time())) { // This indexed row still qualifies and will be re-indexed. // The url won't have changed but the title, visit count, etc. // might have changed. history::URLRow& row_to_update = row_pos->second.url_row; bool title_updated = row_to_update.title() != row.title(); if (row_to_update.visit_count() != row.visit_count() || row_to_update.typed_count() != row.typed_count() || row_to_update.last_visit() != row.last_visit() || title_updated) { row_to_update.set_visit_count(row.visit_count()); row_to_update.set_typed_count(row.typed_count()); row_to_update.set_last_visit(row.last_visit()); // If something appears to have changed, update the recent visits // information. ScheduleUpdateRecentVisits(history_service, row_id, tracker); // While the URL is guaranteed to remain stable, the title may have // changed. If so, then update the index with the changed words. if (title_updated) { // Clear all words associated with this row and re-index both the // URL and title. RemoveRowWordsFromIndex(row_to_update); row_to_update.set_title(row.title()); RowWordStarts word_starts; AddRowWordsToIndex(row_to_update, &word_starts, languages); word_starts_map_[row_id] = word_starts; } row_was_updated = true; } } else { // This indexed row no longer qualifies and will be de-indexed by // clearing all words associated with this row. RemoveRowFromIndex(row); row_was_updated = true; } if (row_was_updated) search_term_cache_.clear(); // This invalidates the cache. return row_was_updated; } void URLIndexPrivateData::UpdateRecentVisits( history::URLID url_id, const history::VisitVector& recent_visits) { HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id); if (row_pos != history_info_map_.end()) { VisitInfoVector* visits = &row_pos->second.visits; visits->clear(); const size_t size = std::min(recent_visits.size(), kMaxVisitsToStoreInCache); visits->reserve(size); for (size_t i = 0; i < size; i++) { // Copy from the history::VisitVector the only fields visits needs. visits->push_back(std::make_pair(recent_visits[i].visit_time, recent_visits[i].transition)); } } // Else: Oddly, the URL doesn't seem to exist in the private index. // Ignore this update. This can happen if, for instance, the user // removes the URL from URLIndexPrivateData before the historyDB call // returns. } void URLIndexPrivateData::ScheduleUpdateRecentVisits( history::HistoryService* history_service, history::URLID url_id, base::CancelableTaskTracker* tracker) { history_service->ScheduleDBTask( scoped_ptr( new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker); } // Helper functor for DeleteURL. class HistoryInfoMapItemHasURL { public: explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {} bool operator()(const std::pair& item) { return item.second.url_row.url() == url_; } private: const GURL& url_; }; bool URLIndexPrivateData::DeleteURL(const GURL& url) { // Find the matching entry in the history_info_map_. HistoryInfoMap::iterator pos = std::find_if( history_info_map_.begin(), history_info_map_.end(), HistoryInfoMapItemHasURL(url)); if (pos == history_info_map_.end()) return false; RemoveRowFromIndex(pos->second.url_row); search_term_cache_.clear(); // This invalidates the cache. return true; } // static scoped_refptr URLIndexPrivateData::RestoreFromFile( const base::FilePath& file_path, const std::string& languages) { base::TimeTicks beginning_time = base::TimeTicks::Now(); if (!base::PathExists(file_path)) return NULL; std::string data; // If there is no cache file then simply give up. This will cause us to // attempt to rebuild from the history database. if (!base::ReadFileToString(file_path, &data)) return NULL; scoped_refptr restored_data(new URLIndexPrivateData); InMemoryURLIndexCacheItem index_cache; if (!index_cache.ParseFromArray(data.c_str(), data.size())) { LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from " << file_path.value(); return restored_data; } if (!restored_data->RestorePrivateData(index_cache, languages)) return NULL; UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime", base::TimeTicks::Now() - beginning_time); UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", restored_data->history_id_word_map_.size()); UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size()); UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", restored_data->word_map_.size()); UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", restored_data->char_word_map_.size()); if (restored_data->Empty()) return NULL; // 'No data' is the same as a failed reload. return restored_data; } // static scoped_refptr URLIndexPrivateData::RebuildFromHistory( history::HistoryDatabase* history_db, const std::string& languages, const std::set& scheme_whitelist) { if (!history_db) return NULL; base::TimeTicks beginning_time = base::TimeTicks::Now(); scoped_refptr rebuilt_data(new URLIndexPrivateData); history::URLDatabase::URLEnumerator history_enum; if (!history_db->InitURLEnumeratorForSignificant(&history_enum)) return NULL; rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now(); for (history::URLRow row; history_enum.GetNextURL(&row);) { rebuilt_data->IndexRow( history_db, NULL, row, languages, scheme_whitelist, NULL); } UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime", base::TimeTicks::Now() - beginning_time); UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems", rebuilt_data->history_id_word_map_.size()); UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords", rebuilt_data->word_map_.size()); UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars", rebuilt_data->char_word_map_.size()); return rebuilt_data; } // static bool URLIndexPrivateData::WritePrivateDataToCacheFileTask( scoped_refptr private_data, const base::FilePath& file_path) { DCHECK(private_data.get()); DCHECK(!file_path.empty()); return private_data->SaveToFile(file_path); } scoped_refptr URLIndexPrivateData::Duplicate() const { scoped_refptr data_copy = new URLIndexPrivateData; data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_; data_copy->word_list_ = word_list_; data_copy->available_words_ = available_words_; data_copy->word_map_ = word_map_; data_copy->char_word_map_ = char_word_map_; data_copy->word_id_history_map_ = word_id_history_map_; data_copy->history_id_word_map_ = history_id_word_map_; data_copy->history_info_map_ = history_info_map_; data_copy->word_starts_map_ = word_starts_map_; return data_copy; // Not copied: // search_term_cache_ // pre_filter_item_count_ // post_filter_item_count_ // post_scoring_item_count_ } bool URLIndexPrivateData::Empty() const { return history_info_map_.empty(); } void URLIndexPrivateData::Clear() { last_time_rebuilt_from_history_ = base::Time(); word_list_.clear(); available_words_.clear(); word_map_.clear(); char_word_map_.clear(); word_id_history_map_.clear(); history_id_word_map_.clear(); history_info_map_.clear(); word_starts_map_.clear(); } URLIndexPrivateData::~URLIndexPrivateData() {} HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords( const String16Vector& unsorted_words) { // Break the terms down into individual terms (words), get the candidate // set for each term, and intersect each to get a final candidate list. // Note that a single 'term' from the user's perspective might be // a string like "http://www.somewebsite.com" which, from our perspective, // is four words: 'http', 'www', 'somewebsite', and 'com'. HistoryIDSet history_id_set; String16Vector words(unsorted_words); // Sort the words into the longest first as such are likely to narrow down // the results quicker. Also, single character words are the most expensive // to process so save them for last. std::sort(words.begin(), words.end(), LengthGreater); for (String16Vector::iterator iter = words.begin(); iter != words.end(); ++iter) { base::string16 uni_word = *iter; HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word); if (term_history_set.empty()) { history_id_set.clear(); break; } if (iter == words.begin()) { history_id_set.swap(term_history_set); } else { HistoryIDSet new_history_id_set = base::STLSetIntersection( history_id_set, term_history_set); history_id_set.swap(new_history_id_set); } } return history_id_set; } HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm( const base::string16& term) { if (term.empty()) return HistoryIDSet(); // TODO(mrossetti): Consider optimizing for very common terms such as // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently // occuring words in the user's searches. size_t term_length = term.length(); WordIDSet word_id_set; if (term_length > 1) { // See if this term or a prefix thereof is present in the cache. base::string16 term_lower = base::i18n::ToLower(term); SearchTermCacheMap::iterator best_prefix(search_term_cache_.end()); for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin(); cache_iter != search_term_cache_.end(); ++cache_iter) { if (base::StartsWith(term_lower, base::i18n::ToLower(cache_iter->first), base::CompareCase::SENSITIVE) && (best_prefix == search_term_cache_.end() || cache_iter->first.length() > best_prefix->first.length())) best_prefix = cache_iter; } // If a prefix was found then determine the leftover characters to be used // for further refining the results from that prefix. Char16Set prefix_chars; base::string16 leftovers(term); if (best_prefix != search_term_cache_.end()) { // If the prefix is an exact match for the term then grab the cached // results and we're done. size_t prefix_length = best_prefix->first.length(); if (prefix_length == term_length) { best_prefix->second.used_ = true; return best_prefix->second.history_id_set_; } // Otherwise we have a handy starting point. // If there are no history results for this prefix then we can bail early // as there will be no history results for the full term. if (best_prefix->second.history_id_set_.empty()) { search_term_cache_[term] = SearchTermCacheItem(); return HistoryIDSet(); } word_id_set = best_prefix->second.word_id_set_; prefix_chars = Char16SetFromString16(best_prefix->first); leftovers = term.substr(prefix_length); } // Filter for each remaining, unique character in the term. Char16Set leftover_chars = Char16SetFromString16(leftovers); Char16Set unique_chars = base::STLSetDifference(leftover_chars, prefix_chars); // Reduce the word set with any leftover, unprocessed characters. if (!unique_chars.empty()) { WordIDSet leftover_set(WordIDSetForTermChars(unique_chars)); // We might come up empty on the leftovers. if (leftover_set.empty()) { search_term_cache_[term] = SearchTermCacheItem(); return HistoryIDSet(); } // Or there may not have been a prefix from which to start. if (prefix_chars.empty()) { word_id_set.swap(leftover_set); } else { WordIDSet new_word_id_set = base::STLSetIntersection( word_id_set, leftover_set); word_id_set.swap(new_word_id_set); } } // We must filter the word list because the resulting word set surely // contains words which do not have the search term as a proper subset. for (WordIDSet::iterator word_set_iter = word_id_set.begin(); word_set_iter != word_id_set.end(); ) { if (word_list_[*word_set_iter].find(term) == base::string16::npos) word_id_set.erase(word_set_iter++); else ++word_set_iter; } } else { word_id_set = WordIDSetForTermChars(Char16SetFromString16(term)); } // If any words resulted then we can compose a set of history IDs by unioning // the sets from each word. HistoryIDSet history_id_set; if (!word_id_set.empty()) { for (WordIDSet::iterator word_id_iter = word_id_set.begin(); word_id_iter != word_id_set.end(); ++word_id_iter) { WordID word_id = *word_id_iter; WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); if (word_iter != word_id_history_map_.end()) { HistoryIDSet& word_history_id_set(word_iter->second); history_id_set.insert(word_history_id_set.begin(), word_history_id_set.end()); } } } // Record a new cache entry for this word if the term is longer than // a single character. if (term_length > 1) search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set); return history_id_set; } WordIDSet URLIndexPrivateData::WordIDSetForTermChars( const Char16Set& term_chars) { WordIDSet word_id_set; for (Char16Set::const_iterator c_iter = term_chars.begin(); c_iter != term_chars.end(); ++c_iter) { CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter); if (char_iter == char_word_map_.end()) { // A character was not found so there are no matching results: bail. word_id_set.clear(); break; } WordIDSet& char_word_id_set(char_iter->second); // It is possible for there to no longer be any words associated with // a particular character. Give up in that case. if (char_word_id_set.empty()) { word_id_set.clear(); break; } if (c_iter == term_chars.begin()) { // First character results becomes base set of results. word_id_set = char_word_id_set; } else { // Subsequent character results get intersected in. WordIDSet new_word_id_set = base::STLSetIntersection( word_id_set, char_word_id_set); word_id_set.swap(new_word_id_set); } } return word_id_set; } bool URLIndexPrivateData::IndexRow( history::HistoryDatabase* history_db, history::HistoryService* history_service, const history::URLRow& row, const std::string& languages, const std::set& scheme_whitelist, base::CancelableTaskTracker* tracker) { const GURL& gurl(row.url()); // Index only URLs with a whitelisted scheme. if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist)) return false; history::URLID row_id = row.id(); // Strip out username and password before saving and indexing. base::string16 url(url_formatter::FormatUrl( gurl, languages, url_formatter::kFormatUrlOmitUsernamePassword, net::UnescapeRule::NONE, nullptr, nullptr, nullptr)); HistoryID history_id = static_cast(row_id); DCHECK_LT(history_id, std::numeric_limits::max()); // Add the row for quick lookup in the history info store. history::URLRow new_row(GURL(url), row_id); new_row.set_visit_count(row.visit_count()); new_row.set_typed_count(row.typed_count()); new_row.set_last_visit(row.last_visit()); new_row.set_title(row.title()); history_info_map_[history_id].url_row = new_row; // Index the words contained in the URL and title of the row. RowWordStarts word_starts; AddRowWordsToIndex(new_row, &word_starts, languages); word_starts_map_[history_id] = word_starts; // Update the recent visits information or schedule the update // as appropriate. if (history_db) { // We'd like to check that we're on the history DB thread. // However, unittest code actually calls this on the UI thread. // So we don't do any thread checks. history::VisitVector recent_visits; // Make sure the private data is going to get as many recent visits as // ScoredHistoryMatch::GetFrequency() hopes to use. DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore); if (history_db->GetMostRecentVisitsForURL(row_id, kMaxVisitsToStoreInCache, &recent_visits)) UpdateRecentVisits(row_id, recent_visits); } else { DCHECK(tracker); DCHECK(history_service); ScheduleUpdateRecentVisits(history_service, row_id, tracker); } return true; } void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row, RowWordStarts* word_starts, const std::string& languages) { HistoryID history_id = static_cast(row.id()); // Split URL into individual, unique words then add in the title words. const GURL& gurl(row.url()); const base::string16& url = bookmarks::CleanUpUrlForMatching(gurl, languages, NULL); String16Set url_words = String16SetFromString16(url, word_starts ? &word_starts->url_word_starts_ : NULL); const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title()); String16Set title_words = String16SetFromString16(title, word_starts ? &word_starts->title_word_starts_ : NULL); String16Set words = base::STLSetUnion(url_words, title_words); for (String16Set::iterator word_iter = words.begin(); word_iter != words.end(); ++word_iter) AddWordToIndex(*word_iter, history_id); search_term_cache_.clear(); // Invalidate the term cache. } void URLIndexPrivateData::AddWordToIndex(const base::string16& term, HistoryID history_id) { WordMap::iterator word_pos = word_map_.find(term); if (word_pos != word_map_.end()) UpdateWordHistory(word_pos->second, history_id); else AddWordHistory(term, history_id); } void URLIndexPrivateData::AddWordHistory(const base::string16& term, HistoryID history_id) { WordID word_id = word_list_.size(); if (available_words_.empty()) { word_list_.push_back(term); } else { word_id = *(available_words_.begin()); word_list_[word_id] = term; available_words_.erase(word_id); } word_map_[term] = word_id; HistoryIDSet history_id_set; history_id_set.insert(history_id); word_id_history_map_[word_id] = history_id_set; AddToHistoryIDWordMap(history_id, word_id); // For each character in the newly added word (i.e. a word that is not // already in the word index), add the word to the character index. Char16Set characters = Char16SetFromString16(term); for (Char16Set::iterator uni_char_iter = characters.begin(); uni_char_iter != characters.end(); ++uni_char_iter) { base::char16 uni_char = *uni_char_iter; CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); if (char_iter != char_word_map_.end()) { // Update existing entry in the char/word index. WordIDSet& word_id_set(char_iter->second); word_id_set.insert(word_id); } else { // Create a new entry in the char/word index. WordIDSet word_id_set; word_id_set.insert(word_id); char_word_map_[uni_char] = word_id_set; } } } void URLIndexPrivateData::UpdateWordHistory(WordID word_id, HistoryID history_id) { WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); DCHECK(history_pos != word_id_history_map_.end()); HistoryIDSet& history_id_set(history_pos->second); history_id_set.insert(history_id); AddToHistoryIDWordMap(history_id, word_id); } void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id, WordID word_id) { HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id); if (iter != history_id_word_map_.end()) { WordIDSet& word_id_set(iter->second); word_id_set.insert(word_id); } else { WordIDSet word_id_set; word_id_set.insert(word_id); history_id_word_map_[history_id] = word_id_set; } } void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) { RemoveRowWordsFromIndex(row); HistoryID history_id = static_cast(row.id()); history_info_map_.erase(history_id); word_starts_map_.erase(history_id); } void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) { // Remove the entries in history_id_word_map_ and word_id_history_map_ for // this row. HistoryID history_id = static_cast(row.id()); WordIDSet word_id_set = history_id_word_map_[history_id]; history_id_word_map_.erase(history_id); // Reconcile any changes to word usage. for (WordIDSet::iterator word_id_iter = word_id_set.begin(); word_id_iter != word_id_set.end(); ++word_id_iter) { WordID word_id = *word_id_iter; word_id_history_map_[word_id].erase(history_id); if (!word_id_history_map_[word_id].empty()) continue; // The word is still in use. // The word is no longer in use. Reconcile any changes to character usage. base::string16 word = word_list_[word_id]; Char16Set characters = Char16SetFromString16(word); for (Char16Set::iterator uni_char_iter = characters.begin(); uni_char_iter != characters.end(); ++uni_char_iter) { base::char16 uni_char = *uni_char_iter; char_word_map_[uni_char].erase(word_id); if (char_word_map_[uni_char].empty()) char_word_map_.erase(uni_char); // No longer in use. } // Complete the removal of references to the word. word_id_history_map_.erase(word_id); word_map_.erase(word); word_list_[word_id] = base::string16(); available_words_.insert(word_id); } } void URLIndexPrivateData::ResetSearchTermCache() { for (SearchTermCacheMap::iterator iter = search_term_cache_.begin(); iter != search_term_cache_.end(); ++iter) iter->second.used_ = false; } bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) { base::TimeTicks beginning_time = base::TimeTicks::Now(); InMemoryURLIndexCacheItem index_cache; SavePrivateData(&index_cache); std::string data; if (!index_cache.SerializeToString(&data)) { LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache."; return false; } int size = data.size(); if (base::WriteFile(file_path, data.c_str(), size) != size) { LOG(WARNING) << "Failed to write " << file_path.value(); return false; } UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime", base::TimeTicks::Now() - beginning_time); return true; } void URLIndexPrivateData::SavePrivateData( InMemoryURLIndexCacheItem* cache) const { DCHECK(cache); cache->set_last_rebuild_timestamp( last_time_rebuilt_from_history_.ToInternalValue()); cache->set_version(saved_cache_version_); // history_item_count_ is no longer used but rather than change the protobuf // definition use a placeholder. This will go away with the switch to SQLite. cache->set_history_item_count(0); SaveWordList(cache); SaveWordMap(cache); SaveCharWordMap(cache); SaveWordIDHistoryMap(cache); SaveHistoryInfoMap(cache); SaveWordStartsMap(cache); } void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const { if (word_list_.empty()) return; WordListItem* list_item = cache->mutable_word_list(); list_item->set_word_count(word_list_.size()); for (String16Vector::const_iterator iter = word_list_.begin(); iter != word_list_.end(); ++iter) list_item->add_word(base::UTF16ToUTF8(*iter)); } void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const { if (word_map_.empty()) return; WordMapItem* map_item = cache->mutable_word_map(); map_item->set_item_count(word_map_.size()); for (WordMap::const_iterator iter = word_map_.begin(); iter != word_map_.end(); ++iter) { WordMapEntry* map_entry = map_item->add_word_map_entry(); map_entry->set_word(base::UTF16ToUTF8(iter->first)); map_entry->set_word_id(iter->second); } } void URLIndexPrivateData::SaveCharWordMap( InMemoryURLIndexCacheItem* cache) const { if (char_word_map_.empty()) return; CharWordMapItem* map_item = cache->mutable_char_word_map(); map_item->set_item_count(char_word_map_.size()); for (CharWordIDMap::const_iterator iter = char_word_map_.begin(); iter != char_word_map_.end(); ++iter) { CharWordMapEntry* map_entry = map_item->add_char_word_map_entry(); map_entry->set_char_16(iter->first); const WordIDSet& word_id_set(iter->second); map_entry->set_item_count(word_id_set.size()); for (WordIDSet::const_iterator set_iter = word_id_set.begin(); set_iter != word_id_set.end(); ++set_iter) map_entry->add_word_id(*set_iter); } } void URLIndexPrivateData::SaveWordIDHistoryMap( InMemoryURLIndexCacheItem* cache) const { if (word_id_history_map_.empty()) return; WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map(); map_item->set_item_count(word_id_history_map_.size()); for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin(); iter != word_id_history_map_.end(); ++iter) { WordIDHistoryMapEntry* map_entry = map_item->add_word_id_history_map_entry(); map_entry->set_word_id(iter->first); const HistoryIDSet& history_id_set(iter->second); map_entry->set_item_count(history_id_set.size()); for (HistoryIDSet::const_iterator set_iter = history_id_set.begin(); set_iter != history_id_set.end(); ++set_iter) map_entry->add_history_id(*set_iter); } } void URLIndexPrivateData::SaveHistoryInfoMap( InMemoryURLIndexCacheItem* cache) const { if (history_info_map_.empty()) return; HistoryInfoMapItem* map_item = cache->mutable_history_info_map(); map_item->set_item_count(history_info_map_.size()); for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); iter != history_info_map_.end(); ++iter) { HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry(); map_entry->set_history_id(iter->first); const history::URLRow& url_row(iter->second.url_row); // Note: We only save information that contributes to the index so there // is no need to save search_term_cache_ (not persistent). map_entry->set_visit_count(url_row.visit_count()); map_entry->set_typed_count(url_row.typed_count()); map_entry->set_last_visit(url_row.last_visit().ToInternalValue()); map_entry->set_url(url_row.url().spec()); map_entry->set_title(base::UTF16ToUTF8(url_row.title())); const VisitInfoVector& visits(iter->second.visits); for (VisitInfoVector::const_iterator visit_iter = visits.begin(); visit_iter != visits.end(); ++visit_iter) { HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits(); visit_info->set_visit_time(visit_iter->first.ToInternalValue()); visit_info->set_transition_type(visit_iter->second); } } } void URLIndexPrivateData::SaveWordStartsMap( InMemoryURLIndexCacheItem* cache) const { if (word_starts_map_.empty()) return; // For unit testing: Enable saving of the cache as an earlier version to // allow testing of cache file upgrading in ReadFromFile(). // TODO(mrossetti): Instead of intruding on production code with this kind of // test harness, save a copy of an older version cache with known results. // Implement this when switching the caching over to SQLite. if (saved_cache_version_ < 1) return; WordStartsMapItem* map_item = cache->mutable_word_starts_map(); map_item->set_item_count(word_starts_map_.size()); for (WordStartsMap::const_iterator iter = word_starts_map_.begin(); iter != word_starts_map_.end(); ++iter) { WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry(); map_entry->set_history_id(iter->first); const RowWordStarts& word_starts(iter->second); for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin(); i != word_starts.url_word_starts_.end(); ++i) map_entry->add_url_word_starts(*i); for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin(); i != word_starts.title_word_starts_.end(); ++i) map_entry->add_title_word_starts(*i); } } bool URLIndexPrivateData::RestorePrivateData( const InMemoryURLIndexCacheItem& cache, const std::string& languages) { last_time_rebuilt_from_history_ = base::Time::FromInternalValue(cache.last_rebuild_timestamp()); const base::TimeDelta rebuilt_ago = base::Time::Now() - last_time_rebuilt_from_history_; if ((rebuilt_ago > base::TimeDelta::FromDays(7)) || (rebuilt_ago < base::TimeDelta::FromDays(-1))) { // Cache is more than a week old or, somehow, from some time in the future. // It's probably a good time to rebuild the index from history to // allow synced entries to now appear, expired entries to disappear, etc. // Allow one day in the future to make the cache not rebuild on simple // system clock changes such as time zone changes. return false; } if (cache.has_version()) { if (cache.version() < kCurrentCacheFileVersion) { // Don't try to restore an old format cache file. (This will cause // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData // from history.) return false; } restored_cache_version_ = cache.version(); } return RestoreWordList(cache) && RestoreWordMap(cache) && RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) && RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages); } bool URLIndexPrivateData::RestoreWordList( const InMemoryURLIndexCacheItem& cache) { if (!cache.has_word_list()) return false; const WordListItem& list_item(cache.word_list()); uint32 expected_item_count = list_item.word_count(); uint32 actual_item_count = list_item.word_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; const RepeatedPtrField& words(list_item.word()); for (RepeatedPtrField::const_iterator iter = words.begin(); iter != words.end(); ++iter) word_list_.push_back(base::UTF8ToUTF16(*iter)); return true; } bool URLIndexPrivateData::RestoreWordMap( const InMemoryURLIndexCacheItem& cache) { if (!cache.has_word_map()) return false; const WordMapItem& list_item(cache.word_map()); uint32 expected_item_count = list_item.item_count(); uint32 actual_item_count = list_item.word_map_entry_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; const RepeatedPtrField& entries(list_item.word_map_entry()); for (RepeatedPtrField::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id(); return true; } bool URLIndexPrivateData::RestoreCharWordMap( const InMemoryURLIndexCacheItem& cache) { if (!cache.has_char_word_map()) return false; const CharWordMapItem& list_item(cache.char_word_map()); uint32 expected_item_count = list_item.item_count(); uint32 actual_item_count = list_item.char_word_map_entry_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; const RepeatedPtrField& entries(list_item.char_word_map_entry()); for (RepeatedPtrField::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) { expected_item_count = iter->item_count(); actual_item_count = iter->word_id_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; base::char16 uni_char = static_cast(iter->char_16()); WordIDSet word_id_set; const RepeatedField& word_ids(iter->word_id()); for (RepeatedField::const_iterator jiter = word_ids.begin(); jiter != word_ids.end(); ++jiter) word_id_set.insert(*jiter); char_word_map_[uni_char] = word_id_set; } return true; } bool URLIndexPrivateData::RestoreWordIDHistoryMap( const InMemoryURLIndexCacheItem& cache) { if (!cache.has_word_id_history_map()) return false; const WordIDHistoryMapItem& list_item(cache.word_id_history_map()); uint32 expected_item_count = list_item.item_count(); uint32 actual_item_count = list_item.word_id_history_map_entry_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; const RepeatedPtrField& entries(list_item.word_id_history_map_entry()); for (RepeatedPtrField::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) { expected_item_count = iter->item_count(); actual_item_count = iter->history_id_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; WordID word_id = iter->word_id(); HistoryIDSet history_id_set; const RepeatedField& history_ids(iter->history_id()); for (RepeatedField::const_iterator jiter = history_ids.begin(); jiter != history_ids.end(); ++jiter) { history_id_set.insert(*jiter); AddToHistoryIDWordMap(*jiter, word_id); } word_id_history_map_[word_id] = history_id_set; } return true; } bool URLIndexPrivateData::RestoreHistoryInfoMap( const InMemoryURLIndexCacheItem& cache) { if (!cache.has_history_info_map()) return false; const HistoryInfoMapItem& list_item(cache.history_info_map()); uint32 expected_item_count = list_item.item_count(); uint32 actual_item_count = list_item.history_info_map_entry_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; const RepeatedPtrField& entries(list_item.history_info_map_entry()); for (RepeatedPtrField::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) { HistoryID history_id = iter->history_id(); GURL url(iter->url()); history::URLRow url_row(url, history_id); url_row.set_visit_count(iter->visit_count()); url_row.set_typed_count(iter->typed_count()); url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit())); if (iter->has_title()) { base::string16 title(base::UTF8ToUTF16(iter->title())); url_row.set_title(title); } history_info_map_[history_id].url_row = url_row; // Restore visits list. VisitInfoVector visits; visits.reserve(iter->visits_size()); for (int i = 0; i < iter->visits_size(); ++i) { visits.push_back(std::make_pair( base::Time::FromInternalValue(iter->visits(i).visit_time()), ui::PageTransitionFromInt(iter->visits(i).transition_type()))); } history_info_map_[history_id].visits = visits; } return true; } bool URLIndexPrivateData::RestoreWordStartsMap( const InMemoryURLIndexCacheItem& cache, const std::string& languages) { // Note that this function must be called after RestoreHistoryInfoMap() has // been run as the word starts may have to be recalculated from the urls and // page titles. if (cache.has_word_starts_map()) { const WordStartsMapItem& list_item(cache.word_starts_map()); uint32 expected_item_count = list_item.item_count(); uint32 actual_item_count = list_item.word_starts_map_entry_size(); if (actual_item_count == 0 || actual_item_count != expected_item_count) return false; const RepeatedPtrField& entries(list_item.word_starts_map_entry()); for (RepeatedPtrField::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) { HistoryID history_id = iter->history_id(); RowWordStarts word_starts; // Restore the URL word starts. const RepeatedField& url_starts(iter->url_word_starts()); for (RepeatedField::const_iterator jiter = url_starts.begin(); jiter != url_starts.end(); ++jiter) word_starts.url_word_starts_.push_back(*jiter); // Restore the page title word starts. const RepeatedField& title_starts(iter->title_word_starts()); for (RepeatedField::const_iterator jiter = title_starts.begin(); jiter != title_starts.end(); ++jiter) word_starts.title_word_starts_.push_back(*jiter); word_starts_map_[history_id] = word_starts; } } else { // Since the cache did not contain any word starts we must rebuild then from // the URL and page titles. for (HistoryInfoMap::const_iterator iter = history_info_map_.begin(); iter != history_info_map_.end(); ++iter) { RowWordStarts word_starts; const history::URLRow& row(iter->second.url_row); const base::string16& url = bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL); String16VectorFromString16(url, false, &word_starts.url_word_starts_); const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title()); String16VectorFromString16(title, false, &word_starts.title_word_starts_); word_starts_map_[iter->first] = word_starts; } } return true; } // static bool URLIndexPrivateData::URLSchemeIsWhitelisted( const GURL& gurl, const std::set& whitelist) { return whitelist.find(gurl.scheme()) != whitelist.end(); } // SearchTermCacheItem --------------------------------------------------------- URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem( const WordIDSet& word_id_set, const HistoryIDSet& history_id_set) : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) { } URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) { } URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() { } // URLIndexPrivateData::AddHistoryMatch ---------------------------------------- URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch( bookmarks::BookmarkModel* bookmark_model, const URLIndexPrivateData& private_data, const std::string& languages, const base::string16& lower_string, const String16Vector& lower_terms, const base::Time now) : bookmark_model_(bookmark_model), private_data_(private_data), languages_(languages), lower_string_(lower_string), lower_terms_(lower_terms), now_(now) { // Calculate offsets for each term. For instance, the offset for // ".net" should be 1, indicating that the actual word-part of the term // starts at offset 1. lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u); for (size_t i = 0; i < lower_terms_.size(); ++i) { base::i18n::BreakIterator iter(lower_terms_[i], base::i18n::BreakIterator::BREAK_WORD); // If the iterator doesn't work, assume an offset of 0. if (!iter.Init()) continue; // Find the first word start. If the iterator didn't find a word break, set // an offset of term size. For example, the offset for "://" should be 3, // indicating that the word-part is missing. while (iter.Advance() && !iter.IsWord()) {} lower_terms_to_word_starts_offsets_[i] = iter.prev(); } } URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() { } void URLIndexPrivateData::AddHistoryMatch::operator()( const HistoryID history_id) { HistoryInfoMap::const_iterator hist_pos = private_data_.history_info_map_.find(history_id); if (hist_pos != private_data_.history_info_map_.end()) { const history::URLRow& hist_item = hist_pos->second.url_row; const VisitInfoVector& visits = hist_pos->second.visits; WordStartsMap::const_iterator starts_pos = private_data_.word_starts_map_.find(history_id); DCHECK(starts_pos != private_data_.word_starts_map_.end()); ScoredHistoryMatch match( hist_item, visits, languages_, lower_string_, lower_terms_, lower_terms_to_word_starts_offsets_, starts_pos->second, bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()), now_); if (match.raw_score > 0) scored_matches_.push_back(match); } } // URLIndexPrivateData::HistoryItemFactorGreater ------------------------------- URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater( const HistoryInfoMap& history_info_map) : history_info_map_(history_info_map) { } URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() { } bool URLIndexPrivateData::HistoryItemFactorGreater::operator()( const HistoryID h1, const HistoryID h2) { HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1)); if (entry1 == history_info_map_.end()) return false; HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2)); if (entry2 == history_info_map_.end()) return true; const history::URLRow& r1(entry1->second.url_row); const history::URLRow& r2(entry2->second.url_row); // First cut: typed count, visit count, recency. // TODO(mrossetti): This is too simplistic. Consider an approach which ranks // recently visited (within the last 12/24 hours) as highly important. Get // input from mpearson. if (r1.typed_count() != r2.typed_count()) return (r1.typed_count() > r2.typed_count()); if (r1.visit_count() != r2.visit_count()) return (r1.visit_count() > r2.visit_count()); return (r1.last_visit() > r2.last_visit()); }