// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/browser/history/in_memory_url_index.h" #include #include #include "app/l10n_util.h" #include "base/i18n/word_iterator.h" #include "base/string_util.h" #include "base/time.h" #include "base/utf_string_conversions.h" #include "chrome/browser/autocomplete/history_provider_util.h" #include "chrome/browser/history/url_database.h" #include "net/base/escape.h" #include "net/base/net_util.h" #include "unicode/utypes.h" // for int32_t using base::Time; using base::TimeDelta; namespace history { ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, size_t input_location, bool match_in_scheme, bool innermost_match, int score) : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), raw_score(score) { } struct InMemoryURLIndex::TermCharWordSet { TermCharWordSet(const Char16Set& char_set, const WordIDSet& word_id_set, bool used) : char_set_(char_set), word_id_set_(word_id_set), used_(used) {} bool IsNotUsed() const { return !used_; } Char16Set char_set_; WordIDSet word_id_set_; bool used_; // true if this set has been used for the current term search. }; InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} InMemoryURLIndex::~InMemoryURLIndex() {} static const int32_t kURLToLowerBufferSize = 2048; // Indexing bool InMemoryURLIndex::Init(history::URLDatabase* history_db, const std::string& languages) { // TODO(mrossetti): Register for profile/language change notifications. languages_ = languages; // Reset our indexes. char_word_map_.clear(); word_id_history_map_.clear(); if (!history_db) return false; URLDatabase::URLEnumerator history_enum; if (history_db->InitURLEnumeratorForEverything(&history_enum)) { URLRow row; Time recent_threshold = InMemoryURLIndex::RecentThreshold(); while (history_enum.GetNextURL(&row)) { // Do some filtering so that we only get history items which could // possibly pass the HistoryURLProvider::CullPoorMatches filter later. if ((row.typed_count() > kLowQualityMatchTypedLimit) || (row.visit_count() > kLowQualityMatchVisitLimit) || (row.last_visit() >= recent_threshold)) { if (!IndexRow(row)) return false; } } } return true; } bool InMemoryURLIndex::IndexRow(const URLRow& row) { const GURL& gurl(row.url()); string16 url(net::FormatUrl(gurl, languages_, net::kFormatUrlOmitUsernamePassword, UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, NULL, NULL, NULL)); // TODO(mrossetti): Detect row_id > std::numeric_limits::max(). HistoryID history_id = static_cast(row.id()); // Add the row for quick lookup in the history info store. url = l10n_util::ToLower(url); URLRow new_row(GURL(url), row.id()); new_row.set_visit_count(row.visit_count()); new_row.set_typed_count(row.typed_count()); new_row.set_last_visit(row.last_visit()); new_row.set_title(row.title()); history_info_map_.insert(std::make_pair(history_id, new_row)); // Split into individual, unique words. String16Set words = WordSetFromString16(url); // For each word, add a new entry into the word index referring to the // associated history item. for (String16Set::iterator iter = words.begin(); iter != words.end(); ++iter) { String16Set::value_type uni_word = *iter; AddWordToIndex(uni_word, history_id); } ++history_item_count_; return true; } // Searching ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( const String16Vector& terms) { ScoredHistoryMatches scored_items; if (!terms.empty()) { // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- // and-sweep approach. ResetTermCharWordSetCache(); // Lowercase the terms. // TODO(mrossetti): Another opportunity for a transform algorithm. String16Vector lower_terms; for (String16Vector::const_iterator term_iter = terms.begin(); term_iter != terms.end(); ++term_iter) { String16Vector::value_type lower_string(*term_iter); std::transform(lower_string.begin(), lower_string.end(), lower_string.begin(), tolower); lower_terms.push_back(lower_string); } String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); // Pass over all of the candidates filtering out any without a proper // substring match, inserting those which pass in order by score. scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), AddHistoryMatch(*this, lower_terms)).ScoredMatches(); } // Remove any stale TermCharWordSet's. term_char_word_set_cache_.erase( std::remove_if(term_char_word_set_cache_.begin(), term_char_word_set_cache_.end(), std::mem_fun_ref(&TermCharWordSet::IsNotUsed)), term_char_word_set_cache_.end()); return scored_items; } void InMemoryURLIndex::ResetTermCharWordSetCache() { // TODO(mrossetti): Consider keeping more of the cache around for possible // repeat searches, except a 'shortcuts' approach might be better for that. // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); iter != term_char_word_set_cache_.end(); ++iter) iter->used_ = false; } InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( const string16& uni_string) { // Break the terms down into individual terms (words), get the candidate // set for each term, and intersect each to get a final candidate list. // Note that a single 'term' from the user's perspective might be // a string like "http://www.somewebsite.com" which, from our perspective, // is four words: 'http', 'www', 'somewebsite', and 'com'. HistoryIDSet history_id_set; String16Set words = WordSetFromString16(uni_string); bool first_word = true; for (String16Set::iterator iter = words.begin(); iter != words.end(); ++iter) { String16Set::value_type uni_word = *iter; HistoryIDSet term_history_id_set = InMemoryURLIndex::HistoryIDsForTerm(uni_word); if (first_word) { history_id_set.swap(term_history_id_set); first_word = false; } else { HistoryIDSet old_history_id_set(history_id_set); history_id_set.clear(); std::set_intersection(old_history_id_set.begin(), old_history_id_set.end(), term_history_id_set.begin(), term_history_id_set.end(), std::inserter(history_id_set, history_id_set.begin())); } } return history_id_set; } InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( const string16& uni_word) { InMemoryURLIndex::HistoryIDSet history_id_set; // For each character in the word, get the char/word_id map entry and // intersect with the set in an incremental manner. Char16Set uni_chars = CharactersFromString16(uni_word); WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); // If any words resulted then we can compose a set of history IDs by unioning // the sets from each word. if (!word_id_set.empty()) { for (WordIDSet::iterator word_id_iter = word_id_set.begin(); word_id_iter != word_id_set.end(); ++word_id_iter) { WordID word_id = *word_id_iter; WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); if (word_iter != word_id_history_map_.end()) { HistoryIDSet& word_history_id_set(word_iter->second); history_id_set.insert(word_history_id_set.begin(), word_history_id_set.end()); } } } return history_id_set; } // Utility Functions // static InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( const string16& uni_string) { String16Set words; WordIterator iter(&uni_string, WordIterator::BREAK_WORD); if (iter.Init()) { while (iter.Advance()) { if (iter.IsWord()) words.insert(iter.GetWord()); } } return words; } // static InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( const string16& uni_word) { Char16Set characters; for (string16::const_iterator iter = uni_word.begin(); iter != uni_word.end(); ++iter) characters.insert(*iter); return characters; } void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, HistoryID history_id) { WordMap::iterator word_pos = word_map_.find(uni_word); if (word_pos != word_map_.end()) UpdateWordHistory(word_pos->second, history_id); else AddWordHistory(uni_word, history_id); } void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); DCHECK(history_pos != word_id_history_map_.end()); HistoryIDSet& history_id_set(history_pos->second); history_id_set.insert(history_id); } // Add a new word to the word list and the word map, and then create a // new entry in the word/history map. void InMemoryURLIndex::AddWordHistory(const string16& uni_word, HistoryID history_id) { word_list_.push_back(uni_word); WordID word_id = word_list_.size() - 1; word_map_.insert(std::make_pair(uni_word, word_id)); HistoryIDSet history_id_set; history_id_set.insert(history_id); word_id_history_map_.insert(std::make_pair(word_id, history_id_set)); // For each character in the newly added word (i.e. a word that is not // already in the word index), add the word to the character index. Char16Set characters = CharactersFromString16(uni_word); for (Char16Set::iterator uni_char_iter = characters.begin(); uni_char_iter != characters.end(); ++uni_char_iter) { Char16Set::value_type uni_string = *uni_char_iter; CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); if (char_iter != char_word_map_.end()) { // Update existing entry in the char/word index. WordIDSet& word_id_set(char_iter->second); word_id_set.insert(word_id); } else { // Create a new entry in the char/word index. WordIDSet word_id_set; word_id_set.insert(word_id); char_word_map_.insert(std::make_pair(uni_string, word_id_set)); } } } InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( InMemoryURLIndex::Char16Set const& uni_chars) { TermCharWordSet* best_term_char_word_set = NULL; bool set_found = false; size_t outstanding_count = 0; Char16Set outstanding_chars; for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); iter != term_char_word_set_cache_.end(); ++iter) { TermCharWordSetVector::value_type& term_char_word_set(*iter); Char16Set& char_set(term_char_word_set.char_set_); Char16Set exclusions; std::set_difference(char_set.begin(), char_set.end(), uni_chars.begin(), uni_chars.end(), std::inserter(exclusions, exclusions.begin())); if (exclusions.empty()) { // Do a reverse difference so that we know which characters remain // to be indexed. Then decide if this is a better match than any // previous cached set. std::set_difference(uni_chars.begin(), uni_chars.end(), char_set.begin(), char_set.end(), std::inserter(exclusions, exclusions.begin())); if (!set_found || exclusions.size() < outstanding_count) { outstanding_chars = exclusions; best_term_char_word_set = &*iter; outstanding_count = exclusions.size(); set_found = true; } } } WordIDSet word_id_set; if (set_found && outstanding_count == 0) { // If there were no oustanding characters then we can use the cached one. best_term_char_word_set->used_ = true; word_id_set = best_term_char_word_set->word_id_set_; } else { // Some or all of the characters remain to be indexed. bool first_character = true; if (set_found) { first_character = false; word_id_set = best_term_char_word_set->word_id_set_; } else { outstanding_chars = uni_chars; } for (Char16Set::iterator uni_char_iter = outstanding_chars.begin(); uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { Char16Set::value_type uni_char = *uni_char_iter; CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); if (char_iter == char_word_map_.end()) { // The character was not found so bail. word_id_set.clear(); break; } WordIDSet& char_word_id_set(char_iter->second); if (first_character) { word_id_set = char_word_id_set; first_character = false; } else { WordIDSet old_word_id_set(word_id_set); word_id_set.clear(); std::set_intersection(old_word_id_set.begin(), old_word_id_set.end(), char_word_id_set.begin(), char_word_id_set.end(), std::inserter(word_id_set, word_id_set.begin())); } } // Update the cache. if (!set_found || outstanding_count) { term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars, word_id_set, true)); } } return word_id_set; } // static int InMemoryURLIndex::RawScoreForURL(const URLRow& row, const String16Vector& terms, size_t* first_term_location) { GURL gurl = row.url(); if (!gurl.is_valid()) return 0; string16 url = UTF8ToUTF16(gurl.spec()); // Collect all term start positions so we can see if they appear in order. std::vector term_locations; int out_of_order = 0; // Count the terms which are out of order. size_t start_location_total = 0; size_t term_length_total = 0; for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); ++iter) { string16 term = *iter; size_t term_location = url.find(term); if (term_location == string16::npos) return 0; // A term was not found. This should not happen. if (iter != terms.begin()) { // See if this term is out-of-order. for (std::vector::const_iterator order_iter = term_locations.begin(); order_iter != term_locations.end(); ++order_iter) { if (term_location <= *order_iter) ++out_of_order; } } else { *first_term_location = term_location; } term_locations.push_back(term_location); start_location_total += term_location; term_length_total += term.size(); } // Calculate a raw score. // TODO(mrossetti): This is good enough for now but must be fine-tuned. const float kOrderMaxValue = 10.0; float order_value = 10.0; if (terms.size() > 1) { int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; order_value = (static_cast(max_possible_out_of_order - out_of_order) / max_possible_out_of_order) * kOrderMaxValue; } const float kStartMaxValue = 10.0; const size_t kMaxSignificantStart = 20; float start_value = (static_cast(kMaxSignificantStart - std::min(kMaxSignificantStart, start_location_total / terms.size()))) / static_cast(kMaxSignificantStart) * kStartMaxValue; const float kCompleteMaxValue = 10.0; float complete_value = (static_cast(term_length_total) / static_cast(url.size())) * kStartMaxValue; const float kLastVisitMaxValue = 10.0; const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); int64 delta_time = (kMaxSignificantDay - std::min((base::Time::Now() - row.last_visit()), kMaxSignificantDay)).ToInternalValue(); float last_visit_value = (static_cast(delta_time) / static_cast(kMaxSignificantDay.ToInternalValue())) * kLastVisitMaxValue; const float kVisitCountMaxValue = 10.0; const int kMaxSignificantVisits = 10; float visit_count_value = (static_cast(std::min(row.visit_count(), kMaxSignificantVisits))) / static_cast(kMaxSignificantVisits) * kVisitCountMaxValue; const float kTypedCountMaxValue = 20.0; const int kMaxSignificantTyped = 10; float typed_count_value = (static_cast(std::min(row.typed_count(), kMaxSignificantTyped))) / static_cast(kMaxSignificantTyped) * kTypedCountMaxValue; float raw_score = order_value + start_value + complete_value + last_visit_value + visit_count_value + typed_count_value; // Normalize the score. const float kMaxNormalizedRawScore = 1000.0; raw_score = (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + kLastVisitMaxValue + kVisitCountMaxValue + kTypedCountMaxValue)) * kMaxNormalizedRawScore; return static_cast(raw_score); } // static Time InMemoryURLIndex::RecentThreshold() { return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); } InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( const InMemoryURLIndex& index, const String16Vector& lower_terms) : index_(index), lower_terms_(lower_terms) { } InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} void InMemoryURLIndex::AddHistoryMatch::operator()( const InMemoryURLIndex::HistoryID history_id) { HistoryInfoMap::const_iterator hist_pos = index_.history_info_map_.find(history_id); if (hist_pos != index_.history_info_map_.end()) { const URLRow& hist_item = hist_pos->second; // TODO(mrossetti): Accommodate multiple term highlighting. size_t input_location = 0; int score = InMemoryURLIndex::RawScoreForURL( hist_item, lower_terms_, &input_location); if (score != 0) { // We only retain the top 10 highest scoring results so // see if this one fits into the top 10 and, if so, where. ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); while (scored_iter != scored_matches_.end() && (*scored_iter).raw_score > score) ++scored_iter; if ((scored_matches_.size() < 10) || (scored_iter != scored_matches_.end())) { // Create and insert the new item. // TODO(mrossetti): Properly set |match_in_scheme| and // |innermost_match|. bool match_in_scheme = false; bool innermost_match = true; ScoredHistoryMatch match(hist_item, input_location, match_in_scheme, innermost_match, score); if (!scored_matches_.empty()) scored_matches_.insert(scored_iter, match); else scored_matches_.push_back(match); // Trim any entries beyond 10. if (scored_matches_.size() > 10) { scored_matches_.erase(scored_matches_.begin() + 10, scored_matches_.end()); } } } } } } // namespace history