diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-30 19:01:22 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-30 19:01:22 +0000 |
commit | a88fdf531a6a3f34e22d3d8443516b3970e4362f (patch) | |
tree | ddf2df86e958505d27f739a2cd67211fcedf2f23 /chrome | |
parent | a38b3b5ed715dcf37adabaf8632baf26efdce8a0 (diff) | |
download | chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.zip chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.tar.gz chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.tar.bz2 |
Add initial url/title history indexing and search provider support -- not hooked up at this point.
BUG=None
TEST=None
Review URL: http://codereview.chromium.org/2866025
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@51284 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- | chrome/browser/history/in_memory_url_index.cc | 637 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.h | 240 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index_unittest.cc | 5 | ||||
-rw-r--r-- | chrome/chrome_browser.gypi | 2 | ||||
-rw-r--r-- | chrome/chrome_tests.gypi | 1 |
5 files changed, 885 insertions, 0 deletions
diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc new file mode 100644 index 0000000..8c016a3 --- /dev/null +++ b/chrome/browser/history/in_memory_url_index.cc @@ -0,0 +1,637 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/browser/history/in_memory_url_index.h" + +#include <algorithm> +#include <limits> +#include <sstream> + +#include "app/sql/statement.h" +#include "base/string_util.h" +#include "third_party/icu/public/common/unicode/brkiter.h" +#include "third_party/icu/public/common/unicode/utypes.h" + +using std::string; +using icu::Locale; +using icu::BreakIterator; +using icu::RegexMatcher; +using icu::RuleBasedBreakIterator; +using icu::UnicodeString; +using icu::StringCharacterIterator; + +namespace history { + +// Define the following to '1' if the parameters to the URL should not be +// indexed. +#define TRIM_PARAMETERS 1 + +// Define the following to specify that only the most recent X history +// entries are to be considered. +#define LIMIT_HISTORY_TO 10000 + +// Indexing + +bool InMemoryURLIndex::IndexURLHistory(const FilePath& history_path) { + bool success = false; + // Reset our indexes. + char_word_map_.clear(); + word_id_history_map_.clear(); + if (history_database_.Open(history_path)) { + if (!history_database_.DoesTableExist("urls")) { + // TODO(mrossetti): Consider filtering only the most recent and the most + // often referenced history items. Also consider usage count when scoring. + std::ostringstream query_stream; + query_stream << "SELECT id, url, title FROM urls"; +#if LIMIT_HISTORY_TO + query_stream << " ORDER BY last_visit_time DESC LIMIT " << + LIMIT_HISTORY_TO; +#endif // LIMIT_HISTORY_TO + query_stream << ";"; + string query = query_stream.str(); + sql::Statement s(history_database_.GetUniqueStatement(query.c_str())); + if (!s) { + NOTREACHED() << "Statement prepare failed"; + return false; + } + success = true; + while (s.Step() && success) { + // Verify we got the expected columns. + // We expect: 1) id, 2) url, 3) title. + if (s.ColumnCount() == 3) { + int64 row_id(s.ColumnInt64(0)); + string url(s.ColumnString(1)); + string title(s.ColumnString(2)); + if (!IndexRow(row_id, url, title)) + success = false; + } else { + success = false; + } + } + } + } + return success; +} + +// NOTE: Not thread-safe because RuleBasedBreakIterator::getRuleStatus isn't. +bool InMemoryURLIndex::IndexRow(int64 row_id, string url, string title) { + UnicodeString uni_string = UnicodeString::fromUTF8(url.c_str()); + +#ifdef TRIM_PARAMETERS + if (!TrimParameters(uni_string)) + return false; +#endif + if (!ConvertPercentEncoding(uni_string)) + return false; + + // TODO(mrossetti): Detect test_id > std::numeric_limits<HistoryID>::max(). + HistoryID history_id = static_cast<HistoryID>(row_id); + + // Add the row for quick lookup in the history info store. + // NOTE: Add visit count and time to history item result for later scoring. + transform(url.begin(), url.end(), url.begin(), tolower); + string lower_title(title); + transform(lower_title.begin(), lower_title.end(), lower_title.begin(), + tolower); + string time_string(""); + URLHistoryItem hist_item(history_id, url, lower_title, 0, time_string); + history_info_map_.insert(std::make_pair(history_id, hist_item)); + + // Catenate URL and title then split into individual, unique words. + if (title[0]) { + uni_string.append(" "); + uni_string.append(UnicodeString::fromUTF8(title.c_str())); + } + UnicodeStringSet words = WordsFromUnicodeString(uni_string); + + // For each word, add a new entry into the word index referring to the + // associated history item. + for (UnicodeStringSet::iterator iter = words.begin(); + iter != words.end(); ++iter) { + UnicodeStringSet::value_type uni_word = *iter; + AddWordToIndex(uni_word, history_id); + } + ++history_item_count_; + return true; +} + +void InMemoryURLIndex::Reset() { + history_database_.Close(); + word_list_.clear(); + word_map_.clear(); + char_word_map_.clear(); + word_id_history_map_.clear(); + history_info_map_.clear(); + history_item_count_ = 0; +} + +// Searching + +URLItemVector InMemoryURLIndex::HistoryItemsForTerms( + SearchTermsVector const& terms) { + URLItemVector item_vector; + if (terms.size()) { + // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- + // and-sweep approach. + ResetTermCharWordSetCache(); + + // Lowercase the terms. + // TODO(mrossetti): Another opportunity for a transform algorithm. + SearchTermsVector lower_terms; + for (SearchTermsVector::const_iterator term_iter = terms.begin(); + term_iter != terms.end(); ++term_iter) { + SearchTermsVector::value_type lower_string(*term_iter); + transform(lower_string.begin(), lower_string.end(), lower_string.begin(), + tolower); + lower_terms.push_back(lower_string); + } + + // Composite a string by joining all terms, separated by a space. + // TODO(mrossetti): Use a transform algorithm, eh. + SearchTermsVector::value_type all_terms = lower_terms[0]; + for (SearchTermsVector::iterator term_iter = ++lower_terms.begin(); + term_iter != lower_terms.end(); ++term_iter) { + all_terms += " " + *term_iter; + } + + UnicodeString uni_string = UnicodeString::fromUTF8(all_terms.c_str()); + HistoryIDSet history_id_set = HistoryIDSetFromWords(uni_string); + + if (history_id_set.size()) { + // For each item in the final candidate list, perform a substring match + // for each original individual term and weed out non-matches. + for (HistoryIDSet::iterator id_iter = history_id_set.begin(); + id_iter != history_id_set.end(); ++id_iter) { + HistoryIDSet::value_type history_id = *id_iter; + HistoryInfoMap::iterator hist_pos = + history_info_map_.find(history_id); + if (hist_pos != history_info_map_.end()) { + URLHistoryItem& hist_item(hist_pos->second); + if (!hist_item.TermIsMissing(&lower_terms)) + item_vector.push_back(hist_item); + } else { + // Handle unexpected missing history item here. + } + } + + // Score. + // TODO(mrossetti): Implement scoring after looking at current impl. + } + } + + // Remove any stale TermCharWordSet's. + TermCharWordSetVector new_vector; + for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); + iter != term_char_word_set_cache_.end(); ++iter) { + if (iter->used_) + new_vector.push_back(*iter); + } + term_char_word_set_cache_ = new_vector; + + return item_vector; +} + +HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(UnicodeString uni_word) { + HistoryIDSet history_id_set; + + // For each character in the word, get the char/word_id map entry and + // intersect with the set in an incremental manner. + UCharSet uni_chars = CharactersFromUnicodeString(uni_word); + WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); + + // If any words resulted then we can compose a set of history IDs by unioning + // the sets from each word. + if (word_id_set.size()) { + for (WordIDSet::iterator word_id_iter = word_id_set.begin(); + word_id_iter != word_id_set.end(); ++word_id_iter) { + WordID word_id = *word_id_iter; + WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); + if (word_iter != word_id_history_map_.end()) { + HistoryIDSet& word_history_id_set(word_iter->second); + history_id_set.insert(word_history_id_set.begin(), + word_history_id_set.end()); + } + } + } + + return history_id_set; +} + +void InMemoryURLIndex::ResetTermCharWordSetCache() { + // TODO(mrossetti): Consider keeping more of the cache around for possible + // repeat searches, except a 'shortcuts' approach might be better for that. + // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. + for (TermCharWordSetVector::iterator iter = + term_char_word_set_cache_.begin(); + iter != term_char_word_set_cache_.end(); ++iter) + iter->used_ = false; +} + +HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(icu::UnicodeString + uni_string) { + // Break the terms down into individual terms (words), get the candidate + // set for each term, and intersect each to get a final candidate list. + HistoryIDSet history_id_set; + UnicodeStringSet words = WordsFromUnicodeString(uni_string); + bool first_word = true; + for (UnicodeStringSet::iterator iter = words.begin(); + iter != words.end(); ++iter) { + UnicodeStringSet::value_type uni_word = *iter; + HistoryIDSet term_history_id_set = + InMemoryURLIndex::HistoryIDsForTerm(uni_word); + if (first_word) { + history_id_set = term_history_id_set; + first_word = false; + } else { + HistoryIDSet old_history_id_set(history_id_set); + history_id_set.clear(); + std::set_intersection(old_history_id_set.begin(), + old_history_id_set.end(), + term_history_id_set.begin(), + term_history_id_set.end(), + std::inserter(history_id_set, + history_id_set.begin())); + } + } + return history_id_set; +} + +// Utility Functions + +UnicodeStringSet InMemoryURLIndex::WordsFromUnicodeString( + UnicodeString uni_string) { + UnicodeStringSet words; + UErrorCode icu_status = U_ZERO_ERROR; + // TODO(mrossetti): Get locale from browser: + // icu::Locale(g_browser_process->GetApplicationLocale().c_str()) + Locale locale(Locale::getDefault()); + + uni_string.toLower(locale); // Convert to lowercase. + + // Replace all | and _'s with a space. + // TODO(mrossetti): Roll this into the word_breaker_. + if (!bar_matcher_.get()) { + bar_matcher_.reset(new RegexMatcher("[|_]+", 0, icu_status)); + if (U_FAILURE(icu_status)) { + // There was a syntax error in the regular expression. + return words; + } + } + bar_matcher_->reset(uni_string); + uni_string = bar_matcher_->replaceAll(" ", icu_status); + if (U_FAILURE(icu_status)) { + // There was a failure during the replace. + return words; + } + + // Break up the string into individual words. + if (!word_breaker_.get()) { + word_breaker_.reset(static_cast<RuleBasedBreakIterator*> + (RuleBasedBreakIterator::createWordInstance(locale, icu_status))); + if (U_FAILURE(icu_status)) { + word_breaker_.reset(); + return words; + } + } + word_breaker_->setText(uni_string); + int32_t start = 0; + int32_t previous_boundary = 0; + while (start != BreakIterator::DONE) { + int32_t breakType = word_breaker_->getRuleStatus(); + if (breakType != UBRK_WORD_NONE) { + // TODO(mrossetti): Numbers may have non-numeric characters that + // will have to be stripped before term matching. Do this during + // term matching. + // number: UBRK_WORD_NUMBER <= breakType < UBRK_WORD_NUMBER_LIMIT + // word: UBRK_WORD_LETTER <= breakType < UBRK_WORD_LETTER_LIMIT + // kana: UBRK_WORD_KANA <= breakType < UBRK_WORD_KANA_LIMIT + // ideo: UBRK_WORD_IDEO <= breakType < UBRK_WORD_IDEO_LIMIT + int32_t current_boundary = word_breaker_->current(); + int32_t word_length = current_boundary - previous_boundary; + UnicodeString uni_word(uni_string, previous_boundary, word_length); + + // Remove non-word characters from the string. + if (!non_alnum_matcher_.get()) { + non_alnum_matcher_.reset(new RegexMatcher("\\W+", 0, icu_status)); + if (U_FAILURE(icu_status)) { + // There was a syntax errors in the regular expression. + return words; + } + } + non_alnum_matcher_->reset(uni_word); + uni_word = non_alnum_matcher_->replaceAll("", icu_status); + if (U_FAILURE(icu_status)) { + // There was a failure during the replace. + break; + } + + if (uni_word.length()) + words.insert(uni_word); + previous_boundary = current_boundary; + } + start = word_breaker_->next(); + } + return words; +} + +UCharSet InMemoryURLIndex::CharactersFromUnicodeString( + icu::UnicodeString uni_word) { + UCharSet characters; + icu::StringCharacterIterator ci(uni_word); + for (UChar uni_char = ci.first(); + uni_char != icu::CharacterIterator::DONE; uni_char = ci.next()) + characters.insert(uni_char); + return characters; +} + +bool InMemoryURLIndex::TrimParameters(icu::UnicodeString& uni_string) { + // Trim the URL at the first '?'. + UErrorCode icu_status = U_ZERO_ERROR; + if (!question_matcher_.get()) { + question_matcher_.reset(new RegexMatcher("\\?", 0, icu_status)); + if (U_FAILURE(icu_status)) { + // There was a syntax error in the regular expression. + return false; + } + } + question_matcher_->reset(uni_string); + if (question_matcher_->find()) { + int32_t start = question_matcher_->start(icu_status); + // TODO(mrossetti): Check status here. + uni_string.handleReplaceBetween(start, uni_string.length(), ""); + } + return true; +} + +bool InMemoryURLIndex::ConvertPercentEncoding(icu::UnicodeString& uni_string) { + // TODO(mrossetti): Convert, don't remove, the '%xx' encodings. + UErrorCode icu_status = U_ZERO_ERROR; + if (!percent_matcher_.get()) { + percent_matcher_.reset(new RegexMatcher("%[0-9a-fA-F]{2}", 0, icu_status)); + if (U_FAILURE(icu_status)) { + // There was a syntax error in the regular expression. + return false; + } + } + percent_matcher_->reset(uni_string); + uni_string = percent_matcher_->replaceAll(" ", icu_status); + if (U_FAILURE(icu_status)) { + // There was a failure during the replace. + return false; + } + return true; +} + +void InMemoryURLIndex::AddWordToIndex(icu::UnicodeString& uni_word, + HistoryID history_id) { + WordMap::iterator word_pos = word_map_.find(uni_word); + if (word_pos != word_map_.end()) { + // Update existing entry in the word/history index. + WordID word_id = word_pos->second; + WordIDHistoryMap::iterator history_pos = + word_id_history_map_.find(word_id); + if (history_pos != word_id_history_map_.end()) { + HistoryIDSet& history_id_set(history_pos->second); + history_id_set.insert(history_id); + } else { + // TODO(mrossetti): Handle the unexpected failure here. + } + } else { + // Add a new word to the word list and the word map, and then create a + // new entry in the word/history map. + word_list_.push_back(uni_word); + WordID word_id = word_list_.size() - 1; + if (word_map_.insert(std::make_pair(uni_word, word_id)).second) { + HistoryIDSet history_id_set; + history_id_set.insert(history_id); + if (word_id_history_map_.insert( + std::make_pair(word_id, history_id_set)).second) { + // For each character in the newly added word (i.e. a word that is not + // already in the word index), add the word to the character index. + UCharSet characters = + InMemoryURLIndex::CharactersFromUnicodeString(uni_word); + for (UCharSet::iterator uni_char_iter = characters.begin(); + uni_char_iter != characters.end(); ++uni_char_iter) { + UCharSet::value_type uni_string = *uni_char_iter; + CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); + if (char_iter != char_word_map_.end()) { + // Update existing entry in the char/word index. + WordIDSet& word_id_set(char_iter->second); + word_id_set.insert(word_id); + } else { + // Create a new entry in the char/word index. + WordIDSet word_id_set; + word_id_set.insert(word_id); + if (!char_word_map_.insert(std::make_pair(uni_string, + word_id_set)).second) { + // TODO(mrossetti): Handle the unexpected failure here. + } + } + } + } else { + // TODO(mrossetti): Handle the unexpected failure here. + } + } else { + // TODO(mrossetti): Handle the unexpected failure here. + } + } +} + +WordIDSet InMemoryURLIndex::WordIDSetForTermChars(UCharSet const& uni_chars) { + TermCharWordSet* best_term_char_word_set = NULL; + bool set_found = false; + size_t outstanding_count = 0; + UCharSet outstanding_chars; + for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); + iter != term_char_word_set_cache_.end(); ++iter) { + TermCharWordSetVector::value_type& term_char_word_set(*iter); + UCharSet& char_set(term_char_word_set.char_set_); + UCharSet exclusions; + set_difference(char_set.begin(), char_set.end(), + uni_chars.begin(), uni_chars.end(), + std::inserter(exclusions, exclusions.begin())); + if (exclusions.size() == 0) { + // Do a reverse difference so that we know which characters remain + // to be indexed. Then decide if this is a better match than any + // previous cached set. + set_difference(uni_chars.begin(), uni_chars.end(), + char_set.begin(), char_set.end(), + std::inserter(exclusions, exclusions.begin())); + if (!set_found || exclusions.size() < outstanding_count) { + outstanding_chars = exclusions; + best_term_char_word_set = &*iter; + outstanding_count = exclusions.size(); + set_found = true; + } + } + } + + WordIDSet word_id_set; + if (set_found && outstanding_count == 0) { + // If there were no oustanding characters then we can use the cached one. + best_term_char_word_set->used_ = true; + word_id_set = best_term_char_word_set->word_id_set_; + } else { + // Some or all of the characters remain to be indexed. + bool first_character = true; + if (set_found) { + first_character = false; + word_id_set = best_term_char_word_set->word_id_set_; + } else { + outstanding_chars = uni_chars; + } + + for (UCharSet::iterator uni_char_iter = outstanding_chars.begin(); + uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { + UCharSet::value_type uni_char = *uni_char_iter; + CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); + if (char_iter != char_word_map_.end()) { + WordIDSet& char_word_id_set(char_iter->second); + if (first_character) { + word_id_set = char_word_id_set; + first_character = false; + } else { + WordIDSet old_word_id_set(word_id_set); + word_id_set.clear(); + std::set_intersection(old_word_id_set.begin(), + old_word_id_set.end(), + char_word_id_set.begin(), + char_word_id_set.end(), + std::inserter(word_id_set, + word_id_set.begin())); + } + } else { + // The character was not found so bail. + word_id_set.clear(); + break; + } + } + // Update the cache. + if (!set_found || outstanding_count) { + TermCharWordSet new_char_word_set(uni_chars, word_id_set, true); + term_char_word_set_cache_.push_back(new_char_word_set); + } + } + return word_id_set; +} + +// Statistics + +#if HISTORY_INDEX_STATISTICS_ENABLED + +uint64 InMemoryURLIndex::IndexedHistoryCount() { + return history_item_count_; +} + +uint64 InMemoryURLIndex::IndexedWordCount() { + return word_list_.size(); +} + +// Return a count of the characters which have been indexed. +uint64 InMemoryURLIndex::IndexedCharacterCount() { + return char_word_map_.size(); +} + +UnicodeStringVector InMemoryURLIndex::IndexedWords() const { + UnicodeStringVector words(word_list_); + return words; // Return a copy. +} + +uint64 InMemoryURLIndex::NumberOfHistoryItemsForWord(UnicodeString uni_word) { + uint64 history_count = 0; + WordMap::iterator word_pos = word_map_.find(uni_word); + if (word_pos != word_map_.end()) { + WordID word_id = word_pos->second; + WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); + if (history_pos != word_id_history_map_.end()) + history_count = history_pos->second.size(); + } + return history_count; +} + +UnicodeStringVector InMemoryURLIndex::IndexedCharacters() { + UnicodeStringVector characters; + for (CharWordIDMap::iterator iter = char_word_map_.begin(); + iter != char_word_map_.end(); ++iter) + characters.push_back(iter->first); + return characters; +} + +uint64 InMemoryURLIndex::NumberOfWordsForChar(UChar uni_char) { + uint64 word_count = 0; + CharWordIDMap::iterator char_pos = char_word_map_.find(uni_char); + if (char_pos != char_word_map_.end()) + word_count = char_pos->second.size(); + return word_count; +} + +uint64 InMemoryURLIndex::WordListSize() { + uint64 listSize = 0; + // TODO(mrossetti): Use an algorithm. + for (UnicodeStringVector::iterator iter = word_list_.begin(); + iter != word_list_.end(); ++iter) { + listSize += iter->length(); // vector overhead is ignored. + } + return listSize; +} + +uint64 InMemoryURLIndex::WordMapSize() { + uint64 mapSize = 0; + for (WordMap::iterator iter = word_map_.begin(); iter != word_map_.end(); + ++iter) { + mapSize += sizeof(WordID) + iter->first.length(); + } + return mapSize; +} + +uint64 InMemoryURLIndex::WordHistoryMapSize() { + uint64 mapSize = 0; + for (WordIDHistoryMap::iterator iter = word_id_history_map_.begin(); + iter != word_id_history_map_.end(); ++iter) { + mapSize += sizeof(WordID) + (iter->second.size() * sizeof(HistoryID)); + } + return mapSize; +} + +uint64 InMemoryURLIndex::CharWordMapSize() { + uint64 mapSize = 0; + for (CharWordIDMap::iterator iter = char_word_map_.begin(); + iter != char_word_map_.end(); ++iter) { + mapSize += sizeof(UChar) + (iter->second.size() * sizeof(WordID)); + } + return mapSize; +} + +uint64 InMemoryURLIndex::HistoryInfoSize() { + uint64 mapSize = 0; + for (HistoryInfoMap::iterator iter = history_info_map_.begin(); + iter != history_info_map_.end(); ++iter) { + mapSize += sizeof(iter->first); + URLHistoryItem& hist_item(iter->second); + mapSize += sizeof(hist_item.history_id_); + mapSize += hist_item.url_.size(); + mapSize += hist_item.title_.size(); + mapSize += sizeof(hist_item.visit_count_); + mapSize += hist_item.last_visit_time_.size(); + } + return mapSize; +} + +#endif // HISTORY_INDEX_STATISTICS_ENABLED + +// URLHistoryItem Methods + +bool URLHistoryItem::TermIsMissing(SearchTermsVector* terms_vector) const { + for (SearchTermsVector::iterator term_iter = terms_vector->begin(); + term_iter != terms_vector->end(); ++term_iter) { + SearchTermsVector::value_type& term(*term_iter); + if (term.length() > 1 && + url_.find(term) == string::npos && + title_.find(term) == string::npos) + return true; + } + return false; +} + +} // namespace history diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h new file mode 100644 index 0000000..43b4236 --- /dev/null +++ b/chrome/browser/history/in_memory_url_index.h @@ -0,0 +1,240 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ +#define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ + +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "app/sql/connection.h" +#include "base/basictypes.h" +#include "base/scoped_ptr.h" +#include "base/string16.h" +#include "third_party/icu/public/common/unicode/rbbi.h" +#include "third_party/icu/public/i18n/unicode/regex.h" +#include "third_party/icu/public/common/unicode/unistr.h" + +namespace history { + +// Define the following if your application would like to collect information +// about the history index and memory usage. +// #define HISTORY_INDEX_STATISTICS_ENABLED 1 + +// TODO(mrossetti): Move as many as possible of these typedefs, structs, +// and support classes into the InMemoryURLIndex class. + +// UnicodeString compare functor which assists in handling UBool. +class UnicodeStringCompare { + public: + bool operator() (const icu::UnicodeString& lhs, + const icu::UnicodeString& rhs) { + return (lhs < rhs) ? true : false; + } +}; + +// Convenience types +typedef std::vector<icu::UnicodeString> UnicodeStringVector; +typedef std::set<icu::UnicodeString, UnicodeStringCompare> UnicodeStringSet; +typedef std::set<UChar> UCharSet; +typedef std::vector<std::string> SearchTermsVector; + +// An index into list of all of the words we have indexed. +typedef int16 WordID; + +// A map allowing a WordID to be determined given a word. +typedef std::map<icu::UnicodeString, WordID, UnicodeStringCompare> WordMap; + +// A map from character to word_id's. +typedef std::set<WordID> WordIDSet; // An index into the WordList. +typedef std::map<UChar, WordIDSet> CharWordIDMap; + +// A map from word_id to history item. +typedef int16 HistoryID; // uint16 might actually work for us. +typedef std::set<HistoryID> HistoryIDSet; +typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; + +// Support caching of term character intersections so that we can optimize +// searches which build upon a previous search. +struct TermCharWordSet { + TermCharWordSet(UCharSet char_set, WordIDSet word_id_set, bool used) + : char_set_(char_set), word_id_set_(word_id_set), used_(used) {} + + UCharSet char_set_; + WordIDSet word_id_set_; + bool used_; // true if this set has been used for the current term search. +}; +typedef std::vector<TermCharWordSet> TermCharWordSetVector; + +// Describes an entry in the urls table of the History database. +struct URLHistoryItem { + URLHistoryItem(HistoryID history_id, + std::string& url, + std::string& title, + int visit_count, + std::string& last_visit_time) + : history_id_(history_id), url_(url), title_(title), + visit_count_(visit_count), last_visit_time_(last_visit_time) {} + + // Return true if any of the search terms do not match a substring. + // TODO(mrossetti): Also need to score and add UI hints. + bool TermIsMissing(SearchTermsVector* terms_vector) const; + + HistoryID history_id_; + std::string url_; + std::string title_; + int visit_count_; + std::string last_visit_time_; + // TODO(mrossetti): Make a class. Add fields for score and to provide + // term matching metrics so a UI can highlight the exact match. +}; +typedef std::vector<URLHistoryItem> URLItemVector; + +// TODO(mrossetti): Rip out either the SQLite filtering or the in-memory +// HistoryInfo structure. +// A map from history_id to the history's URL and title. +typedef std::map<HistoryID, URLHistoryItem> HistoryInfoMap; + +// The history source. +class InMemoryURLIndex { + public: + InMemoryURLIndex() + : history_item_count_(0), question_matcher_(NULL), + percent_matcher_(NULL), non_alnum_matcher_(NULL), bar_matcher_(NULL), + word_breaker_(NULL) {} + ~InMemoryURLIndex() {} + + // Open and index the URL history database. + bool IndexURLHistory(const FilePath& history_path); + + // Reset the history index. + void Reset(); + + // Given a vector containing one or more words as utf8 strings, scan the + // history index and return a vector with all scored, matching history items. + // Each term must occur somewhere in the history item for the item to + // qualify, however, the terms do not necessarily have to be adjacent. + URLItemVector HistoryItemsForTerms(SearchTermsVector const& terms); + +#if HISTORY_INDEX_STATISTICS_ENABLED + + // Return a count of the history items which have been indexed. + uint64 IndexedHistoryCount(); + + // Return a count of the words which have been indexed. + uint64 IndexedWordCount(); + + // Return a count of the characters which have been indexed. + uint64 IndexedCharacterCount(); + + // Return a vector containing all of the words which have been indexed + // as UTF8 strings. + UnicodeStringVector IndexedWords() const; + + // Return the count of history items for the given word. + uint64 NumberOfHistoryItemsForWord(icu::UnicodeString uni_word); + + // Return a vector containing all of the characters which have been indexed + // as UTF8 strings. + UnicodeStringVector IndexedCharacters(); + + // Return the count of words for the given character. + uint64 NumberOfWordsForChar(UChar uni_char); + + // Return the total size of the word list vector. + uint64 WordListSize(); + + // Return the total size of the word/word-id map. + uint64 WordMapSize(); + + // Return the total size of the word_id_history_map_. + uint64 WordHistoryMapSize(); + + // Return the total size of the char_word_map_. + uint64 CharWordMapSize(); + + // Return the total size of the char_word_map_. + uint64 HistoryInfoSize(); + +#endif // HISTORY_INDEX_STATISTICS_ENABLED + + private: + // Break an UnicodeString down into individual words. + UnicodeStringSet WordsFromUnicodeString(icu::UnicodeString uni_string); + + // Given a set of UChars, find words containing those characters. If any + // existing, cached set is a proper subset then start with that cached + // set. Update the cache. + WordIDSet WordIDSetForTermChars(UCharSet const& uni_chars); + + // URL History indexing support functions. + + // Index one URL history item. + bool IndexRow(int64 row_id, std::string url, std::string title); + + // Break an UnicodeString down into its individual characters. + // Note that this is temporarily intended to work on a single word, but + // _will_ work on a string of words, perhaps with unexpected results. + // TODO(mrossetti): Lots of optimizations possible here for not restarting + // a search if the user is just typing along. Also, change this to uniString + // and properly handle substring matches, scoring and sorting the results + // by score. Also, provide the metrics for where the matches occur so that + // the UI can highlight the matched sections. + UCharSet CharactersFromUnicodeString(icu::UnicodeString uni_word); + + // Trim the parameters from the URL being indexed. Return true if there + // were not errors encountered during the operation. + bool TrimParameters(icu::UnicodeString& uni_string); + + // Convert %xx encodings in the URL being indexed. Return true if there + // were not errors encountered during the operation. + bool ConvertPercentEncoding(icu::UnicodeString& uni_string); + + // Given a single word, add a reference to the containing history item + // to the index. + void AddWordToIndex(icu::UnicodeString& uni_word, HistoryID history_id); + + // URL History index search support functions. + + // Given a single search term (i.e. one word, utf8) return a set of + // history IDs which map to that term but do not necessarily match + // as a substring. + HistoryIDSet HistoryIDsForTerm(icu::UnicodeString uni_word); + + // Clear the search term cache. This cache holds on to the intermediate + // word results for each previously typed character to eliminate the need + // to re-perform set operations for previously typed characters. + void ResetTermCharWordSetCache(); + + // Compose a set of history item IDs by intersecting the set for each word + // in |uni_string|. + HistoryIDSet HistoryIDSetFromWords(icu::UnicodeString uni_string); + + private: + UnicodeStringVector word_list_; // A list of all of indexed words. + // NOTE: A word will _never_ be removed from + // this vector to insure that the word_id + // remains immutable throughout the session, + // reducing maintenance complexity. + sql::Connection history_database_; + uint64 history_item_count_; + WordMap word_map_; + CharWordIDMap char_word_map_; + WordIDHistoryMap word_id_history_map_; + TermCharWordSetVector term_char_word_set_cache_; + HistoryInfoMap history_info_map_; + + // Cached items. + scoped_ptr<icu::RegexMatcher> question_matcher_; + scoped_ptr<icu::RegexMatcher> percent_matcher_; + scoped_ptr<icu::RegexMatcher> non_alnum_matcher_; + scoped_ptr<icu::RegexMatcher> bar_matcher_; + scoped_ptr<icu::RuleBasedBreakIterator> word_breaker_; +}; + +} // namespace history + +#endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc new file mode 100644 index 0000000..8dc9569 --- /dev/null +++ b/chrome/browser/history/in_memory_url_index_unittest.cc @@ -0,0 +1,5 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// TODO(mrossetti): Implement the unit tests diff --git a/chrome/chrome_browser.gypi b/chrome/chrome_browser.gypi index 4292a17..6415453 100644 --- a/chrome/chrome_browser.gypi +++ b/chrome/chrome_browser.gypi @@ -1572,6 +1572,8 @@ 'browser/history/in_memory_database.h', 'browser/history/in_memory_history_backend.cc', 'browser/history/in_memory_history_backend.h', + 'browser/history/in_memory_url_index.cc', + 'browser/history/in_memory_url_index.h', 'browser/history/page_usage_data.cc', 'browser/history/page_usage_data.h', 'browser/history/query_parser.cc', diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi index 69cef8d..0a55ebd 100644 --- a/chrome/chrome_tests.gypi +++ b/chrome/chrome_tests.gypi @@ -843,6 +843,7 @@ 'browser/history/history_querying_unittest.cc', 'browser/history/history_types_unittest.cc', 'browser/history/history_unittest.cc', + 'browser/history/in_memory_url_index_unittest.cc', 'browser/history/query_parser_unittest.cc', 'browser/history/snippet_unittest.cc', 'browser/history/starred_url_database_unittest.cc', |