summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-06-25 17:40:13 +0000
committermrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-06-25 17:40:13 +0000
commit0b26c592f978b858fc6a9accf1e7d689df4cfd97 (patch)
treef22daf8d88ab563155cd568097c164e00c7869a8
parent431efb8ec6ad909c3c99b2edd7c5b8e2a0f2040a (diff)
downloadchromium_src-0b26c592f978b858fc6a9accf1e7d689df4cfd97.zip
chromium_src-0b26c592f978b858fc6a9accf1e7d689df4cfd97.tar.gz
chromium_src-0b26c592f978b858fc6a9accf1e7d689df4cfd97.tar.bz2
Add initial url/title history indexing and search provider support -- not hooked up at this point.
BUG=None TEST=None Review URL: http://codereview.chromium.org/2818029 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@50863 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--chrome/browser/history/history_index.cc698
-rw-r--r--chrome/browser/history/history_index.h211
-rw-r--r--chrome/browser/history/history_index_unittest.cc5
-rw-r--r--chrome/chrome_browser.gypi2
-rw-r--r--chrome/chrome_tests.gypi1
5 files changed, 917 insertions, 0 deletions
diff --git a/chrome/browser/history/history_index.cc b/chrome/browser/history/history_index.cc
new file mode 100644
index 0000000..aa7a72d
--- /dev/null
+++ b/chrome/browser/history/history_index.cc
@@ -0,0 +1,698 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/history/history_index.h"
+
+#include <algorithm>
+#include <sstream>
+
+#include "third_party/icu/public/common/unicode/brkiter.h"
+#include "third_party/icu/public/common/unicode/utypes.h"
+
+using std::string;
+using icu::Locale;
+using icu::BreakIterator;
+using icu::RegexMatcher;
+using icu::RuleBasedBreakIterator;
+using icu::UnicodeString;
+
+// Define the following to '1' if the parameters to the URL should not be
+// indexed.
+#define TRIM_PARAMETERS 1
+
+// Define the following to specify that only the most recent X history
+// entries are to be considered.
+#define LIMIT_HISTORY_TO 10000
+
+
+HistoryIndex::~HistoryIndex() {
+ if (history_database_)
+ sqlite3_close(history_database_);
+ delete question_matcher_;
+ delete percent_matcher_;
+ delete non_alnum_matcher_;
+ delete bar_matcher_;
+ delete word_breaker_;
+}
+
+// Indexing
+
+bool HistoryIndex::IndexURLHistory(std::string history_path) {
+ bool success = false;
+ // Reset our indexes.
+ char_word_map_.clear();
+ word_id_history_map_.clear();
+ if (history_database_)
+ sqlite3_close(history_database_);
+
+ int result = sqlite3_open(history_path.c_str(), &history_database_);
+ if (result == SQLITE_OK) {
+ // TODO(mrossetti): Consider filtering only the most recent and the most
+ // often referenced history items. Also consider usage count when scoring.
+ std::ostringstream query_stream;
+ query_stream << "SELECT id, url, title FROM urls";
+#if LIMIT_HISTORY_TO
+ query_stream << " ORDER BY last_visit_time DESC LIMIT " << LIMIT_HISTORY_TO;
+#endif // LIMIT_HISTORY_TO
+ query_stream << ";";
+ result = sqlite3_exec(history_database_,
+ query_stream.str().c_str(),
+ &URLHistoryQueryHandler,
+ reinterpret_cast<void*>(this),
+ NULL);
+ if (result == SQLITE_OK) {
+ success = true;
+ }
+ }
+ return success;
+}
+
+// NOTE: Not thread-safe because RuleBasedBreakIterator::getRuleStatus isn't.
+bool HistoryIndex::IndexRow(const char* id, const char* url,
+ const char* title) {
+ UnicodeString uni_string = UnicodeString::fromUTF8(url);
+ UErrorCode icu_status = U_ZERO_ERROR;
+
+#ifdef TRIM_PARAMETERS
+ // Trim the URL at the first '?'.
+ if (!question_matcher_) {
+ question_matcher_ = new RegexMatcher("\\?", 0, icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a syntax error in the regular expression.
+ return false;
+ }
+ }
+ question_matcher_->reset(uni_string);
+ if (question_matcher_->find()) {
+ int32_t start = question_matcher_->start(icu_status);
+ // TODO(mrossetti): Check status here.
+ uni_string.handleReplaceBetween(start, uni_string.length(), "");
+ }
+#endif
+
+ // Eliminate any '%xx' encodings in the URL.
+ // TODO(mrossetti): We should probably convert the '%xx' encodings.
+ if (!percent_matcher_) {
+ percent_matcher_ = new RegexMatcher("%[0-9a-fA-F]{2}", 0, icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a syntax error in the regular expression.
+ return false;
+ }
+ }
+ percent_matcher_->reset(uni_string);
+ uni_string = percent_matcher_->replaceAll(" ", icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a failure during the replace.
+ return false;
+ }
+
+ // Determine the row ID in the database.
+ sqlite_int64 test_id = strtol(id, NULL, 10);
+ if (test_id < 0 || test_id > UINT16_MAX) {
+ // TODO(mrossetti): Handle error condition here.
+ }
+ HistoryID history_id = labs(test_id);
+
+ // Add the row for quick lookup in the history info store.
+ // NOTE: Update the following when visit count and time becomes important.
+ string lower_url(url);
+ string lower_title(title);
+ string time_string("");
+ // Lowercase URL and Title.
+ transform(lower_url.begin(), lower_url.end(), lower_url.begin(),
+ tolower);
+ transform(lower_title.begin(), lower_title.end(), lower_title.begin(),
+ tolower);
+ HistoryItem hist_item(history_id, lower_url, lower_title, 0, time_string);
+ history_info_map_.insert(std::make_pair(history_id, hist_item));
+
+ // Catenate the URL and the title in preparation for coming up with words.
+ if (title[0]) {
+ uni_string.append(" ");
+ uni_string.append(UnicodeString::fromUTF8(title));
+ }
+
+ // Break down the URL and title into individual, unique words.
+ UnicodeStringSet words = WordsFromUnicodeString(uni_string);
+
+ // For each word, add a new entry into the word index referring to the
+ // history item.
+ for (UnicodeStringSet::iterator iter = words.begin();
+ iter != words.end(); ++iter) {
+ UnicodeString uni_word = *iter;
+ WordMap::iterator word_pos = word_map_.find(uni_word);
+ if (word_pos != word_map_.end()) {
+ // Update existing entry in the word/history index.
+ WordID word_id = word_pos->second;
+ WordIDHistoryMap::iterator history_pos =
+ word_id_history_map_.find(word_id);
+ if (history_pos != word_id_history_map_.end()) {
+ HistoryIDSet& history_id_set(history_pos->second);
+ history_id_set.insert(history_id);
+ } else {
+ // TODO(mrossetti): Handle the unexpected failure here.
+ }
+ } else {
+ // Add a new word to the word list and the word map, and then create a
+ // new entry in the word/history map.
+ word_list_.push_back(uni_word);
+ WordID word_id = word_list_.size() - 1;
+ if (word_map_.insert(std::make_pair(uni_word, word_id)).second) {
+ HistoryIDSet history_id_set;
+ history_id_set.insert(history_id);
+ if (word_id_history_map_.insert(
+ std::make_pair(word_id, history_id_set)).second) {
+ // For each character in the newly added word (i.e. a word that is not
+ // already in the word index), add the word to the character index.
+ UCharSet characters =
+ HistoryIndex::CharactersFromUnicodeString(uni_word);
+ for (UCharSet::iterator uni_char_iter = characters.begin();
+ uni_char_iter != characters.end(); ++uni_char_iter) {
+ UChar uni_string = *uni_char_iter;
+ CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string);
+ if (char_iter != char_word_map_.end()) {
+ // Update existing entry in the char/word index.
+ WordIDSet& word_id_set(char_iter->second);
+ word_id_set.insert(word_id);
+ } else {
+ // Create a new entry in the char/word index.
+ WordIDSet word_id_set;
+ word_id_set.insert(word_id);
+ if (!char_word_map_.insert(std::make_pair(uni_string,
+ word_id_set)).second) {
+ // TODO(mrossetti): Handle the unexpected failure here.
+ }
+ }
+ }
+ } else {
+ // TODO(mrossetti): Handle the unexpected failure here.
+ }
+ } else {
+ // TODO(mrossetti): Handle the unexpected failure here.
+ }
+ }
+ }
+ ++history_item_count_;
+ return true;
+}
+
+void HistoryIndex::Reset() {
+ if (history_database_) {
+ sqlite3_close(history_database_);
+ history_database_ = NULL;
+ }
+ word_list_.clear();
+ word_map_.clear();
+ char_word_map_.clear();
+ word_id_history_map_.clear();
+ history_info_map_.clear();
+ history_item_count_ = 0;
+}
+
+int HistoryIndex::URLHistoryQueryHandler(void* context, int columns,
+ char** column_strings,
+ char** column_names) {
+ int success = SQLITE_ABORT;
+ // Verify we got the expected columns.
+ // We expect: 1) id, 2) url, 3) title.
+ if (columns == 3) {
+ HistoryIndex* delegate(static_cast<HistoryIndex*>(context));
+ if (delegate->IndexRow(column_strings[0], column_strings[1],
+ column_strings[2]))
+ success = SQLITE_OK;
+ }
+ return success;
+}
+
+// Searching
+
+HistoryItemVector HistoryIndex::HistoryItemsForTerms(
+ SearchTermsVector const& terms) {
+ HistoryItemVector item_vector;
+ if (history_database_ && terms.size()) {
+ // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
+ // and-sweep approach.
+ // TODO(mrossetti): Consider keeping more of the cache around for possible
+ // repeat searches, except a 'shortcuts' approach might be better for that.
+ // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
+ for (TermCharWordSetVector::iterator iter =
+ term_char_word_set_cache_.begin();
+ iter != term_char_word_set_cache_.end(); ++iter)
+ iter->used_ = false;
+
+ // Lowercase the terms.
+ SearchTermsVector lower_terms;
+ for (SearchTermsVector::const_iterator term_iter = terms.begin();
+ term_iter != terms.end(); ++term_iter) {
+ string lower_string(*term_iter);
+ transform(lower_string.begin(), lower_string.end(), lower_string.begin(),
+ tolower);
+ lower_terms.push_back(lower_string);
+ }
+
+ // Composite a string by joining all terms, separated by a space.
+ // TODO(mrossetti): Use a transform algorithm, eh.
+ string all_terms = lower_terms[0];
+ for (SearchTermsVector::iterator term_iter = ++lower_terms.begin();
+ term_iter != lower_terms.end(); ++term_iter) {
+ all_terms += " " + *term_iter;
+ }
+
+ UnicodeString uni_string = UnicodeString::fromUTF8(all_terms.c_str());
+
+ // Break the terms down into individual terms (words), get the candidate
+ // set for each term, and intersect each to get a final candidate list.
+ HistoryIDSet history_id_set;
+ UnicodeStringSet words = WordsFromUnicodeString(uni_string);
+ bool first_word = true;
+ for (UnicodeStringSet::iterator iter = words.begin();
+ iter != words.end(); ++iter) {
+ UnicodeString uni_word = *iter;
+ HistoryIDSet term_history_id_set =
+ HistoryIndex::HistoryIDsForTerm(uni_word);
+ if (first_word) {
+ history_id_set = term_history_id_set;
+ first_word = false;
+ } else {
+ HistoryIDSet old_history_id_set(history_id_set);
+ history_id_set.clear();
+ std::set_intersection(old_history_id_set.begin(),
+ old_history_id_set.end(),
+ term_history_id_set.begin(),
+ term_history_id_set.end(),
+ std::inserter(history_id_set,
+ history_id_set.begin()));
+ }
+ }
+
+ if (history_id_set.size()) {
+ // For each item in the final candidate list, perform a substring match
+ // for each original individual term and weed out non-matches.
+ if (use_SQLite_filter_) {
+ std::ostringstream query_stream;
+ query_stream << "SELECT id, url, title, visit_count, last_visit_time "
+ "FROM urls WHERE id IN ( ";
+
+ // I'm sure there's an algorithm for this!
+ bool first_id = true;
+ for (HistoryIDSet::iterator id_iter = history_id_set.begin();
+ id_iter != history_id_set.end(); ++id_iter) {
+ if (first_id) {
+ query_stream << *id_iter;
+ first_id = false;
+ } else {
+ query_stream << "," << *id_iter;
+ }
+ }
+ query_stream << ");";
+
+ HistoryItemVector term_query_results;
+ int result = sqlite3_exec(history_database_,
+ query_stream.str().c_str(),
+ &URLTermQueryHandler,
+ reinterpret_cast<void*>(&term_query_results),
+ NULL);
+ if (result == SQLITE_OK) {
+ // Reduce the results by filtering out non-substring matches.
+#if 0
+ // TODO(mrossetti): Regain your C++-fu and get the following to work.
+ term_query_results.erase(remove_if(
+ term_query_results.begin(),
+ term_query_results.end(),
+ bind2nd(mem_fun_ref(&HistoryItem::TermIsMissing), &lower_terms)));
+ item_vector = term_query_results;
+#else
+ for (HistoryItemVector::iterator hist_iter =
+ term_query_results.begin();
+ hist_iter != term_query_results.end(); ++hist_iter) {
+ HistoryItem& hist_item(*hist_iter);
+ if (!hist_item.TermIsMissing(&lower_terms))
+ item_vector.push_back(hist_item);
+ }
+#endif
+ }
+ // TODO(mrossetti): Handle any sqlite3 errors.
+ } else {
+ for (HistoryIDSet::iterator id_iter = history_id_set.begin();
+ id_iter != history_id_set.end(); ++id_iter) {
+ HistoryID history_id = *id_iter;
+ HistoryInfoMap::iterator hist_pos =
+ history_info_map_.find(history_id);
+ if (hist_pos != history_info_map_.end()) {
+ HistoryItem& hist_item(hist_pos->second);
+ if (!hist_item.TermIsMissing(&lower_terms))
+ item_vector.push_back(hist_item);
+ } else {
+ // Handle unexpected missing history item here.
+ }
+ }
+ }
+
+ // Score.
+ // TODO(mrossetti): Implement scoring after looking at current impl.
+ }
+ }
+
+ // Remove any stale TermCharWordSet's.
+ TermCharWordSetVector new_vector;
+ for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
+ iter != term_char_word_set_cache_.end(); ++iter) {
+ if (iter->used_)
+ new_vector.push_back(*iter);
+ }
+ term_char_word_set_cache_ = new_vector;
+
+ return item_vector;
+}
+
+HistoryIDSet HistoryIndex::HistoryIDsForTerm(UnicodeString uni_word) {
+ HistoryIDSet history_id_set;
+
+ // For each character in the word, get the char/word_id map entry and
+ // intersect with the set in an incremental manner.
+ UCharSet uni_chars = CharactersFromUnicodeString(uni_word);
+ WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
+
+ // If any words resulted then we can compose a set of history IDs by unioning
+ // the sets from each word.
+ if (word_id_set.size()) {
+ for (WordIDSet::iterator word_id_iter = word_id_set.begin();
+ word_id_iter != word_id_set.end(); ++word_id_iter) {
+ WordID word_id = *word_id_iter;
+ WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
+ if (word_iter != word_id_history_map_.end()) {
+ HistoryIDSet& word_history_id_set(word_iter->second);
+ history_id_set.insert(word_history_id_set.begin(),
+ word_history_id_set.end());
+ }
+ }
+ }
+
+ return history_id_set;
+}
+
+int HistoryIndex::URLTermQueryHandler(void* context, int columns,
+ char** column_strings,
+ char** column_names) {
+ int success = SQLITE_ABORT;
+ // Verify we got the expected columns.
+ // We expect: 1) id, 2) url, 3) title, 4) visit_count, 5) last_visit_time.
+ if (columns == 5) {
+ HistoryItemVector* items = static_cast<HistoryItemVector*>(context);
+ HistoryID history_id = atoll(column_strings[0]);
+ string url(column_strings[1]);
+ string title(column_strings[2]);
+ int visit_count = atoi(column_strings[3]);
+ string last_visit_time(column_strings[4]);
+ HistoryItem item(history_id, url, title, visit_count, last_visit_time);
+ items->push_back(item);
+ success = SQLITE_OK;
+ }
+ return success;
+}
+
+// Utility Functions
+
+UnicodeStringSet HistoryIndex::WordsFromUnicodeString(
+ UnicodeString uni_string) {
+ UnicodeStringSet words;
+ UErrorCode icu_status = U_ZERO_ERROR;
+ // TODO(mrossetti): Get locale from browser:
+ // icu::Locale(g_browser_process->GetApplicationLocale().c_str())
+ Locale locale(Locale::getDefault());
+
+ uni_string.toLower(locale); // Convert to lowercase.
+
+ // Replace all | & _'s with a space.
+ // TODO(mrossetti): Roll this into the word_breaker_.
+ if (!bar_matcher_) {
+ bar_matcher_ = new RegexMatcher("[|_]+", 0, icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a syntax error in the regular expression.
+ return words;
+ }
+ }
+ bar_matcher_->reset(uni_string);
+ uni_string = bar_matcher_->replaceAll(" ", icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a failure during the replace.
+ return words;
+ }
+
+ // Break up the string into individual words.
+ if (!word_breaker_) {
+ word_breaker_ = static_cast<RuleBasedBreakIterator*>
+ (RuleBasedBreakIterator::createWordInstance(locale, icu_status));
+ if (U_FAILURE(icu_status)) {
+ delete word_breaker_;
+ return words;
+ }
+ }
+ word_breaker_->setText(uni_string);
+ int32_t start = 0;
+ int32_t previous_boundary = 0;
+ while (start != BreakIterator::DONE) {
+ int32_t breakType = word_breaker_->getRuleStatus();
+ if (breakType != UBRK_WORD_NONE) {
+ // TODO(mrossetti): Numbers may have non-numeric characters that
+ // will have to be stripped before term matching. Do this during
+ // term matching.
+ // number: UBRK_WORD_NUMBER <= breakType < UBRK_WORD_NUMBER_LIMIT
+ // word: UBRK_WORD_LETTER <= breakType < UBRK_WORD_LETTER_LIMIT
+ // kana: UBRK_WORD_KANA <= breakType < UBRK_WORD_KANA_LIMIT
+ // ideo: UBRK_WORD_IDEO <= breakType < UBRK_WORD_IDEO_LIMIT
+ int32_t current_boundary = word_breaker_->current();
+ int32_t word_length = current_boundary - previous_boundary;
+ UnicodeString uni_word(uni_string, previous_boundary, word_length);
+
+ // Remove non-word characters from the string.
+ if (!non_alnum_matcher_) {
+ non_alnum_matcher_ = new RegexMatcher("\\W+", 0, icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a syntax errors in the regular expression.
+ return words;
+ }
+ }
+ non_alnum_matcher_->reset(uni_word);
+ uni_word = non_alnum_matcher_->replaceAll("", icu_status);
+ if (U_FAILURE(icu_status)) {
+ // There was a failure during the replace.
+ break;
+ }
+
+ if (uni_word.length())
+ words.insert(uni_word);
+ previous_boundary = current_boundary;
+ }
+ start = word_breaker_->next();
+ }
+ return words;
+}
+
+UCharSet HistoryIndex::CharactersFromUnicodeString(
+ icu::UnicodeString uni_word) {
+ UCharSet characters;
+ icu::StringCharacterIterator ci(uni_word);
+ for (UChar uni_char = ci.first();
+ uni_char != icu::CharacterIterator::DONE; uni_char = ci.next())
+ characters.insert(uni_char);
+ return characters;
+}
+
+WordIDSet HistoryIndex::WordIDSetForTermChars(UCharSet const& uni_chars) {
+ TermCharWordSet* best_term_char_word_set = NULL;
+ bool set_found = false;
+ size_t outstanding_count = 0;
+ UCharSet outstanding_chars;
+ for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
+ iter != term_char_word_set_cache_.end(); ++iter) {
+ TermCharWordSet& term_char_word_set(*iter);
+ UCharSet& char_set(term_char_word_set.char_set_);
+ UCharSet exclusions;
+ set_difference(char_set.begin(), char_set.end(),
+ uni_chars.begin(), uni_chars.end(),
+ std::inserter(exclusions, exclusions.begin()));
+ if (exclusions.size() == 0) {
+ // Do a reverse difference so that we know which characters remain
+ // to be indexed. Then decide if this is a better match than any
+ // previous cached set.
+ set_difference(uni_chars.begin(), uni_chars.end(),
+ char_set.begin(), char_set.end(),
+ std::inserter(exclusions, exclusions.begin()));
+ if (!set_found || exclusions.size() < outstanding_count) {
+ outstanding_chars = exclusions;
+ best_term_char_word_set = &*iter;
+ outstanding_count = exclusions.size();
+ set_found = true;
+ }
+ }
+ }
+
+ WordIDSet word_id_set;
+ if (set_found && outstanding_count == 0) {
+ // If there were no oustanding characters then we can use the cached one.
+ best_term_char_word_set->used_ = true;
+ word_id_set = best_term_char_word_set->word_id_set_;
+ } else {
+ // Some or all of the characters remain to be indexed.
+ bool first_character = true;
+ if (set_found) {
+ first_character = false;
+ word_id_set = best_term_char_word_set->word_id_set_;
+ } else {
+ outstanding_chars = uni_chars;
+ }
+
+ for (UCharSet::iterator uni_char_iter = outstanding_chars.begin();
+ uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
+ UChar uni_char = *uni_char_iter;
+ CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
+ if (char_iter != char_word_map_.end()) {
+ WordIDSet& char_word_id_set(char_iter->second);
+ if (first_character) {
+ word_id_set = char_word_id_set;
+ first_character = false;
+ } else {
+ WordIDSet old_word_id_set(word_id_set);
+ word_id_set.clear();
+ std::set_intersection(old_word_id_set.begin(),
+ old_word_id_set.end(),
+ char_word_id_set.begin(),
+ char_word_id_set.end(),
+ std::inserter(word_id_set,
+ word_id_set.begin()));
+ }
+ } else {
+ // The character was not found so bail.
+ word_id_set.clear();
+ break;
+ }
+ }
+ // Update the cache.
+ if (!set_found || outstanding_count) {
+ TermCharWordSet new_char_word_set(uni_chars, word_id_set, true);
+ term_char_word_set_cache_.push_back(new_char_word_set);
+ }
+ }
+ return word_id_set;
+}
+
+// Statistics
+
+#if HISTORY_INDEX_STATISTICS_ENABLED
+
+u_int64_t HistoryIndex::IndexedHistoryCount() {
+ return history_item_count_;
+}
+
+u_int64_t HistoryIndex::IndexedWordCount() {
+ return word_list_.size();
+}
+
+// Return a count of the characters which have been indexed.
+u_int64_t HistoryIndex::IndexedCharacterCount() {
+ return char_word_map_.size();
+}
+
+UnicodeStringVector HistoryIndex::IndexedWords() const {
+ UnicodeStringVector words(word_list_);
+ return words; // Return a copy.
+}
+
+u_int64_t HistoryIndex::NumberOfHistoryItemsForWord(UnicodeString uni_word) {
+ u_int64_t history_count = 0;
+ WordMap::iterator word_pos = word_map_.find(uni_word);
+ if (word_pos != word_map_.end()) {
+ WordID word_id = word_pos->second;
+ WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
+ if (history_pos != word_id_history_map_.end())
+ history_count = history_pos->second.size();
+ }
+ return history_count;
+}
+
+UnicodeStringVector HistoryIndex::IndexedCharacters() {
+ UnicodeStringVector characters;
+ for (CharWordIDMap::iterator iter = char_word_map_.begin();
+ iter != char_word_map_.end(); ++iter)
+ characters.push_back(iter->first);
+ return characters;
+}
+
+u_int64_t HistoryIndex::NumberOfWordsForChar(UChar uni_char) {
+ u_int64_t word_count = 0;
+ CharWordIDMap::iterator char_pos = char_word_map_.find(uni_char);
+ if (char_pos != char_word_map_.end())
+ word_count = char_pos->second.size();
+ return word_count;
+}
+
+u_int64_t HistoryIndex::WordListSize() {
+ u_int64_t listSize = 0;
+ // TODO(mrossetti): Use an algorithm.
+ for (UnicodeStringVector::iterator iter = word_list_.begin();
+ iter != word_list_.end(); ++iter) {
+ listSize += iter->length(); // vector overhead is ignored.
+ }
+ return listSize;
+}
+
+u_int64_t HistoryIndex::WordMapSize() {
+ u_int64_t mapSize = 0;
+ for (WordMap::iterator iter = word_map_.begin(); iter != word_map_.end();
+ ++iter) {
+ mapSize += sizeof(WordID) + iter->first.length();
+ }
+ return mapSize;
+}
+
+u_int64_t HistoryIndex::WordHistoryMapSize() {
+ u_int64_t mapSize = 0;
+ for (WordIDHistoryMap::iterator iter = word_id_history_map_.begin();
+ iter != word_id_history_map_.end(); ++iter) {
+ mapSize += sizeof(WordID) + (iter->second.size() * sizeof(HistoryID));
+ }
+ return mapSize;
+}
+
+u_int64_t HistoryIndex::CharWordMapSize() {
+ u_int64_t mapSize = 0;
+ for (CharWordIDMap::iterator iter = char_word_map_.begin();
+ iter != char_word_map_.end(); ++iter) {
+ mapSize += sizeof(UChar) + (iter->second.size() * sizeof(WordID));
+ }
+ return mapSize;
+}
+
+u_int64_t HistoryIndex::HistoryInfoSize() {
+ u_int64_t mapSize = 0;
+ for (HistoryInfoMap::iterator iter = history_info_map_.begin();
+ iter != history_info_map_.end(); ++iter) {
+ mapSize += sizeof(iter->first);
+ HistoryItem& hist_item(iter->second);
+ mapSize += sizeof(hist_item.history_id_);
+ mapSize += hist_item.url_.size();
+ mapSize += hist_item.title_.size();
+ mapSize += sizeof(hist_item.visit_count_);
+ mapSize += hist_item.last_visit_time_.size();
+ }
+ return mapSize;
+}
+
+#endif // HISTORY_INDEX_STATISTICS_ENABLED
+
+// HistoryItem Methods
+
+bool HistoryItem::TermIsMissing(SearchTermsVector* terms_vector) const {
+ for (SearchTermsVector::iterator term_iter = terms_vector->begin();
+ term_iter != terms_vector->end(); ++term_iter) {
+ string term = *term_iter;
+ if (term.length() > 1 &&
+ url_.find(term) == string::npos &&
+ title_.find(term) == string::npos)
+ return true;
+ }
+ return false;
+}
+
diff --git a/chrome/browser/history/history_index.h b/chrome/browser/history/history_index.h
new file mode 100644
index 0000000..a2d3750
--- /dev/null
+++ b/chrome/browser/history/history_index.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_HISTORY_HISTORY_INDEX_H_
+#define CHROME_BROWSER_HISTORY_HISTORY_INDEX_H_
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#if defined(USE_SYSTEM_SQLITE)
+#include <sqlite3.h>
+#else
+#include "third_party/sqlite/preprocessed/sqlite3.h"
+#endif
+
+#include "third_party/icu/public/common/unicode/rbbi.h"
+#include "third_party/icu/public/i18n/unicode/regex.h"
+#include "third_party/icu/public/common/unicode/unistr.h"
+
+#define HISTORY_INDEX_STATISTICS_ENABLED 1
+
+// Convenience types
+typedef std::vector<icu::UnicodeString> UnicodeStringVector;
+typedef std::set<icu::UnicodeString> UnicodeStringSet;
+typedef std::set<UChar> UCharSet;
+typedef std::vector<std::string> SearchTermsVector;
+
+// An index into list of all of the words we have indexed.
+typedef uint16_t WordID;
+
+// A map allowing a WordID to be determined given a word.
+typedef std::map<icu::UnicodeString, WordID> WordMap;
+
+// A map from character to word_id's.
+typedef std::set<WordID> WordIDSet; // An index into the WordList.
+typedef std::map<UChar, WordIDSet> CharWordIDMap;
+
+// A map from word_id to history item.
+typedef uint16_t HistoryID; // uint16_t might actually work for us.
+typedef std::set<HistoryID> HistoryIDSet;
+typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
+
+// Support caching of term character intersections so that we can optimize
+// searches which build upon a previous search.
+struct TermCharWordSet {
+ UCharSet char_set_;
+ WordIDSet word_id_set_;
+ bool used_; // true if this set has been used for the current term search.
+ TermCharWordSet(UCharSet char_set, WordIDSet word_id_set, bool used)
+ : char_set_(char_set), word_id_set_(word_id_set), used_(used) { }
+};
+typedef std::vector<TermCharWordSet> TermCharWordSetVector;
+
+// Search results base class.
+struct HistoryItem {
+ HistoryID history_id_;
+ std::string url_;
+ std::string title_;
+ int visit_count_;
+ std::string last_visit_time_;
+ // TODO(mrossetti): Make a class. Add fields for score and to provide
+ // term matching metrics so a UI can highlight the exact match.
+
+ HistoryItem(HistoryID history_id,
+ std::string& url,
+ std::string& title,
+ int visit_count,
+ std::string& last_visit_time)
+ : history_id_(history_id), url_(url), title_(title),
+ visit_count_(visit_count), last_visit_time_(last_visit_time) { }
+
+ // Return true if any of the search terms do not match a substring.
+ // TODO(mrossetti): Also need to score and add UI hints.
+ bool TermIsMissing(SearchTermsVector* terms_vector) const;
+};
+typedef std::vector<HistoryItem> HistoryItemVector;
+
+// TODO(mrossetti): Rip out either the SQLite filtering or the in-memory
+// HistoryInfo structure.
+// A map from history_id to the history's URL and title.
+typedef std::map<HistoryID, HistoryItem> HistoryInfoMap;
+
+// The history source.
+class HistoryIndex {
+ public:
+ HistoryIndex() : use_SQLite_filter_(false), history_database_(NULL),
+ history_item_count_(0), question_matcher_(NULL), percent_matcher_(NULL),
+ non_alnum_matcher_(NULL), bar_matcher_(NULL), word_breaker_(NULL) { }
+ ~HistoryIndex();
+
+ // Open and index the URL history database.
+ bool IndexURLHistory(std::string history_path);
+
+ // Reset the history index.
+ void Reset();
+
+ // Given a vector containing one or more words as utf8 strings, scan the
+ // history index and return a vector with all scored, matching history items.
+ // Each term must occur somewhere in the history item for the item to
+ // qualify, however, the terms do not necessarily have to be adjacent.
+ HistoryItemVector HistoryItemsForTerms(SearchTermsVector const& terms);
+
+ // NOTE: Temporary for timing and memory analysis.
+ void SetUseSQLiteFilter(bool use_filter) { use_SQLite_filter_ = use_filter; }
+
+#if HISTORY_INDEX_STATISTICS_ENABLED
+
+ // Return a count of the history items which have been indexed.
+ u_int64_t IndexedHistoryCount();
+
+ // Return a count of the words which have been indexed.
+ u_int64_t IndexedWordCount();
+
+ // Return a count of the characters which have been indexed.
+ u_int64_t IndexedCharacterCount();
+
+ // Return a vector containing all of the words which have been indexed
+ // as UTF8 strings.
+ UnicodeStringVector IndexedWords() const;
+
+ // Return the count of history items for the given word.
+ u_int64_t NumberOfHistoryItemsForWord(icu::UnicodeString uni_word);
+
+ // Return a vector containing all of the characters which have been indexed
+ // as UTF8 strings.
+ UnicodeStringVector IndexedCharacters();
+
+ // Return the count of words for the given character.
+ u_int64_t NumberOfWordsForChar(UChar uni_char);
+
+ // Return the total size of the word list vector.
+ u_int64_t WordListSize();
+
+ // Return the total size of the word/word-id map.
+ u_int64_t WordMapSize();
+
+ // Return the total size of the word_id_history_map_.
+ u_int64_t WordHistoryMapSize();
+
+ // Return the total size of the char_word_map_.
+ u_int64_t CharWordMapSize();
+
+ // Return the total size of the char_word_map_.
+ u_int64_t HistoryInfoSize();
+
+#endif // HISTORY_INDEX_STATISTICS_ENABLED
+
+ private:
+ // Index one URL history item.
+ bool IndexRow(const char* id, const char* url, const char* title);
+
+ // sqlite3 query callback used during initial history indexing.
+ static int URLHistoryQueryHandler(void* context, int columns,
+ char** column_strings,
+ char** column_names);
+
+ // Given a single search term (i.e. one word, utf8) return a set of
+ // history IDs which map to that term but do not necessarily match
+ // as a substring.
+ HistoryIDSet HistoryIDsForTerm(icu::UnicodeString uni_word);
+
+ // sqlite3 query callback used during final term filtering.
+ static int URLTermQueryHandler(void* context, int columns,
+ char** column_strings,
+ char** column_names);
+
+ // Break an UnicodeString down into individual words.
+ UnicodeStringSet WordsFromUnicodeString(icu::UnicodeString uni_string);
+
+ // Break an UnicodeString down into its individual characters.
+ // Note that this is temporarily intended to work on a single word, but
+ // _will_ work on a string of words, perhaps with unexpected results.
+ // TODO(mrossetti): Lots of optimizations possible here for not restarting
+ // a search if the user is just typing along. Also, change this to uniString
+ // and properly handle substring matches, scoring and sorting the results
+ // by score. Also, provide the metrics for where the matches occur so that
+ // the UI can highlight the matched sections.
+ UCharSet CharactersFromUnicodeString(icu::UnicodeString uni_word);
+
+ // Given a set of UChars, find words containing those characters. If any
+ // existing, cached set is a proper subset then start with that cached
+ // set. Update the cache.
+ WordIDSet WordIDSetForTermChars(UCharSet const& uni_chars);
+
+ private:
+ UnicodeStringVector word_list_; // A list of all of indexed words.
+ // NOTE: A word will _never_ be removed from
+ // this vector to insure that the word_id
+ // remains immutable throughout the session,
+ // reducing maintenance complexity.
+ bool use_SQLite_filter_;
+ sqlite3* history_database_;
+ u_int64_t history_item_count_;
+ WordMap word_map_;
+ CharWordIDMap char_word_map_;
+ WordIDHistoryMap word_id_history_map_;
+ TermCharWordSetVector term_char_word_set_cache_;
+ HistoryInfoMap history_info_map_;
+
+ // Cached items.
+ icu::RegexMatcher *question_matcher_;
+ icu::RegexMatcher *percent_matcher_;
+ icu::RegexMatcher *non_alnum_matcher_;
+ icu::RegexMatcher *bar_matcher_;
+ icu::RuleBasedBreakIterator* word_breaker_;
+};
+
+#endif // CHROME_BROWSER_HISTORY_HISTORY_INDEX_H_
diff --git a/chrome/browser/history/history_index_unittest.cc b/chrome/browser/history/history_index_unittest.cc
new file mode 100644
index 0000000..c8fcff8
--- /dev/null
+++ b/chrome/browser/history/history_index_unittest.cc
@@ -0,0 +1,5 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// TODO(mrossetti): Implement the unit tests.
diff --git a/chrome/chrome_browser.gypi b/chrome/chrome_browser.gypi
index b437f75..da8f86b 100644
--- a/chrome/chrome_browser.gypi
+++ b/chrome/chrome_browser.gypi
@@ -1554,6 +1554,8 @@
'browser/history/history_backend.h',
'browser/history/history_database.cc',
'browser/history/history_database.h',
+ 'browser/history/history_index.cc',
+ 'browser/history/history_index.h',
'browser/history/history_marshaling.h',
'browser/history/history_notifications.h',
'browser/history/history_publisher.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index 6cd2509..dacbb35 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -823,6 +823,7 @@
'browser/gtk/tabs/tab_renderer_gtk_unittest.cc',
'browser/history/expire_history_backend_unittest.cc',
'browser/history/history_backend_unittest.cc',
+ 'browser/history/history_index_unittest.cc',
'browser/history/history_querying_unittest.cc',
'browser/history/history_types_unittest.cc',
'browser/history/history_unittest.cc',