diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-25 18:08:20 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-06-25 18:08:20 +0000 |
commit | ca8b0f7dca7ba5f8f7f86bb8bcd395bc1d25d171 (patch) | |
tree | 0950ed524ca098ce60a291b30b5cfd37cefcadc2 /chrome/browser/history | |
parent | 651f68197d1b8477435e2bf5307e9caa2e8ba667 (diff) | |
download | chromium_src-ca8b0f7dca7ba5f8f7f86bb8bcd395bc1d25d171.zip chromium_src-ca8b0f7dca7ba5f8f7f86bb8bcd395bc1d25d171.tar.gz chromium_src-ca8b0f7dca7ba5f8f7f86bb8bcd395bc1d25d171.tar.bz2 |
Revert 50863 - Add initial url/title history indexing and search provider support -- not hooked up at this point.
BUG=None
TEST=None
Review URL: http://codereview.chromium.org/2818029
TBR=mrossetti@chromium.org
Review URL: http://codereview.chromium.org/2811034
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@50869 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history')
-rw-r--r-- | chrome/browser/history/history_index.cc | 698 | ||||
-rw-r--r-- | chrome/browser/history/history_index.h | 211 | ||||
-rw-r--r-- | chrome/browser/history/history_index_unittest.cc | 5 |
3 files changed, 0 insertions, 914 deletions
diff --git a/chrome/browser/history/history_index.cc b/chrome/browser/history/history_index.cc deleted file mode 100644 index aa7a72d..0000000 --- a/chrome/browser/history/history_index.cc +++ /dev/null @@ -1,698 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/history/history_index.h" - -#include <algorithm> -#include <sstream> - -#include "third_party/icu/public/common/unicode/brkiter.h" -#include "third_party/icu/public/common/unicode/utypes.h" - -using std::string; -using icu::Locale; -using icu::BreakIterator; -using icu::RegexMatcher; -using icu::RuleBasedBreakIterator; -using icu::UnicodeString; - -// Define the following to '1' if the parameters to the URL should not be -// indexed. -#define TRIM_PARAMETERS 1 - -// Define the following to specify that only the most recent X history -// entries are to be considered. -#define LIMIT_HISTORY_TO 10000 - - -HistoryIndex::~HistoryIndex() { - if (history_database_) - sqlite3_close(history_database_); - delete question_matcher_; - delete percent_matcher_; - delete non_alnum_matcher_; - delete bar_matcher_; - delete word_breaker_; -} - -// Indexing - -bool HistoryIndex::IndexURLHistory(std::string history_path) { - bool success = false; - // Reset our indexes. - char_word_map_.clear(); - word_id_history_map_.clear(); - if (history_database_) - sqlite3_close(history_database_); - - int result = sqlite3_open(history_path.c_str(), &history_database_); - if (result == SQLITE_OK) { - // TODO(mrossetti): Consider filtering only the most recent and the most - // often referenced history items. Also consider usage count when scoring. - std::ostringstream query_stream; - query_stream << "SELECT id, url, title FROM urls"; -#if LIMIT_HISTORY_TO - query_stream << " ORDER BY last_visit_time DESC LIMIT " << LIMIT_HISTORY_TO; -#endif // LIMIT_HISTORY_TO - query_stream << ";"; - result = sqlite3_exec(history_database_, - query_stream.str().c_str(), - &URLHistoryQueryHandler, - reinterpret_cast<void*>(this), - NULL); - if (result == SQLITE_OK) { - success = true; - } - } - return success; -} - -// NOTE: Not thread-safe because RuleBasedBreakIterator::getRuleStatus isn't. -bool HistoryIndex::IndexRow(const char* id, const char* url, - const char* title) { - UnicodeString uni_string = UnicodeString::fromUTF8(url); - UErrorCode icu_status = U_ZERO_ERROR; - -#ifdef TRIM_PARAMETERS - // Trim the URL at the first '?'. - if (!question_matcher_) { - question_matcher_ = new RegexMatcher("\\?", 0, icu_status); - if (U_FAILURE(icu_status)) { - // There was a syntax error in the regular expression. - return false; - } - } - question_matcher_->reset(uni_string); - if (question_matcher_->find()) { - int32_t start = question_matcher_->start(icu_status); - // TODO(mrossetti): Check status here. - uni_string.handleReplaceBetween(start, uni_string.length(), ""); - } -#endif - - // Eliminate any '%xx' encodings in the URL. - // TODO(mrossetti): We should probably convert the '%xx' encodings. - if (!percent_matcher_) { - percent_matcher_ = new RegexMatcher("%[0-9a-fA-F]{2}", 0, icu_status); - if (U_FAILURE(icu_status)) { - // There was a syntax error in the regular expression. - return false; - } - } - percent_matcher_->reset(uni_string); - uni_string = percent_matcher_->replaceAll(" ", icu_status); - if (U_FAILURE(icu_status)) { - // There was a failure during the replace. - return false; - } - - // Determine the row ID in the database. - sqlite_int64 test_id = strtol(id, NULL, 10); - if (test_id < 0 || test_id > UINT16_MAX) { - // TODO(mrossetti): Handle error condition here. - } - HistoryID history_id = labs(test_id); - - // Add the row for quick lookup in the history info store. - // NOTE: Update the following when visit count and time becomes important. - string lower_url(url); - string lower_title(title); - string time_string(""); - // Lowercase URL and Title. - transform(lower_url.begin(), lower_url.end(), lower_url.begin(), - tolower); - transform(lower_title.begin(), lower_title.end(), lower_title.begin(), - tolower); - HistoryItem hist_item(history_id, lower_url, lower_title, 0, time_string); - history_info_map_.insert(std::make_pair(history_id, hist_item)); - - // Catenate the URL and the title in preparation for coming up with words. - if (title[0]) { - uni_string.append(" "); - uni_string.append(UnicodeString::fromUTF8(title)); - } - - // Break down the URL and title into individual, unique words. - UnicodeStringSet words = WordsFromUnicodeString(uni_string); - - // For each word, add a new entry into the word index referring to the - // history item. - for (UnicodeStringSet::iterator iter = words.begin(); - iter != words.end(); ++iter) { - UnicodeString uni_word = *iter; - WordMap::iterator word_pos = word_map_.find(uni_word); - if (word_pos != word_map_.end()) { - // Update existing entry in the word/history index. - WordID word_id = word_pos->second; - WordIDHistoryMap::iterator history_pos = - word_id_history_map_.find(word_id); - if (history_pos != word_id_history_map_.end()) { - HistoryIDSet& history_id_set(history_pos->second); - history_id_set.insert(history_id); - } else { - // TODO(mrossetti): Handle the unexpected failure here. - } - } else { - // Add a new word to the word list and the word map, and then create a - // new entry in the word/history map. - word_list_.push_back(uni_word); - WordID word_id = word_list_.size() - 1; - if (word_map_.insert(std::make_pair(uni_word, word_id)).second) { - HistoryIDSet history_id_set; - history_id_set.insert(history_id); - if (word_id_history_map_.insert( - std::make_pair(word_id, history_id_set)).second) { - // For each character in the newly added word (i.e. a word that is not - // already in the word index), add the word to the character index. - UCharSet characters = - HistoryIndex::CharactersFromUnicodeString(uni_word); - for (UCharSet::iterator uni_char_iter = characters.begin(); - uni_char_iter != characters.end(); ++uni_char_iter) { - UChar uni_string = *uni_char_iter; - CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); - if (char_iter != char_word_map_.end()) { - // Update existing entry in the char/word index. - WordIDSet& word_id_set(char_iter->second); - word_id_set.insert(word_id); - } else { - // Create a new entry in the char/word index. - WordIDSet word_id_set; - word_id_set.insert(word_id); - if (!char_word_map_.insert(std::make_pair(uni_string, - word_id_set)).second) { - // TODO(mrossetti): Handle the unexpected failure here. - } - } - } - } else { - // TODO(mrossetti): Handle the unexpected failure here. - } - } else { - // TODO(mrossetti): Handle the unexpected failure here. - } - } - } - ++history_item_count_; - return true; -} - -void HistoryIndex::Reset() { - if (history_database_) { - sqlite3_close(history_database_); - history_database_ = NULL; - } - word_list_.clear(); - word_map_.clear(); - char_word_map_.clear(); - word_id_history_map_.clear(); - history_info_map_.clear(); - history_item_count_ = 0; -} - -int HistoryIndex::URLHistoryQueryHandler(void* context, int columns, - char** column_strings, - char** column_names) { - int success = SQLITE_ABORT; - // Verify we got the expected columns. - // We expect: 1) id, 2) url, 3) title. - if (columns == 3) { - HistoryIndex* delegate(static_cast<HistoryIndex*>(context)); - if (delegate->IndexRow(column_strings[0], column_strings[1], - column_strings[2])) - success = SQLITE_OK; - } - return success; -} - -// Searching - -HistoryItemVector HistoryIndex::HistoryItemsForTerms( - SearchTermsVector const& terms) { - HistoryItemVector item_vector; - if (history_database_ && terms.size()) { - // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- - // and-sweep approach. - // TODO(mrossetti): Consider keeping more of the cache around for possible - // repeat searches, except a 'shortcuts' approach might be better for that. - // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. - for (TermCharWordSetVector::iterator iter = - term_char_word_set_cache_.begin(); - iter != term_char_word_set_cache_.end(); ++iter) - iter->used_ = false; - - // Lowercase the terms. - SearchTermsVector lower_terms; - for (SearchTermsVector::const_iterator term_iter = terms.begin(); - term_iter != terms.end(); ++term_iter) { - string lower_string(*term_iter); - transform(lower_string.begin(), lower_string.end(), lower_string.begin(), - tolower); - lower_terms.push_back(lower_string); - } - - // Composite a string by joining all terms, separated by a space. - // TODO(mrossetti): Use a transform algorithm, eh. - string all_terms = lower_terms[0]; - for (SearchTermsVector::iterator term_iter = ++lower_terms.begin(); - term_iter != lower_terms.end(); ++term_iter) { - all_terms += " " + *term_iter; - } - - UnicodeString uni_string = UnicodeString::fromUTF8(all_terms.c_str()); - - // Break the terms down into individual terms (words), get the candidate - // set for each term, and intersect each to get a final candidate list. - HistoryIDSet history_id_set; - UnicodeStringSet words = WordsFromUnicodeString(uni_string); - bool first_word = true; - for (UnicodeStringSet::iterator iter = words.begin(); - iter != words.end(); ++iter) { - UnicodeString uni_word = *iter; - HistoryIDSet term_history_id_set = - HistoryIndex::HistoryIDsForTerm(uni_word); - if (first_word) { - history_id_set = term_history_id_set; - first_word = false; - } else { - HistoryIDSet old_history_id_set(history_id_set); - history_id_set.clear(); - std::set_intersection(old_history_id_set.begin(), - old_history_id_set.end(), - term_history_id_set.begin(), - term_history_id_set.end(), - std::inserter(history_id_set, - history_id_set.begin())); - } - } - - if (history_id_set.size()) { - // For each item in the final candidate list, perform a substring match - // for each original individual term and weed out non-matches. - if (use_SQLite_filter_) { - std::ostringstream query_stream; - query_stream << "SELECT id, url, title, visit_count, last_visit_time " - "FROM urls WHERE id IN ( "; - - // I'm sure there's an algorithm for this! - bool first_id = true; - for (HistoryIDSet::iterator id_iter = history_id_set.begin(); - id_iter != history_id_set.end(); ++id_iter) { - if (first_id) { - query_stream << *id_iter; - first_id = false; - } else { - query_stream << "," << *id_iter; - } - } - query_stream << ");"; - - HistoryItemVector term_query_results; - int result = sqlite3_exec(history_database_, - query_stream.str().c_str(), - &URLTermQueryHandler, - reinterpret_cast<void*>(&term_query_results), - NULL); - if (result == SQLITE_OK) { - // Reduce the results by filtering out non-substring matches. -#if 0 - // TODO(mrossetti): Regain your C++-fu and get the following to work. - term_query_results.erase(remove_if( - term_query_results.begin(), - term_query_results.end(), - bind2nd(mem_fun_ref(&HistoryItem::TermIsMissing), &lower_terms))); - item_vector = term_query_results; -#else - for (HistoryItemVector::iterator hist_iter = - term_query_results.begin(); - hist_iter != term_query_results.end(); ++hist_iter) { - HistoryItem& hist_item(*hist_iter); - if (!hist_item.TermIsMissing(&lower_terms)) - item_vector.push_back(hist_item); - } -#endif - } - // TODO(mrossetti): Handle any sqlite3 errors. - } else { - for (HistoryIDSet::iterator id_iter = history_id_set.begin(); - id_iter != history_id_set.end(); ++id_iter) { - HistoryID history_id = *id_iter; - HistoryInfoMap::iterator hist_pos = - history_info_map_.find(history_id); - if (hist_pos != history_info_map_.end()) { - HistoryItem& hist_item(hist_pos->second); - if (!hist_item.TermIsMissing(&lower_terms)) - item_vector.push_back(hist_item); - } else { - // Handle unexpected missing history item here. - } - } - } - - // Score. - // TODO(mrossetti): Implement scoring after looking at current impl. - } - } - - // Remove any stale TermCharWordSet's. - TermCharWordSetVector new_vector; - for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); - iter != term_char_word_set_cache_.end(); ++iter) { - if (iter->used_) - new_vector.push_back(*iter); - } - term_char_word_set_cache_ = new_vector; - - return item_vector; -} - -HistoryIDSet HistoryIndex::HistoryIDsForTerm(UnicodeString uni_word) { - HistoryIDSet history_id_set; - - // For each character in the word, get the char/word_id map entry and - // intersect with the set in an incremental manner. - UCharSet uni_chars = CharactersFromUnicodeString(uni_word); - WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); - - // If any words resulted then we can compose a set of history IDs by unioning - // the sets from each word. - if (word_id_set.size()) { - for (WordIDSet::iterator word_id_iter = word_id_set.begin(); - word_id_iter != word_id_set.end(); ++word_id_iter) { - WordID word_id = *word_id_iter; - WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); - if (word_iter != word_id_history_map_.end()) { - HistoryIDSet& word_history_id_set(word_iter->second); - history_id_set.insert(word_history_id_set.begin(), - word_history_id_set.end()); - } - } - } - - return history_id_set; -} - -int HistoryIndex::URLTermQueryHandler(void* context, int columns, - char** column_strings, - char** column_names) { - int success = SQLITE_ABORT; - // Verify we got the expected columns. - // We expect: 1) id, 2) url, 3) title, 4) visit_count, 5) last_visit_time. - if (columns == 5) { - HistoryItemVector* items = static_cast<HistoryItemVector*>(context); - HistoryID history_id = atoll(column_strings[0]); - string url(column_strings[1]); - string title(column_strings[2]); - int visit_count = atoi(column_strings[3]); - string last_visit_time(column_strings[4]); - HistoryItem item(history_id, url, title, visit_count, last_visit_time); - items->push_back(item); - success = SQLITE_OK; - } - return success; -} - -// Utility Functions - -UnicodeStringSet HistoryIndex::WordsFromUnicodeString( - UnicodeString uni_string) { - UnicodeStringSet words; - UErrorCode icu_status = U_ZERO_ERROR; - // TODO(mrossetti): Get locale from browser: - // icu::Locale(g_browser_process->GetApplicationLocale().c_str()) - Locale locale(Locale::getDefault()); - - uni_string.toLower(locale); // Convert to lowercase. - - // Replace all | & _'s with a space. - // TODO(mrossetti): Roll this into the word_breaker_. - if (!bar_matcher_) { - bar_matcher_ = new RegexMatcher("[|_]+", 0, icu_status); - if (U_FAILURE(icu_status)) { - // There was a syntax error in the regular expression. - return words; - } - } - bar_matcher_->reset(uni_string); - uni_string = bar_matcher_->replaceAll(" ", icu_status); - if (U_FAILURE(icu_status)) { - // There was a failure during the replace. - return words; - } - - // Break up the string into individual words. - if (!word_breaker_) { - word_breaker_ = static_cast<RuleBasedBreakIterator*> - (RuleBasedBreakIterator::createWordInstance(locale, icu_status)); - if (U_FAILURE(icu_status)) { - delete word_breaker_; - return words; - } - } - word_breaker_->setText(uni_string); - int32_t start = 0; - int32_t previous_boundary = 0; - while (start != BreakIterator::DONE) { - int32_t breakType = word_breaker_->getRuleStatus(); - if (breakType != UBRK_WORD_NONE) { - // TODO(mrossetti): Numbers may have non-numeric characters that - // will have to be stripped before term matching. Do this during - // term matching. - // number: UBRK_WORD_NUMBER <= breakType < UBRK_WORD_NUMBER_LIMIT - // word: UBRK_WORD_LETTER <= breakType < UBRK_WORD_LETTER_LIMIT - // kana: UBRK_WORD_KANA <= breakType < UBRK_WORD_KANA_LIMIT - // ideo: UBRK_WORD_IDEO <= breakType < UBRK_WORD_IDEO_LIMIT - int32_t current_boundary = word_breaker_->current(); - int32_t word_length = current_boundary - previous_boundary; - UnicodeString uni_word(uni_string, previous_boundary, word_length); - - // Remove non-word characters from the string. - if (!non_alnum_matcher_) { - non_alnum_matcher_ = new RegexMatcher("\\W+", 0, icu_status); - if (U_FAILURE(icu_status)) { - // There was a syntax errors in the regular expression. - return words; - } - } - non_alnum_matcher_->reset(uni_word); - uni_word = non_alnum_matcher_->replaceAll("", icu_status); - if (U_FAILURE(icu_status)) { - // There was a failure during the replace. - break; - } - - if (uni_word.length()) - words.insert(uni_word); - previous_boundary = current_boundary; - } - start = word_breaker_->next(); - } - return words; -} - -UCharSet HistoryIndex::CharactersFromUnicodeString( - icu::UnicodeString uni_word) { - UCharSet characters; - icu::StringCharacterIterator ci(uni_word); - for (UChar uni_char = ci.first(); - uni_char != icu::CharacterIterator::DONE; uni_char = ci.next()) - characters.insert(uni_char); - return characters; -} - -WordIDSet HistoryIndex::WordIDSetForTermChars(UCharSet const& uni_chars) { - TermCharWordSet* best_term_char_word_set = NULL; - bool set_found = false; - size_t outstanding_count = 0; - UCharSet outstanding_chars; - for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); - iter != term_char_word_set_cache_.end(); ++iter) { - TermCharWordSet& term_char_word_set(*iter); - UCharSet& char_set(term_char_word_set.char_set_); - UCharSet exclusions; - set_difference(char_set.begin(), char_set.end(), - uni_chars.begin(), uni_chars.end(), - std::inserter(exclusions, exclusions.begin())); - if (exclusions.size() == 0) { - // Do a reverse difference so that we know which characters remain - // to be indexed. Then decide if this is a better match than any - // previous cached set. - set_difference(uni_chars.begin(), uni_chars.end(), - char_set.begin(), char_set.end(), - std::inserter(exclusions, exclusions.begin())); - if (!set_found || exclusions.size() < outstanding_count) { - outstanding_chars = exclusions; - best_term_char_word_set = &*iter; - outstanding_count = exclusions.size(); - set_found = true; - } - } - } - - WordIDSet word_id_set; - if (set_found && outstanding_count == 0) { - // If there were no oustanding characters then we can use the cached one. - best_term_char_word_set->used_ = true; - word_id_set = best_term_char_word_set->word_id_set_; - } else { - // Some or all of the characters remain to be indexed. - bool first_character = true; - if (set_found) { - first_character = false; - word_id_set = best_term_char_word_set->word_id_set_; - } else { - outstanding_chars = uni_chars; - } - - for (UCharSet::iterator uni_char_iter = outstanding_chars.begin(); - uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { - UChar uni_char = *uni_char_iter; - CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); - if (char_iter != char_word_map_.end()) { - WordIDSet& char_word_id_set(char_iter->second); - if (first_character) { - word_id_set = char_word_id_set; - first_character = false; - } else { - WordIDSet old_word_id_set(word_id_set); - word_id_set.clear(); - std::set_intersection(old_word_id_set.begin(), - old_word_id_set.end(), - char_word_id_set.begin(), - char_word_id_set.end(), - std::inserter(word_id_set, - word_id_set.begin())); - } - } else { - // The character was not found so bail. - word_id_set.clear(); - break; - } - } - // Update the cache. - if (!set_found || outstanding_count) { - TermCharWordSet new_char_word_set(uni_chars, word_id_set, true); - term_char_word_set_cache_.push_back(new_char_word_set); - } - } - return word_id_set; -} - -// Statistics - -#if HISTORY_INDEX_STATISTICS_ENABLED - -u_int64_t HistoryIndex::IndexedHistoryCount() { - return history_item_count_; -} - -u_int64_t HistoryIndex::IndexedWordCount() { - return word_list_.size(); -} - -// Return a count of the characters which have been indexed. -u_int64_t HistoryIndex::IndexedCharacterCount() { - return char_word_map_.size(); -} - -UnicodeStringVector HistoryIndex::IndexedWords() const { - UnicodeStringVector words(word_list_); - return words; // Return a copy. -} - -u_int64_t HistoryIndex::NumberOfHistoryItemsForWord(UnicodeString uni_word) { - u_int64_t history_count = 0; - WordMap::iterator word_pos = word_map_.find(uni_word); - if (word_pos != word_map_.end()) { - WordID word_id = word_pos->second; - WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); - if (history_pos != word_id_history_map_.end()) - history_count = history_pos->second.size(); - } - return history_count; -} - -UnicodeStringVector HistoryIndex::IndexedCharacters() { - UnicodeStringVector characters; - for (CharWordIDMap::iterator iter = char_word_map_.begin(); - iter != char_word_map_.end(); ++iter) - characters.push_back(iter->first); - return characters; -} - -u_int64_t HistoryIndex::NumberOfWordsForChar(UChar uni_char) { - u_int64_t word_count = 0; - CharWordIDMap::iterator char_pos = char_word_map_.find(uni_char); - if (char_pos != char_word_map_.end()) - word_count = char_pos->second.size(); - return word_count; -} - -u_int64_t HistoryIndex::WordListSize() { - u_int64_t listSize = 0; - // TODO(mrossetti): Use an algorithm. - for (UnicodeStringVector::iterator iter = word_list_.begin(); - iter != word_list_.end(); ++iter) { - listSize += iter->length(); // vector overhead is ignored. - } - return listSize; -} - -u_int64_t HistoryIndex::WordMapSize() { - u_int64_t mapSize = 0; - for (WordMap::iterator iter = word_map_.begin(); iter != word_map_.end(); - ++iter) { - mapSize += sizeof(WordID) + iter->first.length(); - } - return mapSize; -} - -u_int64_t HistoryIndex::WordHistoryMapSize() { - u_int64_t mapSize = 0; - for (WordIDHistoryMap::iterator iter = word_id_history_map_.begin(); - iter != word_id_history_map_.end(); ++iter) { - mapSize += sizeof(WordID) + (iter->second.size() * sizeof(HistoryID)); - } - return mapSize; -} - -u_int64_t HistoryIndex::CharWordMapSize() { - u_int64_t mapSize = 0; - for (CharWordIDMap::iterator iter = char_word_map_.begin(); - iter != char_word_map_.end(); ++iter) { - mapSize += sizeof(UChar) + (iter->second.size() * sizeof(WordID)); - } - return mapSize; -} - -u_int64_t HistoryIndex::HistoryInfoSize() { - u_int64_t mapSize = 0; - for (HistoryInfoMap::iterator iter = history_info_map_.begin(); - iter != history_info_map_.end(); ++iter) { - mapSize += sizeof(iter->first); - HistoryItem& hist_item(iter->second); - mapSize += sizeof(hist_item.history_id_); - mapSize += hist_item.url_.size(); - mapSize += hist_item.title_.size(); - mapSize += sizeof(hist_item.visit_count_); - mapSize += hist_item.last_visit_time_.size(); - } - return mapSize; -} - -#endif // HISTORY_INDEX_STATISTICS_ENABLED - -// HistoryItem Methods - -bool HistoryItem::TermIsMissing(SearchTermsVector* terms_vector) const { - for (SearchTermsVector::iterator term_iter = terms_vector->begin(); - term_iter != terms_vector->end(); ++term_iter) { - string term = *term_iter; - if (term.length() > 1 && - url_.find(term) == string::npos && - title_.find(term) == string::npos) - return true; - } - return false; -} - diff --git a/chrome/browser/history/history_index.h b/chrome/browser/history/history_index.h deleted file mode 100644 index a2d3750..0000000 --- a/chrome/browser/history/history_index.h +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_BROWSER_HISTORY_HISTORY_INDEX_H_ -#define CHROME_BROWSER_HISTORY_HISTORY_INDEX_H_ - -#include <map> -#include <set> -#include <string> -#include <vector> - -#if defined(USE_SYSTEM_SQLITE) -#include <sqlite3.h> -#else -#include "third_party/sqlite/preprocessed/sqlite3.h" -#endif - -#include "third_party/icu/public/common/unicode/rbbi.h" -#include "third_party/icu/public/i18n/unicode/regex.h" -#include "third_party/icu/public/common/unicode/unistr.h" - -#define HISTORY_INDEX_STATISTICS_ENABLED 1 - -// Convenience types -typedef std::vector<icu::UnicodeString> UnicodeStringVector; -typedef std::set<icu::UnicodeString> UnicodeStringSet; -typedef std::set<UChar> UCharSet; -typedef std::vector<std::string> SearchTermsVector; - -// An index into list of all of the words we have indexed. -typedef uint16_t WordID; - -// A map allowing a WordID to be determined given a word. -typedef std::map<icu::UnicodeString, WordID> WordMap; - -// A map from character to word_id's. -typedef std::set<WordID> WordIDSet; // An index into the WordList. -typedef std::map<UChar, WordIDSet> CharWordIDMap; - -// A map from word_id to history item. -typedef uint16_t HistoryID; // uint16_t might actually work for us. -typedef std::set<HistoryID> HistoryIDSet; -typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; - -// Support caching of term character intersections so that we can optimize -// searches which build upon a previous search. -struct TermCharWordSet { - UCharSet char_set_; - WordIDSet word_id_set_; - bool used_; // true if this set has been used for the current term search. - TermCharWordSet(UCharSet char_set, WordIDSet word_id_set, bool used) - : char_set_(char_set), word_id_set_(word_id_set), used_(used) { } -}; -typedef std::vector<TermCharWordSet> TermCharWordSetVector; - -// Search results base class. -struct HistoryItem { - HistoryID history_id_; - std::string url_; - std::string title_; - int visit_count_; - std::string last_visit_time_; - // TODO(mrossetti): Make a class. Add fields for score and to provide - // term matching metrics so a UI can highlight the exact match. - - HistoryItem(HistoryID history_id, - std::string& url, - std::string& title, - int visit_count, - std::string& last_visit_time) - : history_id_(history_id), url_(url), title_(title), - visit_count_(visit_count), last_visit_time_(last_visit_time) { } - - // Return true if any of the search terms do not match a substring. - // TODO(mrossetti): Also need to score and add UI hints. - bool TermIsMissing(SearchTermsVector* terms_vector) const; -}; -typedef std::vector<HistoryItem> HistoryItemVector; - -// TODO(mrossetti): Rip out either the SQLite filtering or the in-memory -// HistoryInfo structure. -// A map from history_id to the history's URL and title. -typedef std::map<HistoryID, HistoryItem> HistoryInfoMap; - -// The history source. -class HistoryIndex { - public: - HistoryIndex() : use_SQLite_filter_(false), history_database_(NULL), - history_item_count_(0), question_matcher_(NULL), percent_matcher_(NULL), - non_alnum_matcher_(NULL), bar_matcher_(NULL), word_breaker_(NULL) { } - ~HistoryIndex(); - - // Open and index the URL history database. - bool IndexURLHistory(std::string history_path); - - // Reset the history index. - void Reset(); - - // Given a vector containing one or more words as utf8 strings, scan the - // history index and return a vector with all scored, matching history items. - // Each term must occur somewhere in the history item for the item to - // qualify, however, the terms do not necessarily have to be adjacent. - HistoryItemVector HistoryItemsForTerms(SearchTermsVector const& terms); - - // NOTE: Temporary for timing and memory analysis. - void SetUseSQLiteFilter(bool use_filter) { use_SQLite_filter_ = use_filter; } - -#if HISTORY_INDEX_STATISTICS_ENABLED - - // Return a count of the history items which have been indexed. - u_int64_t IndexedHistoryCount(); - - // Return a count of the words which have been indexed. - u_int64_t IndexedWordCount(); - - // Return a count of the characters which have been indexed. - u_int64_t IndexedCharacterCount(); - - // Return a vector containing all of the words which have been indexed - // as UTF8 strings. - UnicodeStringVector IndexedWords() const; - - // Return the count of history items for the given word. - u_int64_t NumberOfHistoryItemsForWord(icu::UnicodeString uni_word); - - // Return a vector containing all of the characters which have been indexed - // as UTF8 strings. - UnicodeStringVector IndexedCharacters(); - - // Return the count of words for the given character. - u_int64_t NumberOfWordsForChar(UChar uni_char); - - // Return the total size of the word list vector. - u_int64_t WordListSize(); - - // Return the total size of the word/word-id map. - u_int64_t WordMapSize(); - - // Return the total size of the word_id_history_map_. - u_int64_t WordHistoryMapSize(); - - // Return the total size of the char_word_map_. - u_int64_t CharWordMapSize(); - - // Return the total size of the char_word_map_. - u_int64_t HistoryInfoSize(); - -#endif // HISTORY_INDEX_STATISTICS_ENABLED - - private: - // Index one URL history item. - bool IndexRow(const char* id, const char* url, const char* title); - - // sqlite3 query callback used during initial history indexing. - static int URLHistoryQueryHandler(void* context, int columns, - char** column_strings, - char** column_names); - - // Given a single search term (i.e. one word, utf8) return a set of - // history IDs which map to that term but do not necessarily match - // as a substring. - HistoryIDSet HistoryIDsForTerm(icu::UnicodeString uni_word); - - // sqlite3 query callback used during final term filtering. - static int URLTermQueryHandler(void* context, int columns, - char** column_strings, - char** column_names); - - // Break an UnicodeString down into individual words. - UnicodeStringSet WordsFromUnicodeString(icu::UnicodeString uni_string); - - // Break an UnicodeString down into its individual characters. - // Note that this is temporarily intended to work on a single word, but - // _will_ work on a string of words, perhaps with unexpected results. - // TODO(mrossetti): Lots of optimizations possible here for not restarting - // a search if the user is just typing along. Also, change this to uniString - // and properly handle substring matches, scoring and sorting the results - // by score. Also, provide the metrics for where the matches occur so that - // the UI can highlight the matched sections. - UCharSet CharactersFromUnicodeString(icu::UnicodeString uni_word); - - // Given a set of UChars, find words containing those characters. If any - // existing, cached set is a proper subset then start with that cached - // set. Update the cache. - WordIDSet WordIDSetForTermChars(UCharSet const& uni_chars); - - private: - UnicodeStringVector word_list_; // A list of all of indexed words. - // NOTE: A word will _never_ be removed from - // this vector to insure that the word_id - // remains immutable throughout the session, - // reducing maintenance complexity. - bool use_SQLite_filter_; - sqlite3* history_database_; - u_int64_t history_item_count_; - WordMap word_map_; - CharWordIDMap char_word_map_; - WordIDHistoryMap word_id_history_map_; - TermCharWordSetVector term_char_word_set_cache_; - HistoryInfoMap history_info_map_; - - // Cached items. - icu::RegexMatcher *question_matcher_; - icu::RegexMatcher *percent_matcher_; - icu::RegexMatcher *non_alnum_matcher_; - icu::RegexMatcher *bar_matcher_; - icu::RuleBasedBreakIterator* word_breaker_; -}; - -#endif // CHROME_BROWSER_HISTORY_HISTORY_INDEX_H_ diff --git a/chrome/browser/history/history_index_unittest.cc b/chrome/browser/history/history_index_unittest.cc deleted file mode 100644 index c8fcff8..0000000 --- a/chrome/browser/history/history_index_unittest.cc +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// TODO(mrossetti): Implement the unit tests. |