summaryrefslogtreecommitdiffstats
path: root/chrome/browser/history
diff options
context:
space:
mode:
authormrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-06-30 19:37:53 +0000
committermrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-06-30 19:37:53 +0000
commite211b3b42e93288b60bc5c1c9d27a616edf84bb7 (patch)
treecc671764f257d0f054b86d0c3b683635769d8548 /chrome/browser/history
parent8cf4c691e16a2696ab967287b6c0020c76042ab8 (diff)
downloadchromium_src-e211b3b42e93288b60bc5c1c9d27a616edf84bb7.zip
chromium_src-e211b3b42e93288b60bc5c1c9d27a616edf84bb7.tar.gz
chromium_src-e211b3b42e93288b60bc5c1c9d27a616edf84bb7.tar.bz2
Revert 51284 - Add initial url/title history indexing and search provider support -- not hooked up at this point.
BUG=None TEST=None Review URL: http://codereview.chromium.org/2866025 TBR=mrossetti@chromium.org Review URL: http://codereview.chromium.org/2866035 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@51286 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history')
-rw-r--r--chrome/browser/history/in_memory_url_index.cc637
-rw-r--r--chrome/browser/history/in_memory_url_index.h240
-rw-r--r--chrome/browser/history/in_memory_url_index_unittest.cc5
3 files changed, 0 insertions, 882 deletions
diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc
deleted file mode 100644
index 8c016a3..0000000
--- a/chrome/browser/history/in_memory_url_index.cc
+++ /dev/null
@@ -1,637 +0,0 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/browser/history/in_memory_url_index.h"
-
-#include <algorithm>
-#include <limits>
-#include <sstream>
-
-#include "app/sql/statement.h"
-#include "base/string_util.h"
-#include "third_party/icu/public/common/unicode/brkiter.h"
-#include "third_party/icu/public/common/unicode/utypes.h"
-
-using std::string;
-using icu::Locale;
-using icu::BreakIterator;
-using icu::RegexMatcher;
-using icu::RuleBasedBreakIterator;
-using icu::UnicodeString;
-using icu::StringCharacterIterator;
-
-namespace history {
-
-// Define the following to '1' if the parameters to the URL should not be
-// indexed.
-#define TRIM_PARAMETERS 1
-
-// Define the following to specify that only the most recent X history
-// entries are to be considered.
-#define LIMIT_HISTORY_TO 10000
-
-// Indexing
-
-bool InMemoryURLIndex::IndexURLHistory(const FilePath& history_path) {
- bool success = false;
- // Reset our indexes.
- char_word_map_.clear();
- word_id_history_map_.clear();
- if (history_database_.Open(history_path)) {
- if (!history_database_.DoesTableExist("urls")) {
- // TODO(mrossetti): Consider filtering only the most recent and the most
- // often referenced history items. Also consider usage count when scoring.
- std::ostringstream query_stream;
- query_stream << "SELECT id, url, title FROM urls";
-#if LIMIT_HISTORY_TO
- query_stream << " ORDER BY last_visit_time DESC LIMIT " <<
- LIMIT_HISTORY_TO;
-#endif // LIMIT_HISTORY_TO
- query_stream << ";";
- string query = query_stream.str();
- sql::Statement s(history_database_.GetUniqueStatement(query.c_str()));
- if (!s) {
- NOTREACHED() << "Statement prepare failed";
- return false;
- }
- success = true;
- while (s.Step() && success) {
- // Verify we got the expected columns.
- // We expect: 1) id, 2) url, 3) title.
- if (s.ColumnCount() == 3) {
- int64 row_id(s.ColumnInt64(0));
- string url(s.ColumnString(1));
- string title(s.ColumnString(2));
- if (!IndexRow(row_id, url, title))
- success = false;
- } else {
- success = false;
- }
- }
- }
- }
- return success;
-}
-
-// NOTE: Not thread-safe because RuleBasedBreakIterator::getRuleStatus isn't.
-bool InMemoryURLIndex::IndexRow(int64 row_id, string url, string title) {
- UnicodeString uni_string = UnicodeString::fromUTF8(url.c_str());
-
-#ifdef TRIM_PARAMETERS
- if (!TrimParameters(uni_string))
- return false;
-#endif
- if (!ConvertPercentEncoding(uni_string))
- return false;
-
- // TODO(mrossetti): Detect test_id > std::numeric_limits<HistoryID>::max().
- HistoryID history_id = static_cast<HistoryID>(row_id);
-
- // Add the row for quick lookup in the history info store.
- // NOTE: Add visit count and time to history item result for later scoring.
- transform(url.begin(), url.end(), url.begin(), tolower);
- string lower_title(title);
- transform(lower_title.begin(), lower_title.end(), lower_title.begin(),
- tolower);
- string time_string("");
- URLHistoryItem hist_item(history_id, url, lower_title, 0, time_string);
- history_info_map_.insert(std::make_pair(history_id, hist_item));
-
- // Catenate URL and title then split into individual, unique words.
- if (title[0]) {
- uni_string.append(" ");
- uni_string.append(UnicodeString::fromUTF8(title.c_str()));
- }
- UnicodeStringSet words = WordsFromUnicodeString(uni_string);
-
- // For each word, add a new entry into the word index referring to the
- // associated history item.
- for (UnicodeStringSet::iterator iter = words.begin();
- iter != words.end(); ++iter) {
- UnicodeStringSet::value_type uni_word = *iter;
- AddWordToIndex(uni_word, history_id);
- }
- ++history_item_count_;
- return true;
-}
-
-void InMemoryURLIndex::Reset() {
- history_database_.Close();
- word_list_.clear();
- word_map_.clear();
- char_word_map_.clear();
- word_id_history_map_.clear();
- history_info_map_.clear();
- history_item_count_ = 0;
-}
-
-// Searching
-
-URLItemVector InMemoryURLIndex::HistoryItemsForTerms(
- SearchTermsVector const& terms) {
- URLItemVector item_vector;
- if (terms.size()) {
- // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
- // and-sweep approach.
- ResetTermCharWordSetCache();
-
- // Lowercase the terms.
- // TODO(mrossetti): Another opportunity for a transform algorithm.
- SearchTermsVector lower_terms;
- for (SearchTermsVector::const_iterator term_iter = terms.begin();
- term_iter != terms.end(); ++term_iter) {
- SearchTermsVector::value_type lower_string(*term_iter);
- transform(lower_string.begin(), lower_string.end(), lower_string.begin(),
- tolower);
- lower_terms.push_back(lower_string);
- }
-
- // Composite a string by joining all terms, separated by a space.
- // TODO(mrossetti): Use a transform algorithm, eh.
- SearchTermsVector::value_type all_terms = lower_terms[0];
- for (SearchTermsVector::iterator term_iter = ++lower_terms.begin();
- term_iter != lower_terms.end(); ++term_iter) {
- all_terms += " " + *term_iter;
- }
-
- UnicodeString uni_string = UnicodeString::fromUTF8(all_terms.c_str());
- HistoryIDSet history_id_set = HistoryIDSetFromWords(uni_string);
-
- if (history_id_set.size()) {
- // For each item in the final candidate list, perform a substring match
- // for each original individual term and weed out non-matches.
- for (HistoryIDSet::iterator id_iter = history_id_set.begin();
- id_iter != history_id_set.end(); ++id_iter) {
- HistoryIDSet::value_type history_id = *id_iter;
- HistoryInfoMap::iterator hist_pos =
- history_info_map_.find(history_id);
- if (hist_pos != history_info_map_.end()) {
- URLHistoryItem& hist_item(hist_pos->second);
- if (!hist_item.TermIsMissing(&lower_terms))
- item_vector.push_back(hist_item);
- } else {
- // Handle unexpected missing history item here.
- }
- }
-
- // Score.
- // TODO(mrossetti): Implement scoring after looking at current impl.
- }
- }
-
- // Remove any stale TermCharWordSet's.
- TermCharWordSetVector new_vector;
- for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
- iter != term_char_word_set_cache_.end(); ++iter) {
- if (iter->used_)
- new_vector.push_back(*iter);
- }
- term_char_word_set_cache_ = new_vector;
-
- return item_vector;
-}
-
-HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(UnicodeString uni_word) {
- HistoryIDSet history_id_set;
-
- // For each character in the word, get the char/word_id map entry and
- // intersect with the set in an incremental manner.
- UCharSet uni_chars = CharactersFromUnicodeString(uni_word);
- WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
-
- // If any words resulted then we can compose a set of history IDs by unioning
- // the sets from each word.
- if (word_id_set.size()) {
- for (WordIDSet::iterator word_id_iter = word_id_set.begin();
- word_id_iter != word_id_set.end(); ++word_id_iter) {
- WordID word_id = *word_id_iter;
- WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
- if (word_iter != word_id_history_map_.end()) {
- HistoryIDSet& word_history_id_set(word_iter->second);
- history_id_set.insert(word_history_id_set.begin(),
- word_history_id_set.end());
- }
- }
- }
-
- return history_id_set;
-}
-
-void InMemoryURLIndex::ResetTermCharWordSetCache() {
- // TODO(mrossetti): Consider keeping more of the cache around for possible
- // repeat searches, except a 'shortcuts' approach might be better for that.
- // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
- for (TermCharWordSetVector::iterator iter =
- term_char_word_set_cache_.begin();
- iter != term_char_word_set_cache_.end(); ++iter)
- iter->used_ = false;
-}
-
-HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(icu::UnicodeString
- uni_string) {
- // Break the terms down into individual terms (words), get the candidate
- // set for each term, and intersect each to get a final candidate list.
- HistoryIDSet history_id_set;
- UnicodeStringSet words = WordsFromUnicodeString(uni_string);
- bool first_word = true;
- for (UnicodeStringSet::iterator iter = words.begin();
- iter != words.end(); ++iter) {
- UnicodeStringSet::value_type uni_word = *iter;
- HistoryIDSet term_history_id_set =
- InMemoryURLIndex::HistoryIDsForTerm(uni_word);
- if (first_word) {
- history_id_set = term_history_id_set;
- first_word = false;
- } else {
- HistoryIDSet old_history_id_set(history_id_set);
- history_id_set.clear();
- std::set_intersection(old_history_id_set.begin(),
- old_history_id_set.end(),
- term_history_id_set.begin(),
- term_history_id_set.end(),
- std::inserter(history_id_set,
- history_id_set.begin()));
- }
- }
- return history_id_set;
-}
-
-// Utility Functions
-
-UnicodeStringSet InMemoryURLIndex::WordsFromUnicodeString(
- UnicodeString uni_string) {
- UnicodeStringSet words;
- UErrorCode icu_status = U_ZERO_ERROR;
- // TODO(mrossetti): Get locale from browser:
- // icu::Locale(g_browser_process->GetApplicationLocale().c_str())
- Locale locale(Locale::getDefault());
-
- uni_string.toLower(locale); // Convert to lowercase.
-
- // Replace all | and _'s with a space.
- // TODO(mrossetti): Roll this into the word_breaker_.
- if (!bar_matcher_.get()) {
- bar_matcher_.reset(new RegexMatcher("[|_]+", 0, icu_status));
- if (U_FAILURE(icu_status)) {
- // There was a syntax error in the regular expression.
- return words;
- }
- }
- bar_matcher_->reset(uni_string);
- uni_string = bar_matcher_->replaceAll(" ", icu_status);
- if (U_FAILURE(icu_status)) {
- // There was a failure during the replace.
- return words;
- }
-
- // Break up the string into individual words.
- if (!word_breaker_.get()) {
- word_breaker_.reset(static_cast<RuleBasedBreakIterator*>
- (RuleBasedBreakIterator::createWordInstance(locale, icu_status)));
- if (U_FAILURE(icu_status)) {
- word_breaker_.reset();
- return words;
- }
- }
- word_breaker_->setText(uni_string);
- int32_t start = 0;
- int32_t previous_boundary = 0;
- while (start != BreakIterator::DONE) {
- int32_t breakType = word_breaker_->getRuleStatus();
- if (breakType != UBRK_WORD_NONE) {
- // TODO(mrossetti): Numbers may have non-numeric characters that
- // will have to be stripped before term matching. Do this during
- // term matching.
- // number: UBRK_WORD_NUMBER <= breakType < UBRK_WORD_NUMBER_LIMIT
- // word: UBRK_WORD_LETTER <= breakType < UBRK_WORD_LETTER_LIMIT
- // kana: UBRK_WORD_KANA <= breakType < UBRK_WORD_KANA_LIMIT
- // ideo: UBRK_WORD_IDEO <= breakType < UBRK_WORD_IDEO_LIMIT
- int32_t current_boundary = word_breaker_->current();
- int32_t word_length = current_boundary - previous_boundary;
- UnicodeString uni_word(uni_string, previous_boundary, word_length);
-
- // Remove non-word characters from the string.
- if (!non_alnum_matcher_.get()) {
- non_alnum_matcher_.reset(new RegexMatcher("\\W+", 0, icu_status));
- if (U_FAILURE(icu_status)) {
- // There was a syntax errors in the regular expression.
- return words;
- }
- }
- non_alnum_matcher_->reset(uni_word);
- uni_word = non_alnum_matcher_->replaceAll("", icu_status);
- if (U_FAILURE(icu_status)) {
- // There was a failure during the replace.
- break;
- }
-
- if (uni_word.length())
- words.insert(uni_word);
- previous_boundary = current_boundary;
- }
- start = word_breaker_->next();
- }
- return words;
-}
-
-UCharSet InMemoryURLIndex::CharactersFromUnicodeString(
- icu::UnicodeString uni_word) {
- UCharSet characters;
- icu::StringCharacterIterator ci(uni_word);
- for (UChar uni_char = ci.first();
- uni_char != icu::CharacterIterator::DONE; uni_char = ci.next())
- characters.insert(uni_char);
- return characters;
-}
-
-bool InMemoryURLIndex::TrimParameters(icu::UnicodeString& uni_string) {
- // Trim the URL at the first '?'.
- UErrorCode icu_status = U_ZERO_ERROR;
- if (!question_matcher_.get()) {
- question_matcher_.reset(new RegexMatcher("\\?", 0, icu_status));
- if (U_FAILURE(icu_status)) {
- // There was a syntax error in the regular expression.
- return false;
- }
- }
- question_matcher_->reset(uni_string);
- if (question_matcher_->find()) {
- int32_t start = question_matcher_->start(icu_status);
- // TODO(mrossetti): Check status here.
- uni_string.handleReplaceBetween(start, uni_string.length(), "");
- }
- return true;
-}
-
-bool InMemoryURLIndex::ConvertPercentEncoding(icu::UnicodeString& uni_string) {
- // TODO(mrossetti): Convert, don't remove, the '%xx' encodings.
- UErrorCode icu_status = U_ZERO_ERROR;
- if (!percent_matcher_.get()) {
- percent_matcher_.reset(new RegexMatcher("%[0-9a-fA-F]{2}", 0, icu_status));
- if (U_FAILURE(icu_status)) {
- // There was a syntax error in the regular expression.
- return false;
- }
- }
- percent_matcher_->reset(uni_string);
- uni_string = percent_matcher_->replaceAll(" ", icu_status);
- if (U_FAILURE(icu_status)) {
- // There was a failure during the replace.
- return false;
- }
- return true;
-}
-
-void InMemoryURLIndex::AddWordToIndex(icu::UnicodeString& uni_word,
- HistoryID history_id) {
- WordMap::iterator word_pos = word_map_.find(uni_word);
- if (word_pos != word_map_.end()) {
- // Update existing entry in the word/history index.
- WordID word_id = word_pos->second;
- WordIDHistoryMap::iterator history_pos =
- word_id_history_map_.find(word_id);
- if (history_pos != word_id_history_map_.end()) {
- HistoryIDSet& history_id_set(history_pos->second);
- history_id_set.insert(history_id);
- } else {
- // TODO(mrossetti): Handle the unexpected failure here.
- }
- } else {
- // Add a new word to the word list and the word map, and then create a
- // new entry in the word/history map.
- word_list_.push_back(uni_word);
- WordID word_id = word_list_.size() - 1;
- if (word_map_.insert(std::make_pair(uni_word, word_id)).second) {
- HistoryIDSet history_id_set;
- history_id_set.insert(history_id);
- if (word_id_history_map_.insert(
- std::make_pair(word_id, history_id_set)).second) {
- // For each character in the newly added word (i.e. a word that is not
- // already in the word index), add the word to the character index.
- UCharSet characters =
- InMemoryURLIndex::CharactersFromUnicodeString(uni_word);
- for (UCharSet::iterator uni_char_iter = characters.begin();
- uni_char_iter != characters.end(); ++uni_char_iter) {
- UCharSet::value_type uni_string = *uni_char_iter;
- CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string);
- if (char_iter != char_word_map_.end()) {
- // Update existing entry in the char/word index.
- WordIDSet& word_id_set(char_iter->second);
- word_id_set.insert(word_id);
- } else {
- // Create a new entry in the char/word index.
- WordIDSet word_id_set;
- word_id_set.insert(word_id);
- if (!char_word_map_.insert(std::make_pair(uni_string,
- word_id_set)).second) {
- // TODO(mrossetti): Handle the unexpected failure here.
- }
- }
- }
- } else {
- // TODO(mrossetti): Handle the unexpected failure here.
- }
- } else {
- // TODO(mrossetti): Handle the unexpected failure here.
- }
- }
-}
-
-WordIDSet InMemoryURLIndex::WordIDSetForTermChars(UCharSet const& uni_chars) {
- TermCharWordSet* best_term_char_word_set = NULL;
- bool set_found = false;
- size_t outstanding_count = 0;
- UCharSet outstanding_chars;
- for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
- iter != term_char_word_set_cache_.end(); ++iter) {
- TermCharWordSetVector::value_type& term_char_word_set(*iter);
- UCharSet& char_set(term_char_word_set.char_set_);
- UCharSet exclusions;
- set_difference(char_set.begin(), char_set.end(),
- uni_chars.begin(), uni_chars.end(),
- std::inserter(exclusions, exclusions.begin()));
- if (exclusions.size() == 0) {
- // Do a reverse difference so that we know which characters remain
- // to be indexed. Then decide if this is a better match than any
- // previous cached set.
- set_difference(uni_chars.begin(), uni_chars.end(),
- char_set.begin(), char_set.end(),
- std::inserter(exclusions, exclusions.begin()));
- if (!set_found || exclusions.size() < outstanding_count) {
- outstanding_chars = exclusions;
- best_term_char_word_set = &*iter;
- outstanding_count = exclusions.size();
- set_found = true;
- }
- }
- }
-
- WordIDSet word_id_set;
- if (set_found && outstanding_count == 0) {
- // If there were no oustanding characters then we can use the cached one.
- best_term_char_word_set->used_ = true;
- word_id_set = best_term_char_word_set->word_id_set_;
- } else {
- // Some or all of the characters remain to be indexed.
- bool first_character = true;
- if (set_found) {
- first_character = false;
- word_id_set = best_term_char_word_set->word_id_set_;
- } else {
- outstanding_chars = uni_chars;
- }
-
- for (UCharSet::iterator uni_char_iter = outstanding_chars.begin();
- uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
- UCharSet::value_type uni_char = *uni_char_iter;
- CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
- if (char_iter != char_word_map_.end()) {
- WordIDSet& char_word_id_set(char_iter->second);
- if (first_character) {
- word_id_set = char_word_id_set;
- first_character = false;
- } else {
- WordIDSet old_word_id_set(word_id_set);
- word_id_set.clear();
- std::set_intersection(old_word_id_set.begin(),
- old_word_id_set.end(),
- char_word_id_set.begin(),
- char_word_id_set.end(),
- std::inserter(word_id_set,
- word_id_set.begin()));
- }
- } else {
- // The character was not found so bail.
- word_id_set.clear();
- break;
- }
- }
- // Update the cache.
- if (!set_found || outstanding_count) {
- TermCharWordSet new_char_word_set(uni_chars, word_id_set, true);
- term_char_word_set_cache_.push_back(new_char_word_set);
- }
- }
- return word_id_set;
-}
-
-// Statistics
-
-#if HISTORY_INDEX_STATISTICS_ENABLED
-
-uint64 InMemoryURLIndex::IndexedHistoryCount() {
- return history_item_count_;
-}
-
-uint64 InMemoryURLIndex::IndexedWordCount() {
- return word_list_.size();
-}
-
-// Return a count of the characters which have been indexed.
-uint64 InMemoryURLIndex::IndexedCharacterCount() {
- return char_word_map_.size();
-}
-
-UnicodeStringVector InMemoryURLIndex::IndexedWords() const {
- UnicodeStringVector words(word_list_);
- return words; // Return a copy.
-}
-
-uint64 InMemoryURLIndex::NumberOfHistoryItemsForWord(UnicodeString uni_word) {
- uint64 history_count = 0;
- WordMap::iterator word_pos = word_map_.find(uni_word);
- if (word_pos != word_map_.end()) {
- WordID word_id = word_pos->second;
- WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
- if (history_pos != word_id_history_map_.end())
- history_count = history_pos->second.size();
- }
- return history_count;
-}
-
-UnicodeStringVector InMemoryURLIndex::IndexedCharacters() {
- UnicodeStringVector characters;
- for (CharWordIDMap::iterator iter = char_word_map_.begin();
- iter != char_word_map_.end(); ++iter)
- characters.push_back(iter->first);
- return characters;
-}
-
-uint64 InMemoryURLIndex::NumberOfWordsForChar(UChar uni_char) {
- uint64 word_count = 0;
- CharWordIDMap::iterator char_pos = char_word_map_.find(uni_char);
- if (char_pos != char_word_map_.end())
- word_count = char_pos->second.size();
- return word_count;
-}
-
-uint64 InMemoryURLIndex::WordListSize() {
- uint64 listSize = 0;
- // TODO(mrossetti): Use an algorithm.
- for (UnicodeStringVector::iterator iter = word_list_.begin();
- iter != word_list_.end(); ++iter) {
- listSize += iter->length(); // vector overhead is ignored.
- }
- return listSize;
-}
-
-uint64 InMemoryURLIndex::WordMapSize() {
- uint64 mapSize = 0;
- for (WordMap::iterator iter = word_map_.begin(); iter != word_map_.end();
- ++iter) {
- mapSize += sizeof(WordID) + iter->first.length();
- }
- return mapSize;
-}
-
-uint64 InMemoryURLIndex::WordHistoryMapSize() {
- uint64 mapSize = 0;
- for (WordIDHistoryMap::iterator iter = word_id_history_map_.begin();
- iter != word_id_history_map_.end(); ++iter) {
- mapSize += sizeof(WordID) + (iter->second.size() * sizeof(HistoryID));
- }
- return mapSize;
-}
-
-uint64 InMemoryURLIndex::CharWordMapSize() {
- uint64 mapSize = 0;
- for (CharWordIDMap::iterator iter = char_word_map_.begin();
- iter != char_word_map_.end(); ++iter) {
- mapSize += sizeof(UChar) + (iter->second.size() * sizeof(WordID));
- }
- return mapSize;
-}
-
-uint64 InMemoryURLIndex::HistoryInfoSize() {
- uint64 mapSize = 0;
- for (HistoryInfoMap::iterator iter = history_info_map_.begin();
- iter != history_info_map_.end(); ++iter) {
- mapSize += sizeof(iter->first);
- URLHistoryItem& hist_item(iter->second);
- mapSize += sizeof(hist_item.history_id_);
- mapSize += hist_item.url_.size();
- mapSize += hist_item.title_.size();
- mapSize += sizeof(hist_item.visit_count_);
- mapSize += hist_item.last_visit_time_.size();
- }
- return mapSize;
-}
-
-#endif // HISTORY_INDEX_STATISTICS_ENABLED
-
-// URLHistoryItem Methods
-
-bool URLHistoryItem::TermIsMissing(SearchTermsVector* terms_vector) const {
- for (SearchTermsVector::iterator term_iter = terms_vector->begin();
- term_iter != terms_vector->end(); ++term_iter) {
- SearchTermsVector::value_type& term(*term_iter);
- if (term.length() > 1 &&
- url_.find(term) == string::npos &&
- title_.find(term) == string::npos)
- return true;
- }
- return false;
-}
-
-} // namespace history
diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h
deleted file mode 100644
index 43b4236..0000000
--- a/chrome/browser/history/in_memory_url_index.h
+++ /dev/null
@@ -1,240 +0,0 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
-#define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
-
-#include <map>
-#include <set>
-#include <string>
-#include <vector>
-
-#include "app/sql/connection.h"
-#include "base/basictypes.h"
-#include "base/scoped_ptr.h"
-#include "base/string16.h"
-#include "third_party/icu/public/common/unicode/rbbi.h"
-#include "third_party/icu/public/i18n/unicode/regex.h"
-#include "third_party/icu/public/common/unicode/unistr.h"
-
-namespace history {
-
-// Define the following if your application would like to collect information
-// about the history index and memory usage.
-// #define HISTORY_INDEX_STATISTICS_ENABLED 1
-
-// TODO(mrossetti): Move as many as possible of these typedefs, structs,
-// and support classes into the InMemoryURLIndex class.
-
-// UnicodeString compare functor which assists in handling UBool.
-class UnicodeStringCompare {
- public:
- bool operator() (const icu::UnicodeString& lhs,
- const icu::UnicodeString& rhs) {
- return (lhs < rhs) ? true : false;
- }
-};
-
-// Convenience types
-typedef std::vector<icu::UnicodeString> UnicodeStringVector;
-typedef std::set<icu::UnicodeString, UnicodeStringCompare> UnicodeStringSet;
-typedef std::set<UChar> UCharSet;
-typedef std::vector<std::string> SearchTermsVector;
-
-// An index into list of all of the words we have indexed.
-typedef int16 WordID;
-
-// A map allowing a WordID to be determined given a word.
-typedef std::map<icu::UnicodeString, WordID, UnicodeStringCompare> WordMap;
-
-// A map from character to word_id's.
-typedef std::set<WordID> WordIDSet; // An index into the WordList.
-typedef std::map<UChar, WordIDSet> CharWordIDMap;
-
-// A map from word_id to history item.
-typedef int16 HistoryID; // uint16 might actually work for us.
-typedef std::set<HistoryID> HistoryIDSet;
-typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
-
-// Support caching of term character intersections so that we can optimize
-// searches which build upon a previous search.
-struct TermCharWordSet {
- TermCharWordSet(UCharSet char_set, WordIDSet word_id_set, bool used)
- : char_set_(char_set), word_id_set_(word_id_set), used_(used) {}
-
- UCharSet char_set_;
- WordIDSet word_id_set_;
- bool used_; // true if this set has been used for the current term search.
-};
-typedef std::vector<TermCharWordSet> TermCharWordSetVector;
-
-// Describes an entry in the urls table of the History database.
-struct URLHistoryItem {
- URLHistoryItem(HistoryID history_id,
- std::string& url,
- std::string& title,
- int visit_count,
- std::string& last_visit_time)
- : history_id_(history_id), url_(url), title_(title),
- visit_count_(visit_count), last_visit_time_(last_visit_time) {}
-
- // Return true if any of the search terms do not match a substring.
- // TODO(mrossetti): Also need to score and add UI hints.
- bool TermIsMissing(SearchTermsVector* terms_vector) const;
-
- HistoryID history_id_;
- std::string url_;
- std::string title_;
- int visit_count_;
- std::string last_visit_time_;
- // TODO(mrossetti): Make a class. Add fields for score and to provide
- // term matching metrics so a UI can highlight the exact match.
-};
-typedef std::vector<URLHistoryItem> URLItemVector;
-
-// TODO(mrossetti): Rip out either the SQLite filtering or the in-memory
-// HistoryInfo structure.
-// A map from history_id to the history's URL and title.
-typedef std::map<HistoryID, URLHistoryItem> HistoryInfoMap;
-
-// The history source.
-class InMemoryURLIndex {
- public:
- InMemoryURLIndex()
- : history_item_count_(0), question_matcher_(NULL),
- percent_matcher_(NULL), non_alnum_matcher_(NULL), bar_matcher_(NULL),
- word_breaker_(NULL) {}
- ~InMemoryURLIndex() {}
-
- // Open and index the URL history database.
- bool IndexURLHistory(const FilePath& history_path);
-
- // Reset the history index.
- void Reset();
-
- // Given a vector containing one or more words as utf8 strings, scan the
- // history index and return a vector with all scored, matching history items.
- // Each term must occur somewhere in the history item for the item to
- // qualify, however, the terms do not necessarily have to be adjacent.
- URLItemVector HistoryItemsForTerms(SearchTermsVector const& terms);
-
-#if HISTORY_INDEX_STATISTICS_ENABLED
-
- // Return a count of the history items which have been indexed.
- uint64 IndexedHistoryCount();
-
- // Return a count of the words which have been indexed.
- uint64 IndexedWordCount();
-
- // Return a count of the characters which have been indexed.
- uint64 IndexedCharacterCount();
-
- // Return a vector containing all of the words which have been indexed
- // as UTF8 strings.
- UnicodeStringVector IndexedWords() const;
-
- // Return the count of history items for the given word.
- uint64 NumberOfHistoryItemsForWord(icu::UnicodeString uni_word);
-
- // Return a vector containing all of the characters which have been indexed
- // as UTF8 strings.
- UnicodeStringVector IndexedCharacters();
-
- // Return the count of words for the given character.
- uint64 NumberOfWordsForChar(UChar uni_char);
-
- // Return the total size of the word list vector.
- uint64 WordListSize();
-
- // Return the total size of the word/word-id map.
- uint64 WordMapSize();
-
- // Return the total size of the word_id_history_map_.
- uint64 WordHistoryMapSize();
-
- // Return the total size of the char_word_map_.
- uint64 CharWordMapSize();
-
- // Return the total size of the char_word_map_.
- uint64 HistoryInfoSize();
-
-#endif // HISTORY_INDEX_STATISTICS_ENABLED
-
- private:
- // Break an UnicodeString down into individual words.
- UnicodeStringSet WordsFromUnicodeString(icu::UnicodeString uni_string);
-
- // Given a set of UChars, find words containing those characters. If any
- // existing, cached set is a proper subset then start with that cached
- // set. Update the cache.
- WordIDSet WordIDSetForTermChars(UCharSet const& uni_chars);
-
- // URL History indexing support functions.
-
- // Index one URL history item.
- bool IndexRow(int64 row_id, std::string url, std::string title);
-
- // Break an UnicodeString down into its individual characters.
- // Note that this is temporarily intended to work on a single word, but
- // _will_ work on a string of words, perhaps with unexpected results.
- // TODO(mrossetti): Lots of optimizations possible here for not restarting
- // a search if the user is just typing along. Also, change this to uniString
- // and properly handle substring matches, scoring and sorting the results
- // by score. Also, provide the metrics for where the matches occur so that
- // the UI can highlight the matched sections.
- UCharSet CharactersFromUnicodeString(icu::UnicodeString uni_word);
-
- // Trim the parameters from the URL being indexed. Return true if there
- // were not errors encountered during the operation.
- bool TrimParameters(icu::UnicodeString& uni_string);
-
- // Convert %xx encodings in the URL being indexed. Return true if there
- // were not errors encountered during the operation.
- bool ConvertPercentEncoding(icu::UnicodeString& uni_string);
-
- // Given a single word, add a reference to the containing history item
- // to the index.
- void AddWordToIndex(icu::UnicodeString& uni_word, HistoryID history_id);
-
- // URL History index search support functions.
-
- // Given a single search term (i.e. one word, utf8) return a set of
- // history IDs which map to that term but do not necessarily match
- // as a substring.
- HistoryIDSet HistoryIDsForTerm(icu::UnicodeString uni_word);
-
- // Clear the search term cache. This cache holds on to the intermediate
- // word results for each previously typed character to eliminate the need
- // to re-perform set operations for previously typed characters.
- void ResetTermCharWordSetCache();
-
- // Compose a set of history item IDs by intersecting the set for each word
- // in |uni_string|.
- HistoryIDSet HistoryIDSetFromWords(icu::UnicodeString uni_string);
-
- private:
- UnicodeStringVector word_list_; // A list of all of indexed words.
- // NOTE: A word will _never_ be removed from
- // this vector to insure that the word_id
- // remains immutable throughout the session,
- // reducing maintenance complexity.
- sql::Connection history_database_;
- uint64 history_item_count_;
- WordMap word_map_;
- CharWordIDMap char_word_map_;
- WordIDHistoryMap word_id_history_map_;
- TermCharWordSetVector term_char_word_set_cache_;
- HistoryInfoMap history_info_map_;
-
- // Cached items.
- scoped_ptr<icu::RegexMatcher> question_matcher_;
- scoped_ptr<icu::RegexMatcher> percent_matcher_;
- scoped_ptr<icu::RegexMatcher> non_alnum_matcher_;
- scoped_ptr<icu::RegexMatcher> bar_matcher_;
- scoped_ptr<icu::RuleBasedBreakIterator> word_breaker_;
-};
-
-} // namespace history
-
-#endif // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc
deleted file mode 100644
index 8dc9569..0000000
--- a/chrome/browser/history/in_memory_url_index_unittest.cc
+++ /dev/null
@@ -1,5 +0,0 @@
-// Copyright (c) 2010 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// TODO(mrossetti): Implement the unit tests