Add initial url/title history indexing and search provider support -- not hooked up at this point.

BUG=None TEST=None Review URL: http://codereview.chromium.org/2866025 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@51284 0039d316-1c4b-4281-b951-d872f2087c98
author: mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-06-30 19:01:22 +0000
committer: mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-06-30 19:01:22 +0000
commit: a88fdf531a6a3f34e22d3d8443516b3970e4362f (patch)
tree: ddf2df86e958505d27f739a2cd67211fcedf2f23 /chrome
parent: a38b3b5ed715dcf37adabaf8632baf26efdce8a0 (diff)
download: chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.zip
chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.tar.gz
chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.tar.bz2
5 files changed, 885 insertions, 0 deletions
diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc
new file mode 100644
index 0000000..8c016a3
--- /dev/null
+++ b/chrome/browser/history/in_memory_url_index.cc
@@ -0,0 +1,637 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/history/in_memory_url_index.h"
+
+#include <algorithm>
+#include <limits>
+#include <sstream>
+
+#include "app/sql/statement.h"
+#include "base/string_util.h"
+#include "third_party/icu/public/common/unicode/brkiter.h"
+#include "third_party/icu/public/common/unicode/utypes.h"
+
+using std::string;
+using icu::Locale;
+using icu::BreakIterator;
+using icu::RegexMatcher;
+using icu::RuleBasedBreakIterator;
+using icu::UnicodeString;
+using icu::StringCharacterIterator;
+
+namespace history {
+
+// Define the following to '1' if the parameters to the URL should not be
+// indexed.
+#define TRIM_PARAMETERS 1
+
+// Define the following to specify that only the most recent X history
+// entries are to be considered.
+#define LIMIT_HISTORY_TO 10000
+
+// Indexing
+
+bool InMemoryURLIndex::IndexURLHistory(const FilePath& history_path) {
+  bool success = false;
+  // Reset our indexes.
+  char_word_map_.clear();
+  word_id_history_map_.clear();
+  if (history_database_.Open(history_path)) {
+    if (!history_database_.DoesTableExist("urls")) {
+      // TODO(mrossetti): Consider filtering only the most recent and the most
+      // often referenced history items. Also consider usage count when scoring.
+      std::ostringstream query_stream;
+      query_stream << "SELECT id, url, title FROM urls";
+#if LIMIT_HISTORY_TO
+      query_stream << " ORDER BY last_visit_time DESC LIMIT " <<
+          LIMIT_HISTORY_TO;
+#endif  // LIMIT_HISTORY_TO
+      query_stream << ";";
+      string query = query_stream.str();
+      sql::Statement s(history_database_.GetUniqueStatement(query.c_str()));
+      if (!s) {
+        NOTREACHED() << "Statement prepare failed";
+        return false;
+      }
+      success = true;
+      while (s.Step() && success) {
+        // Verify we got the expected columns.
+        // We expect: 1) id, 2) url, 3) title.
+        if (s.ColumnCount() == 3) {
+          int64 row_id(s.ColumnInt64(0));
+          string url(s.ColumnString(1));
+          string title(s.ColumnString(2));
+          if (!IndexRow(row_id, url, title))
+            success = false;
+        } else {
+          success = false;
+        }
+      }
+    }
+  }
+  return success;
+}
+
+// NOTE: Not thread-safe because RuleBasedBreakIterator::getRuleStatus isn't.
+bool InMemoryURLIndex::IndexRow(int64 row_id, string url, string title) {
+  UnicodeString uni_string = UnicodeString::fromUTF8(url.c_str());
+
+#ifdef TRIM_PARAMETERS
+  if (!TrimParameters(uni_string))
+    return false;
+#endif
+  if (!ConvertPercentEncoding(uni_string))
+    return false;
+
+  // TODO(mrossetti): Detect test_id > std::numeric_limits<HistoryID>::max().
+  HistoryID history_id = static_cast<HistoryID>(row_id);
+
+  // Add the row for quick lookup in the history info store.
+  // NOTE: Add visit count and time to history item result for later scoring.
+  transform(url.begin(), url.end(), url.begin(), tolower);
+  string lower_title(title);
+  transform(lower_title.begin(), lower_title.end(), lower_title.begin(),
+            tolower);
+  string time_string("");
+  URLHistoryItem hist_item(history_id, url, lower_title, 0, time_string);
+  history_info_map_.insert(std::make_pair(history_id, hist_item));
+
+  // Catenate URL and title then split into individual, unique words.
+  if (title[0]) {
+    uni_string.append(" ");
+    uni_string.append(UnicodeString::fromUTF8(title.c_str()));
+  }
+  UnicodeStringSet words = WordsFromUnicodeString(uni_string);
+
+  // For each word, add a new entry into the word index referring to the
+  // associated history item.
+  for (UnicodeStringSet::iterator iter = words.begin();
+       iter != words.end(); ++iter) {
+    UnicodeStringSet::value_type uni_word = *iter;
+    AddWordToIndex(uni_word, history_id);
+  }
+  ++history_item_count_;
+  return true;
+}
+
+void InMemoryURLIndex::Reset() {
+  history_database_.Close();
+  word_list_.clear();
+  word_map_.clear();
+  char_word_map_.clear();
+  word_id_history_map_.clear();
+  history_info_map_.clear();
+  history_item_count_ = 0;
+}
+
+// Searching
+
+URLItemVector InMemoryURLIndex::HistoryItemsForTerms(
+    SearchTermsVector const& terms) {
+  URLItemVector item_vector;
+  if (terms.size()) {
+    // Reset used_ flags for term_char_word_set_cache_. We use a basic mark-
+    // and-sweep approach.
+    ResetTermCharWordSetCache();
+
+    // Lowercase the terms.
+    // TODO(mrossetti): Another opportunity for a transform algorithm.
+    SearchTermsVector lower_terms;
+    for (SearchTermsVector::const_iterator term_iter = terms.begin();
+         term_iter != terms.end(); ++term_iter) {
+      SearchTermsVector::value_type lower_string(*term_iter);
+      transform(lower_string.begin(), lower_string.end(), lower_string.begin(),
+                tolower);
+      lower_terms.push_back(lower_string);
+    }
+
+    // Composite a string by joining all terms, separated by a space.
+    // TODO(mrossetti): Use a transform algorithm, eh.
+    SearchTermsVector::value_type all_terms = lower_terms[0];
+    for (SearchTermsVector::iterator term_iter = ++lower_terms.begin();
+         term_iter != lower_terms.end(); ++term_iter) {
+      all_terms += " " + *term_iter;
+    }
+
+    UnicodeString uni_string = UnicodeString::fromUTF8(all_terms.c_str());
+    HistoryIDSet history_id_set = HistoryIDSetFromWords(uni_string);
+
+    if (history_id_set.size()) {
+      // For each item in the final candidate list, perform a substring match
+      // for each original individual term and weed out non-matches.
+      for (HistoryIDSet::iterator id_iter = history_id_set.begin();
+           id_iter != history_id_set.end(); ++id_iter) {
+        HistoryIDSet::value_type history_id = *id_iter;
+        HistoryInfoMap::iterator hist_pos =
+            history_info_map_.find(history_id);
+        if (hist_pos != history_info_map_.end()) {
+          URLHistoryItem& hist_item(hist_pos->second);
+          if (!hist_item.TermIsMissing(&lower_terms))
+            item_vector.push_back(hist_item);
+        } else {
+          // Handle unexpected missing history item here.
+        }
+      }
+
+      // Score.
+      // TODO(mrossetti): Implement scoring after looking at current impl.
+    }
+  }
+
+  // Remove any stale TermCharWordSet's.
+  TermCharWordSetVector new_vector;
+  for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
+       iter != term_char_word_set_cache_.end(); ++iter) {
+    if (iter->used_)
+      new_vector.push_back(*iter);
+  }
+  term_char_word_set_cache_ = new_vector;
+
+  return item_vector;
+}
+
+HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm(UnicodeString uni_word) {
+  HistoryIDSet history_id_set;
+
+  // For each character in the word, get the char/word_id map entry and
+  // intersect with the set in an incremental manner.
+  UCharSet uni_chars = CharactersFromUnicodeString(uni_word);
+  WordIDSet word_id_set(WordIDSetForTermChars(uni_chars));
+
+  // If any words resulted then we can compose a set of history IDs by unioning
+  // the sets from each word.
+  if (word_id_set.size()) {
+    for (WordIDSet::iterator word_id_iter = word_id_set.begin();
+         word_id_iter != word_id_set.end(); ++word_id_iter) {
+      WordID word_id = *word_id_iter;
+      WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
+      if (word_iter != word_id_history_map_.end()) {
+        HistoryIDSet& word_history_id_set(word_iter->second);
+        history_id_set.insert(word_history_id_set.begin(),
+                              word_history_id_set.end());
+      }
+    }
+  }
+
+  return history_id_set;
+}
+
+void InMemoryURLIndex::ResetTermCharWordSetCache() {
+  // TODO(mrossetti): Consider keeping more of the cache around for possible
+  // repeat searches, except a 'shortcuts' approach might be better for that.
+  // TODO(mrossetti): Change TermCharWordSet to a class and use for_each.
+  for (TermCharWordSetVector::iterator iter =
+       term_char_word_set_cache_.begin();
+       iter != term_char_word_set_cache_.end(); ++iter)
+    iter->used_ = false;
+}
+
+HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords(icu::UnicodeString
+                                                     uni_string) {
+  // Break the terms down into individual terms (words), get the candidate
+  // set for each term, and intersect each to get a final candidate list.
+  HistoryIDSet history_id_set;
+  UnicodeStringSet words = WordsFromUnicodeString(uni_string);
+  bool first_word = true;
+  for (UnicodeStringSet::iterator iter = words.begin();
+       iter != words.end(); ++iter) {
+    UnicodeStringSet::value_type uni_word = *iter;
+    HistoryIDSet term_history_id_set =
+    InMemoryURLIndex::HistoryIDsForTerm(uni_word);
+    if (first_word) {
+      history_id_set = term_history_id_set;
+      first_word = false;
+    } else {
+      HistoryIDSet old_history_id_set(history_id_set);
+      history_id_set.clear();
+      std::set_intersection(old_history_id_set.begin(),
+                            old_history_id_set.end(),
+                            term_history_id_set.begin(),
+                            term_history_id_set.end(),
+                            std::inserter(history_id_set,
+                                          history_id_set.begin()));
+    }
+  }
+  return history_id_set;
+}
+
+// Utility Functions
+
+UnicodeStringSet InMemoryURLIndex::WordsFromUnicodeString(
+    UnicodeString uni_string) {
+  UnicodeStringSet words;
+  UErrorCode icu_status = U_ZERO_ERROR;
+  // TODO(mrossetti): Get locale from browser:
+  //      icu::Locale(g_browser_process->GetApplicationLocale().c_str())
+  Locale locale(Locale::getDefault());
+
+  uni_string.toLower(locale);  // Convert to lowercase.
+
+  // Replace all | and _'s with a space.
+  // TODO(mrossetti): Roll this into the word_breaker_.
+  if (!bar_matcher_.get()) {
+    bar_matcher_.reset(new RegexMatcher("[|_]+", 0, icu_status));
+    if (U_FAILURE(icu_status)) {
+      // There was a syntax error in the regular expression.
+      return words;
+    }
+  }
+  bar_matcher_->reset(uni_string);
+  uni_string = bar_matcher_->replaceAll(" ", icu_status);
+  if (U_FAILURE(icu_status)) {
+    // There was a failure during the replace.
+    return words;
+  }
+
+  // Break up the string into individual words.
+  if (!word_breaker_.get()) {
+    word_breaker_.reset(static_cast<RuleBasedBreakIterator*>
+        (RuleBasedBreakIterator::createWordInstance(locale, icu_status)));
+    if (U_FAILURE(icu_status)) {
+      word_breaker_.reset();
+      return words;
+    }
+  }
+  word_breaker_->setText(uni_string);
+  int32_t start = 0;
+  int32_t previous_boundary = 0;
+  while (start != BreakIterator::DONE) {
+    int32_t breakType = word_breaker_->getRuleStatus();
+    if (breakType != UBRK_WORD_NONE) {
+      // TODO(mrossetti): Numbers may have non-numeric characters that
+      //    will have to be stripped before term matching. Do this during
+      //    term matching.
+      // number: UBRK_WORD_NUMBER <= breakType < UBRK_WORD_NUMBER_LIMIT
+      // word: UBRK_WORD_LETTER <= breakType < UBRK_WORD_LETTER_LIMIT
+      // kana: UBRK_WORD_KANA <= breakType < UBRK_WORD_KANA_LIMIT
+      // ideo: UBRK_WORD_IDEO <= breakType < UBRK_WORD_IDEO_LIMIT
+      int32_t current_boundary = word_breaker_->current();
+      int32_t word_length = current_boundary - previous_boundary;
+      UnicodeString uni_word(uni_string, previous_boundary, word_length);
+
+      // Remove non-word characters from the string.
+      if (!non_alnum_matcher_.get()) {
+        non_alnum_matcher_.reset(new RegexMatcher("\\W+", 0, icu_status));
+        if (U_FAILURE(icu_status)) {
+          // There was a syntax errors in the regular expression.
+          return words;
+        }
+      }
+      non_alnum_matcher_->reset(uni_word);
+      uni_word = non_alnum_matcher_->replaceAll("", icu_status);
+      if (U_FAILURE(icu_status)) {
+        // There was a failure during the replace.
+        break;
+      }
+
+      if (uni_word.length())
+        words.insert(uni_word);
+      previous_boundary = current_boundary;
+    }
+    start = word_breaker_->next();
+  }
+  return words;
+}
+
+UCharSet InMemoryURLIndex::CharactersFromUnicodeString(
+    icu::UnicodeString uni_word) {
+  UCharSet characters;
+  icu::StringCharacterIterator ci(uni_word);
+  for (UChar uni_char = ci.first();
+       uni_char != icu::CharacterIterator::DONE; uni_char = ci.next())
+    characters.insert(uni_char);
+  return characters;
+}
+
+bool InMemoryURLIndex::TrimParameters(icu::UnicodeString& uni_string) {
+  // Trim the URL at the first '?'.
+  UErrorCode icu_status = U_ZERO_ERROR;
+  if (!question_matcher_.get()) {
+    question_matcher_.reset(new RegexMatcher("\\?", 0, icu_status));
+    if (U_FAILURE(icu_status)) {
+      // There was a syntax error in the regular expression.
+      return false;
+    }
+  }
+  question_matcher_->reset(uni_string);
+  if (question_matcher_->find()) {
+    int32_t start = question_matcher_->start(icu_status);
+    // TODO(mrossetti): Check status here.
+    uni_string.handleReplaceBetween(start, uni_string.length(), "");
+  }
+  return true;
+}
+
+bool InMemoryURLIndex::ConvertPercentEncoding(icu::UnicodeString& uni_string) {
+  // TODO(mrossetti): Convert, don't remove, the '%xx' encodings.
+  UErrorCode icu_status = U_ZERO_ERROR;
+  if (!percent_matcher_.get()) {
+    percent_matcher_.reset(new RegexMatcher("%[0-9a-fA-F]{2}", 0, icu_status));
+    if (U_FAILURE(icu_status)) {
+      // There was a syntax error in the regular expression.
+      return false;
+    }
+  }
+  percent_matcher_->reset(uni_string);
+  uni_string = percent_matcher_->replaceAll(" ", icu_status);
+  if (U_FAILURE(icu_status)) {
+    // There was a failure during the replace.
+    return false;
+  }
+  return true;
+}
+
+void InMemoryURLIndex::AddWordToIndex(icu::UnicodeString& uni_word,
+                                      HistoryID history_id) {
+  WordMap::iterator word_pos = word_map_.find(uni_word);
+  if (word_pos != word_map_.end()) {
+    // Update existing entry in the word/history index.
+    WordID word_id = word_pos->second;
+    WordIDHistoryMap::iterator history_pos =
+    word_id_history_map_.find(word_id);
+    if (history_pos != word_id_history_map_.end()) {
+      HistoryIDSet& history_id_set(history_pos->second);
+      history_id_set.insert(history_id);
+    } else {
+      // TODO(mrossetti): Handle the unexpected failure here.
+    }
+  } else {
+    // Add a new word to the word list and the word map, and then create a
+    // new entry in the word/history map.
+    word_list_.push_back(uni_word);
+    WordID word_id = word_list_.size() - 1;
+    if (word_map_.insert(std::make_pair(uni_word, word_id)).second) {
+      HistoryIDSet history_id_set;
+      history_id_set.insert(history_id);
+      if (word_id_history_map_.insert(
+          std::make_pair(word_id, history_id_set)).second) {
+        // For each character in the newly added word (i.e. a word that is not
+        // already in the word index), add the word to the character index.
+        UCharSet characters =
+        InMemoryURLIndex::CharactersFromUnicodeString(uni_word);
+        for (UCharSet::iterator uni_char_iter = characters.begin();
+             uni_char_iter != characters.end(); ++uni_char_iter) {
+          UCharSet::value_type uni_string = *uni_char_iter;
+          CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string);
+          if (char_iter != char_word_map_.end()) {
+            // Update existing entry in the char/word index.
+            WordIDSet& word_id_set(char_iter->second);
+            word_id_set.insert(word_id);
+          } else {
+            // Create a new entry in the char/word index.
+            WordIDSet word_id_set;
+            word_id_set.insert(word_id);
+            if (!char_word_map_.insert(std::make_pair(uni_string,
+                                                      word_id_set)).second) {
+              // TODO(mrossetti): Handle the unexpected failure here.
+            }
+          }
+        }
+      } else {
+        // TODO(mrossetti): Handle the unexpected failure here.
+      }
+    } else {
+      // TODO(mrossetti): Handle the unexpected failure here.
+    }
+  }
+}
+
+WordIDSet InMemoryURLIndex::WordIDSetForTermChars(UCharSet const& uni_chars) {
+  TermCharWordSet* best_term_char_word_set = NULL;
+  bool set_found = false;
+  size_t outstanding_count = 0;
+  UCharSet outstanding_chars;
+  for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin();
+       iter != term_char_word_set_cache_.end(); ++iter) {
+    TermCharWordSetVector::value_type& term_char_word_set(*iter);
+    UCharSet& char_set(term_char_word_set.char_set_);
+    UCharSet exclusions;
+    set_difference(char_set.begin(), char_set.end(),
+                   uni_chars.begin(), uni_chars.end(),
+                   std::inserter(exclusions, exclusions.begin()));
+    if (exclusions.size() == 0) {
+      // Do a reverse difference so that we know which characters remain
+      // to be indexed. Then decide if this is a better match than any
+      // previous cached set.
+      set_difference(uni_chars.begin(), uni_chars.end(),
+                     char_set.begin(), char_set.end(),
+                     std::inserter(exclusions, exclusions.begin()));
+      if (!set_found || exclusions.size() < outstanding_count) {
+        outstanding_chars = exclusions;
+        best_term_char_word_set = &*iter;
+        outstanding_count = exclusions.size();
+        set_found = true;
+      }
+    }
+  }
+
+  WordIDSet word_id_set;
+  if (set_found && outstanding_count == 0) {
+    // If there were no oustanding characters then we can use the cached one.
+    best_term_char_word_set->used_ = true;
+    word_id_set = best_term_char_word_set->word_id_set_;
+  } else {
+    // Some or all of the characters remain to be indexed.
+    bool first_character = true;
+    if (set_found) {
+      first_character = false;
+      word_id_set = best_term_char_word_set->word_id_set_;
+    } else {
+      outstanding_chars = uni_chars;
+    }
+
+    for (UCharSet::iterator uni_char_iter = outstanding_chars.begin();
+         uni_char_iter != outstanding_chars.end(); ++uni_char_iter) {
+      UCharSet::value_type uni_char = *uni_char_iter;
+      CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
+      if (char_iter != char_word_map_.end()) {
+        WordIDSet& char_word_id_set(char_iter->second);
+        if (first_character) {
+          word_id_set = char_word_id_set;
+          first_character = false;
+        } else {
+          WordIDSet old_word_id_set(word_id_set);
+          word_id_set.clear();
+          std::set_intersection(old_word_id_set.begin(),
+                                old_word_id_set.end(),
+                                char_word_id_set.begin(),
+                                char_word_id_set.end(),
+                                std::inserter(word_id_set,
+                                word_id_set.begin()));
+        }
+      } else {
+        // The character was not found so bail.
+        word_id_set.clear();
+        break;
+      }
+    }
+    // Update the cache.
+    if (!set_found || outstanding_count) {
+      TermCharWordSet new_char_word_set(uni_chars, word_id_set, true);
+      term_char_word_set_cache_.push_back(new_char_word_set);
+    }
+  }
+  return word_id_set;
+}
+
+// Statistics
+
+#if HISTORY_INDEX_STATISTICS_ENABLED
+
+uint64 InMemoryURLIndex::IndexedHistoryCount() {
+  return history_item_count_;
+}
+
+uint64 InMemoryURLIndex::IndexedWordCount() {
+  return word_list_.size();
+}
+
+// Return a count of the characters which have been indexed.
+uint64 InMemoryURLIndex::IndexedCharacterCount() {
+  return char_word_map_.size();
+}
+
+UnicodeStringVector InMemoryURLIndex::IndexedWords() const {
+  UnicodeStringVector words(word_list_);
+  return words;  // Return a copy.
+}
+
+uint64 InMemoryURLIndex::NumberOfHistoryItemsForWord(UnicodeString uni_word) {
+  uint64 history_count = 0;
+  WordMap::iterator word_pos = word_map_.find(uni_word);
+  if (word_pos != word_map_.end()) {
+    WordID word_id = word_pos->second;
+    WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
+    if (history_pos != word_id_history_map_.end())
+      history_count = history_pos->second.size();
+  }
+  return history_count;
+}
+
+UnicodeStringVector InMemoryURLIndex::IndexedCharacters() {
+  UnicodeStringVector characters;
+  for (CharWordIDMap::iterator iter = char_word_map_.begin();
+       iter != char_word_map_.end(); ++iter)
+    characters.push_back(iter->first);
+  return characters;
+}
+
+uint64 InMemoryURLIndex::NumberOfWordsForChar(UChar uni_char) {
+  uint64 word_count = 0;
+  CharWordIDMap::iterator char_pos = char_word_map_.find(uni_char);
+  if (char_pos != char_word_map_.end())
+    word_count = char_pos->second.size();
+  return word_count;
+}
+
+uint64 InMemoryURLIndex::WordListSize() {
+  uint64 listSize = 0;
+  // TODO(mrossetti): Use an algorithm.
+  for (UnicodeStringVector::iterator iter = word_list_.begin();
+       iter != word_list_.end(); ++iter) {
+    listSize += iter->length();  // vector overhead is ignored.
+  }
+  return listSize;
+}
+
+uint64 InMemoryURLIndex::WordMapSize() {
+  uint64 mapSize = 0;
+  for (WordMap::iterator iter = word_map_.begin(); iter != word_map_.end();
+       ++iter) {
+    mapSize += sizeof(WordID) + iter->first.length();
+  }
+  return mapSize;
+}
+
+uint64 InMemoryURLIndex::WordHistoryMapSize() {
+  uint64 mapSize = 0;
+  for (WordIDHistoryMap::iterator iter = word_id_history_map_.begin();
+       iter != word_id_history_map_.end(); ++iter) {
+    mapSize += sizeof(WordID) + (iter->second.size() * sizeof(HistoryID));
+  }
+  return mapSize;
+}
+
+uint64 InMemoryURLIndex::CharWordMapSize() {
+  uint64 mapSize = 0;
+  for (CharWordIDMap::iterator iter = char_word_map_.begin();
+       iter != char_word_map_.end(); ++iter) {
+    mapSize += sizeof(UChar) + (iter->second.size() * sizeof(WordID));
+  }
+  return mapSize;
+}
+
+uint64 InMemoryURLIndex::HistoryInfoSize() {
+  uint64 mapSize = 0;
+  for (HistoryInfoMap::iterator iter = history_info_map_.begin();
+       iter != history_info_map_.end(); ++iter) {
+    mapSize += sizeof(iter->first);
+    URLHistoryItem& hist_item(iter->second);
+    mapSize += sizeof(hist_item.history_id_);
+    mapSize += hist_item.url_.size();
+    mapSize += hist_item.title_.size();
+    mapSize += sizeof(hist_item.visit_count_);
+    mapSize += hist_item.last_visit_time_.size();
+  }
+  return mapSize;
+}
+
+#endif  // HISTORY_INDEX_STATISTICS_ENABLED
+
+// URLHistoryItem Methods
+
+bool URLHistoryItem::TermIsMissing(SearchTermsVector* terms_vector) const {
+  for (SearchTermsVector::iterator term_iter = terms_vector->begin();
+       term_iter != terms_vector->end(); ++term_iter) {
+    SearchTermsVector::value_type& term(*term_iter);
+    if (term.length() > 1 &&
+        url_.find(term) == string::npos &&
+        title_.find(term) == string::npos)
+      return true;
+  }
+  return false;
+}
+
+}  // namespace history
diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h
new file mode 100644
index 0000000..43b4236
--- /dev/null
+++ b/chrome/browser/history/in_memory_url_index.h
@@ -0,0 +1,240 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
+#define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "app/sql/connection.h"
+#include "base/basictypes.h"
+#include "base/scoped_ptr.h"
+#include "base/string16.h"
+#include "third_party/icu/public/common/unicode/rbbi.h"
+#include "third_party/icu/public/i18n/unicode/regex.h"
+#include "third_party/icu/public/common/unicode/unistr.h"
+
+namespace history {
+
+// Define the following if your application would like to collect information
+// about the history index and memory usage.
+// #define HISTORY_INDEX_STATISTICS_ENABLED 1
+
+// TODO(mrossetti): Move as many as possible of these typedefs, structs,
+// and support classes into the InMemoryURLIndex class.
+
+// UnicodeString compare functor which assists in handling UBool.
+class UnicodeStringCompare {
+ public:
+  bool operator() (const icu::UnicodeString& lhs,
+                   const icu::UnicodeString& rhs) {
+    return (lhs < rhs) ? true : false;
+  }
+};
+
+// Convenience types
+typedef std::vector<icu::UnicodeString> UnicodeStringVector;
+typedef std::set<icu::UnicodeString, UnicodeStringCompare> UnicodeStringSet;
+typedef std::set<UChar> UCharSet;
+typedef std::vector<std::string> SearchTermsVector;
+
+// An index into list of all of the words we have indexed.
+typedef int16 WordID;
+
+// A map allowing a WordID to be determined given a word.
+typedef std::map<icu::UnicodeString, WordID, UnicodeStringCompare> WordMap;
+
+// A map from character to word_id's.
+typedef std::set<WordID> WordIDSet;  // An index into the WordList.
+typedef std::map<UChar, WordIDSet> CharWordIDMap;
+
+// A map from word_id to history item.
+typedef int16 HistoryID;  // uint16 might actually work for us.
+typedef std::set<HistoryID> HistoryIDSet;
+typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
+
+// Support caching of term character intersections so that we can optimize
+// searches which build upon a previous search.
+struct TermCharWordSet {
+  TermCharWordSet(UCharSet char_set, WordIDSet word_id_set, bool used)
+      : char_set_(char_set), word_id_set_(word_id_set), used_(used) {}
+
+  UCharSet char_set_;
+  WordIDSet word_id_set_;
+  bool used_;  // true if this set has been used for the current term search.
+};
+typedef std::vector<TermCharWordSet> TermCharWordSetVector;
+
+// Describes an entry in the urls table of the History database.
+struct URLHistoryItem {
+  URLHistoryItem(HistoryID history_id,
+                 std::string& url,
+                 std::string& title,
+                 int visit_count,
+                 std::string& last_visit_time)
+      : history_id_(history_id), url_(url), title_(title),
+        visit_count_(visit_count), last_visit_time_(last_visit_time) {}
+
+  // Return true if any of the search terms do not match a substring.
+  // TODO(mrossetti): Also need to score and add UI hints.
+  bool TermIsMissing(SearchTermsVector* terms_vector) const;
+
+  HistoryID history_id_;
+  std::string url_;
+  std::string title_;
+  int visit_count_;
+  std::string last_visit_time_;
+  // TODO(mrossetti): Make a class. Add fields for score and to provide
+  // term matching metrics so a UI can highlight the exact match.
+};
+typedef std::vector<URLHistoryItem> URLItemVector;
+
+// TODO(mrossetti): Rip out either the SQLite filtering or the in-memory
+// HistoryInfo structure.
+// A map from history_id to the history's URL and title.
+typedef std::map<HistoryID, URLHistoryItem> HistoryInfoMap;
+
+// The history source.
+class InMemoryURLIndex {
+ public:
+  InMemoryURLIndex()
+      : history_item_count_(0), question_matcher_(NULL),
+        percent_matcher_(NULL), non_alnum_matcher_(NULL), bar_matcher_(NULL),
+        word_breaker_(NULL)  {}
+  ~InMemoryURLIndex() {}
+
+  // Open and index the URL history database.
+  bool IndexURLHistory(const FilePath& history_path);
+
+  // Reset the history index.
+  void Reset();
+
+  // Given a vector containing one or more words as utf8 strings, scan the
+  // history index and return a vector with all scored, matching history items.
+  // Each term must occur somewhere in the history item for the item to
+  // qualify, however, the terms do not necessarily have to be adjacent.
+  URLItemVector HistoryItemsForTerms(SearchTermsVector const& terms);
+
+#if HISTORY_INDEX_STATISTICS_ENABLED
+
+  // Return a count of the history items which have been indexed.
+  uint64 IndexedHistoryCount();
+
+  // Return a count of the words which have been indexed.
+  uint64 IndexedWordCount();
+
+  // Return a count of the characters which have been indexed.
+  uint64 IndexedCharacterCount();
+
+  // Return a vector containing all of the words which have been indexed
+  // as UTF8 strings.
+  UnicodeStringVector IndexedWords() const;
+
+  // Return the count of history items for the given word.
+  uint64 NumberOfHistoryItemsForWord(icu::UnicodeString uni_word);
+
+  // Return a vector containing all of the characters which have been indexed
+  // as UTF8 strings.
+  UnicodeStringVector IndexedCharacters();
+
+  // Return the count of words for the given character.
+  uint64 NumberOfWordsForChar(UChar uni_char);
+
+  // Return the total size of the word list vector.
+  uint64 WordListSize();
+
+  // Return the total size of the word/word-id map.
+  uint64 WordMapSize();
+
+  // Return the total size of the word_id_history_map_.
+  uint64 WordHistoryMapSize();
+
+  // Return the total size of the char_word_map_.
+  uint64 CharWordMapSize();
+
+  // Return the total size of the char_word_map_.
+  uint64 HistoryInfoSize();
+
+#endif  // HISTORY_INDEX_STATISTICS_ENABLED
+
+ private:
+  // Break an UnicodeString down into individual words.
+  UnicodeStringSet WordsFromUnicodeString(icu::UnicodeString uni_string);
+
+  // Given a set of UChars, find words containing those characters. If any
+  // existing, cached set is a proper subset then start with that cached
+  // set. Update the cache.
+  WordIDSet WordIDSetForTermChars(UCharSet const& uni_chars);
+
+  // URL History indexing support functions.
+
+  // Index one URL history item.
+  bool IndexRow(int64 row_id, std::string url, std::string title);
+
+  // Break an UnicodeString down into its individual characters.
+  // Note that this is temporarily intended to work on a single word, but
+  // _will_ work on a string of words, perhaps with unexpected results.
+  // TODO(mrossetti): Lots of optimizations possible here for not restarting
+  // a search if the user is just typing along. Also, change this to uniString
+  // and properly handle substring matches, scoring and sorting the results
+  // by score. Also, provide the metrics for where the matches occur so that
+  // the UI can highlight the matched sections.
+  UCharSet CharactersFromUnicodeString(icu::UnicodeString uni_word);
+
+  // Trim the parameters from the URL being indexed. Return true if there
+  // were not errors encountered during the operation.
+  bool TrimParameters(icu::UnicodeString& uni_string);
+
+  // Convert %xx encodings in the URL being indexed. Return true if there
+  // were not errors encountered during the operation.
+  bool ConvertPercentEncoding(icu::UnicodeString& uni_string);
+
+  // Given a single word, add a reference to the containing history item
+  // to the index.
+  void AddWordToIndex(icu::UnicodeString& uni_word, HistoryID history_id);
+
+  // URL History index search support functions.
+
+  // Given a single search term (i.e. one word, utf8) return a set of
+  // history IDs which map to that term but do not necessarily match
+  // as a substring.
+  HistoryIDSet HistoryIDsForTerm(icu::UnicodeString uni_word);
+
+  // Clear the search term cache. This cache holds on to the intermediate
+  // word results for each previously typed character to eliminate the need
+  // to re-perform set operations for previously typed characters.
+  void ResetTermCharWordSetCache();
+
+  // Compose a set of history item IDs by intersecting the set for each word
+  // in |uni_string|.
+  HistoryIDSet HistoryIDSetFromWords(icu::UnicodeString uni_string);
+
+ private:
+  UnicodeStringVector word_list_;  // A list of all of indexed words.
+                                   // NOTE: A word will _never_ be removed from
+                                   // this vector to insure that the word_id
+                                   // remains immutable throughout the session,
+                                   // reducing maintenance complexity.
+  sql::Connection history_database_;
+  uint64 history_item_count_;
+  WordMap word_map_;
+  CharWordIDMap char_word_map_;
+  WordIDHistoryMap word_id_history_map_;
+  TermCharWordSetVector term_char_word_set_cache_;
+  HistoryInfoMap history_info_map_;
+
+  // Cached items.
+  scoped_ptr<icu::RegexMatcher> question_matcher_;
+  scoped_ptr<icu::RegexMatcher> percent_matcher_;
+  scoped_ptr<icu::RegexMatcher> non_alnum_matcher_;
+  scoped_ptr<icu::RegexMatcher> bar_matcher_;
+  scoped_ptr<icu::RuleBasedBreakIterator> word_breaker_;
+};
+
+}  // namespace history
+
+#endif  // CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_
diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc
new file mode 100644
index 0000000..8dc9569
--- /dev/null
+++ b/chrome/browser/history/in_memory_url_index_unittest.cc
@@ -0,0 +1,5 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// TODO(mrossetti): Implement the unit tests
diff --git a/chrome/chrome_browser.gypi b/chrome/chrome_browser.gypi
index 4292a17..6415453 100644
--- a/chrome/chrome_browser.gypi
+++ b/chrome/chrome_browser.gypi
@@ -1572,6 +1572,8 @@
         'browser/history/in_memory_database.h',
         'browser/history/in_memory_history_backend.cc',
         'browser/history/in_memory_history_backend.h',
+        'browser/history/in_memory_url_index.cc',
+        'browser/history/in_memory_url_index.h',
         'browser/history/page_usage_data.cc',
         'browser/history/page_usage_data.h',
         'browser/history/query_parser.cc',
diff --git a/chrome/chrome_tests.gypi b/chrome/chrome_tests.gypi
index 69cef8d..0a55ebd 100644
--- a/chrome/chrome_tests.gypi
+++ b/chrome/chrome_tests.gypi
@@ -843,6 +843,7 @@
         'browser/history/history_querying_unittest.cc',
         'browser/history/history_types_unittest.cc',
         'browser/history/history_unittest.cc',
+        'browser/history/in_memory_url_index_unittest.cc',
         'browser/history/query_parser_unittest.cc',
         'browser/history/snippet_unittest.cc',
         'browser/history/starred_url_database_unittest.cc',
author	mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-06-30 19:01:22 +0000
committer	mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-06-30 19:01:22 +0000
commit	a88fdf531a6a3f34e22d3d8443516b3970e4362f (patch)
tree	ddf2df86e958505d27f739a2cd67211fcedf2f23 /chrome
parent	a38b3b5ed715dcf37adabaf8632baf26efdce8a0 (diff)
download	chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.zip chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.tar.gz chromium_src-a88fdf531a6a3f34e22d3d8443516b3970e4362f.tar.bz2