diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-21 01:35:23 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-21 01:35:23 +0000 |
commit | e668070a63734b38c52bf358e98a4a1a730cf57e (patch) | |
tree | 9a68577ae02cca8d7dca17c1e34151c15edb6205 /chrome/browser/history | |
parent | 5e650297186c6c086ed281ec0f21f056d90715c7 (diff) | |
download | chromium_src-e668070a63734b38c52bf358e98a4a1a730cf57e.zip chromium_src-e668070a63734b38c52bf358e98a4a1a730cf57e.tar.gz chromium_src-e668070a63734b38c52bf358e98a4a1a730cf57e.tar.bz2 |
Next step integrating the HistoryQuickProvider: Implement index initialization and population and provide unit test with test data.
BUG=None
TEST=None
Review URL: http://codereview.chromium.org/3138006
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@56962 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history')
-rw-r--r-- | chrome/browser/history/in_memory_history_backend.cc | 9 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.cc | 164 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.h | 144 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index_unittest.cc | 100 |
4 files changed, 406 insertions, 11 deletions
diff --git a/chrome/browser/history/in_memory_history_backend.cc b/chrome/browser/history/in_memory_history_backend.cc index 1350667..47b3a43 100644 --- a/chrome/browser/history/in_memory_history_backend.cc +++ b/chrome/browser/history/in_memory_history_backend.cc @@ -7,14 +7,17 @@ #include "base/command_line.h" #include "base/histogram.h" #include "base/time.h" +#include "base/utf_string_conversions.h" #include "chrome/browser/browser_process.h" #include "chrome/browser/history/history_notifications.h" #include "chrome/browser/history/in_memory_database.h" #include "chrome/browser/history/in_memory_url_index.h" #include "chrome/browser/history/url_database.h" +#include "chrome/browser/pref_service.h" #include "chrome/browser/profile.h" #include "chrome/common/chrome_switches.h" #include "chrome/common/notification_service.h" +#include "chrome/common/pref_names.h" namespace history { @@ -36,9 +39,11 @@ bool InMemoryHistoryBackend::Init(const FilePath& history_filename, if (CommandLine::ForCurrentProcess()->HasSwitch( switches::kEnableInMemoryURLIndex)) { - index_.reset(new InMemoryURLIndex); + index_.reset(new InMemoryURLIndex()); base::TimeTicks beginning_time = base::TimeTicks::Now(); - index_->Init(db); + string16 languages = + UTF8ToUTF16(profile_->GetPrefs()->GetString(prefs::kAcceptLanguages)); + index_->Init(db, languages); UMA_HISTOGRAM_TIMES("Autocomplete.HistoryDatabaseIndexingTime", base::TimeTicks::Now() - beginning_time); } diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc index 7f249df2..c69ec470 100644 --- a/chrome/browser/history/in_memory_url_index.cc +++ b/chrome/browser/history/in_memory_url_index.cc @@ -4,16 +4,172 @@ #include "chrome/browser/history/in_memory_url_index.h" +#include <algorithm> +#include <limits> + +#include "app/l10n_util.h" +#include "base/i18n/word_iterator.h" +#include "base/string_util.h" +#include "base/time.h" +#include "base/utf_string_conversions.h" #include "chrome/browser/history/url_database.h" +#include "net/base/escape.h" +#include "net/base/net_util.h" + +using base::Time; +using base::TimeDelta; namespace history { +InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} + +InMemoryURLIndex::~InMemoryURLIndex() {} + // Indexing -bool InMemoryURLIndex::Init(history::URLDatabase* history_db) { - bool success = true; - // TODO(mrossetti): Implement. - return success; +bool InMemoryURLIndex::Init(history::URLDatabase* history_db, + const string16& languages) { + // TODO(mrossetti): Register for profile/language change notifications. + languages_ = languages; + // Reset our indexes. + char_word_map_.clear(); + word_id_history_map_.clear(); + if (!history_db) + return false; + URLDatabase::URLEnumerator history_enum; + if (history_db->InitURLEnumeratorForEverything(&history_enum)) { + URLRow row; + Time recent_threshold = InMemoryURLIndex::RecentThreshold(); + while (history_enum.GetNextURL(&row)) { + // Do some filtering so that we only get history items which could + // possibly pass the HistoryURLProvider::CullPoorMatches filter later. + if ((row.typed_count() > kLowQualityMatchTypedLimit) || + (row.visit_count() > kLowQualityMatchVisitLimit) || + (row.last_visit() >= recent_threshold)) { + if (!IndexRow(row)) + return false; + } + } + } + return true; +} + +bool InMemoryURLIndex::IndexRow(URLRow row) { + const GURL& gurl(row.url()); + string16 url(WideToUTF16(net::FormatUrl(gurl, UTF16ToWide(languages_), + net::kFormatUrlOmitUsernamePassword, + UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, NULL, NULL, + NULL))); + + // TODO(mrossetti): Find or implement a ConvertPercentEncoding and use it + // on the url. + + // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). + HistoryID history_id = static_cast<HistoryID>(row.id()); + + // Add the row for quick lookup in the history info store. + url = l10n_util::ToLower(url); + URLRow new_row(GURL(url), row.id()); + new_row.set_visit_count(row.visit_count()); + new_row.set_typed_count(row.typed_count()); + new_row.set_last_visit(row.last_visit()); + new_row.set_title(row.title()); + history_info_map_.insert(std::make_pair(history_id, new_row)); + + // Split into individual, unique words. + String16Set words = WordsFromString16(url); + + // For each word, add a new entry into the word index referring to the + // associated history item. + for (String16Set::iterator iter = words.begin(); + iter != words.end(); ++iter) { + String16Set::value_type uni_word = *iter; + AddWordToIndex(uni_word, history_id); + } + ++history_item_count_; + return true; +} + +// Utility Functions + +InMemoryURLIndex::String16Set InMemoryURLIndex::WordsFromString16( + const string16& uni_string) { + String16Set words; + + // TODO(mrossetti): Replace all | and _'s with a space, all % quoted + // characters with real characters, and break into words, using + // appropriate string16 functions. + WordIterator iter(&uni_string, WordIterator::BREAK_WORD); + if (iter.Init()) { + while (iter.Advance()) { + if (iter.IsWord()) { + words.insert(iter.GetWord()); + } + } + } + return words; +} + +InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( + const string16& uni_word) { + Char16Set characters; + for (string16::const_iterator iter = uni_word.begin(); + iter != uni_word.end(); ++iter) + characters.insert(*iter); + return characters; +} + +void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, + HistoryID history_id) { + WordMap::iterator word_pos = word_map_.find(uni_word); + if (word_pos != word_map_.end()) + UpdateWordHistory(word_pos->second, history_id); + else + AddWordHistory(uni_word, history_id); +} + +void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { + WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); + DCHECK(history_pos != word_id_history_map_.end()); + HistoryIDSet& history_id_set(history_pos->second); + history_id_set.insert(history_id); +} + +// Add a new word to the word list and the word map, and then create a +// new entry in the word/history map. +void InMemoryURLIndex::AddWordHistory(const string16& uni_word, + HistoryID history_id) { + word_list_.push_back(uni_word); + WordID word_id = word_list_.size() - 1; + DCHECK(word_map_.insert(std::make_pair(uni_word, word_id)).second); + HistoryIDSet history_id_set; + history_id_set.insert(history_id); + DCHECK(word_id_history_map_.insert( + std::make_pair(word_id, history_id_set)).second); + // For each character in the newly added word (i.e. a word that is not + // already in the word index), add the word to the character index. + Char16Set characters = CharactersFromString16(uni_word); + for (Char16Set::iterator uni_char_iter = characters.begin(); + uni_char_iter != characters.end(); ++uni_char_iter) { + Char16Set::value_type uni_string = *uni_char_iter; + CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); + if (char_iter != char_word_map_.end()) { + // Update existing entry in the char/word index. + WordIDSet& word_id_set(char_iter->second); + word_id_set.insert(word_id); + } else { + // Create a new entry in the char/word index. + WordIDSet word_id_set; + word_id_set.insert(word_id); + DCHECK(char_word_map_.insert(std::make_pair(uni_string, + word_id_set)).second); + } + } +} + +// static +Time InMemoryURLIndex::RecentThreshold() { + return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); } } // namespace history diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h index 5b7861a..ca270b0 100644 --- a/chrome/browser/history/in_memory_url_index.h +++ b/chrome/browser/history/in_memory_url_index.h @@ -6,6 +6,22 @@ #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ #pragma once +#include <map> +#include <set> +#include <vector> + +#include "app/sql/connection.h" +#include "base/basictypes.h" +#include "base/linked_ptr.h" +#include "base/scoped_ptr.h" +#include "base/string16.h" +#include "chrome/browser/history/history_types.h" +#include "testing/gtest/include/gtest/gtest_prod.h" + +namespace base { +class Time; +} + namespace history { class URLDatabase; @@ -15,13 +31,135 @@ class URLDatabase; // quickly look up matching URLs for a given query string. Used by // the HistoryURLProvider for inline autocomplete and to provide URL // matches to the omnibox. +// +// Note about multi-byte codepoints and the data structures in the +// InMemoryURLIndex class: One will quickly notice that no effort is made to +// insure that multi-byte character boundaries are detected when indexing the +// words and characters in the URL history database except when converting +// URL strings to lowercase. Multi-byte-edness makes no difference when +// indexing or when searching the index as the final filtering of results +// is dependent on the comparison of a string of bytes, not individual +// characters. While the lookup of those bytes during a search in the +// |char_word_map_| could serve up words in which the individual char16 +// occurs as a portion of a composite character the next filtering step +// will eliminate such words except in the case where a single character +// is being searched on and which character occurs as the second char16 of a +// multi-char16 instance. class InMemoryURLIndex { public: - InMemoryURLIndex() {} - ~InMemoryURLIndex() {} + InMemoryURLIndex(); + ~InMemoryURLIndex(); + + // Convenience types + typedef std::vector<string16> String16Vector; // Open and index the URL history database. - bool Init(URLDatabase* history_db); + bool Init(URLDatabase* history_db, const string16& languages); + + // Reset the history index. + void Reset(); + + // Given a vector containing one or more words as string16s, scan the + // history index and return a vector with all scored, matching history items. + // Each term must occur somewhere in the history item for the item to + // qualify; however, the terms do not necessarily have to be adjacent. + HistoryMatches HistoryItemsForTerms(const String16Vector& terms); + + // Returns the date threshold for considering an history item as significant. + static base::Time RecentThreshold(); + + private: + friend class InMemoryURLIndexTest; + FRIEND_TEST(InMemoryURLIndexTest, Initialization); + + // Convenience types + typedef std::set<string16> String16Set; + typedef std::set<char16> Char16Set; + + // An index into list of all of the words we have indexed. + typedef int16 WordID; + + // A map allowing a WordID to be determined given a word. + typedef std::map<string16, WordID> WordMap; + + // A map from character to word_ids. + typedef std::set<WordID> WordIDSet; // An index into the WordList. + typedef std::map<char16, WordIDSet> CharWordIDMap; + + // A map from word_id to history item. + // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. + // Consider using a smaller type. + typedef URLID HistoryID; + typedef std::set<HistoryID> HistoryIDSet; + typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; + + // Support caching of term character intersections so that we can optimize + // searches which build upon a previous search. + struct TermCharWordSet { + TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used) + : char_set_(char_set), + word_id_set_(word_id_set), + used_(used) {} + + Char16Set char_set_; + WordIDSet word_id_set_; + bool used_; // true if this set has been used for the current term search. + }; + typedef std::vector<TermCharWordSet> TermCharWordSetVector; + + // TODO(rohitrao): Probably replace this with QueryResults. + typedef std::vector<URLRow> URLRowVector; + + // A map from history_id to the history's URL and title. + typedef std::map<HistoryID, URLRow> HistoryInfoMap; + + // Break a string down into individual words. + String16Set WordsFromString16(const string16& uni_string); + + // URL History indexing support functions. + + // Index one URL history item. + bool IndexRow(URLRow row); + + // Break a string down into its individual characters. + // Note that this is temporarily intended to work on a single word, but + // _will_ work on a string of words, perhaps with unexpected results. + // TODO(mrossetti): Lots of optimizations possible here for not restarting + // a search if the user is just typing along. Also, change this to uniString + // and properly handle substring matches, scoring and sorting the results + // by score. Also, provide the metrics for where the matches occur so that + // the UI can highlight the matched sections. + Char16Set CharactersFromString16(const string16& uni_word); + + // Given a single word, add a reference to the containing history item + // to the index. + void AddWordToIndex(const string16& uni_word, HistoryID history_id); + + // Update an existing entry in the word/history index by adding the + // |history_id| to set for |word_id| in the word_id_history_map_. + void UpdateWordHistory(WordID word_id, HistoryID history_id); + + // Create a new entry in the word/history map for |word_id| and add + // |history_id| as the initial element of the word's set. + void AddWordHistory(const string16& uni_word, HistoryID history_id); + + // A list of all of indexed words. The index of a word in this list is the + // ID of the word in the word_map_. It reduces the memory overhead by + // replacing a potentially long and repeated string with a simple index. + // NOTE: A word will _never_ be removed from this vector thus insuring + // the immutability of the word_id throughout the session, reducing + // maintenance complexity. + String16Vector word_list_; + + uint64 history_item_count_; + WordMap word_map_; + CharWordIDMap char_word_map_; + WordIDHistoryMap word_id_history_map_; + TermCharWordSetVector term_char_word_set_cache_; + HistoryInfoMap history_info_map_; + string16 languages_; + + DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); }; } // namespace history diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc index f5932ef..2459053 100644 --- a/chrome/browser/history/in_memory_url_index_unittest.cc +++ b/chrome/browser/history/in_memory_url_index_unittest.cc @@ -2,16 +2,100 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "chrome/browser/history/in_memory_url_index.h" +#include <stdio.h> + +#include <fstream> +#include <string> +#include <vector> +#include "app/sql/statement.h" +#include "base/file_util.h" +#include "base/path_service.h" #include "base/scoped_ptr.h" +#include "base/time.h" +#include "base/utf_string_conversions.h" +#include "chrome/browser/history/in_memory_url_index.h" +#include "chrome/browser/history/url_database.h" +#include "chrome/common/chrome_paths.h" #include "testing/gtest/include/gtest/gtest.h" +// The test version of the history url database table ('url') is contained in +// a database file created from a text file('url_history_provider_test.db.txt'). +// The only difference between this table and a live 'urls' table from a +// profile is that the last_visit_time column in the test table contains a +// number specifying the number of days relative to 'today' to which the +// absolute time should be set during the test setup stage. +// +// The format of the test database text file is of a SQLite .dump file. +// Note that only lines whose first character is an upper-case letter are +// processed when creating the test database. + +using base::Time; +using base::TimeDelta; + namespace history { -class InMemoryURLIndexTest : public testing::Test { +class InMemoryURLIndexTest : public testing::Test, + public URLDatabase { + public: + InMemoryURLIndexTest() {} + protected: + // Test setup. + void SetUp() { + // Create and populate a working copy of the URL history database. + FilePath history_proto_path; + PathService::Get(chrome::DIR_TEST_DATA, &history_proto_path); + history_proto_path = history_proto_path.Append( + FILE_PATH_LITERAL("History")); + history_proto_path = history_proto_path.Append( + FILE_PATH_LITERAL("url_history_provider_test.db.txt")); + EXPECT_TRUE(file_util::PathExists(history_proto_path)); + EXPECT_TRUE(db_.OpenInMemory()); + + std::ifstream proto_file(history_proto_path.value().c_str()); + static const size_t kCommandBufferMaxSize = 2048; + char sql_cmd_line[kCommandBufferMaxSize]; + while (!proto_file.eof()) { + proto_file.getline(sql_cmd_line, kCommandBufferMaxSize); + if (!proto_file.eof()) { + // We only process lines which begin with a upper-case letter. + // TODO(mrossetti): Can iswupper() be used here? + if (sql_cmd_line[0] >= 'A' && sql_cmd_line[0] <= 'Z') { + std::string sql_cmd(sql_cmd_line); + sql::Statement sql_stmt(db_.GetUniqueStatement(sql_cmd_line)); + EXPECT_TRUE(sql_stmt.Run()); + } + } + } + proto_file.close(); + + // Update the last_visit_time table column + // such that it represents a time relative to 'now'. + sql::Statement statement(db_.GetUniqueStatement( + "SELECT" HISTORY_URL_ROW_FIELDS "FROM urls;")); + EXPECT_TRUE(statement); + Time time_right_now = Time::NowFromSystemTime(); + TimeDelta day_delta = TimeDelta::FromDays(1); + while (statement.Step()) { + URLRow row; + FillURLRow(statement, &row); + Time last_visit = time_right_now; + for (int64 i = row.last_visit().ToInternalValue(); i > 0; --i) + last_visit -= day_delta; + row.set_last_visit(last_visit); + UpdateURLRow(row.id(), row); + } + } + + void TearDown() { + db_.Close(); + } + + virtual sql::Connection& GetDB() { return db_; } + scoped_ptr<InMemoryURLIndex> url_index_; + sql::Connection db_; }; TEST_F(InMemoryURLIndexTest, Construction) { @@ -19,4 +103,16 @@ TEST_F(InMemoryURLIndexTest, Construction) { EXPECT_TRUE(url_index_.get()); } +TEST_F(InMemoryURLIndexTest, Initialization) { + url_index_.reset(new InMemoryURLIndex); + url_index_->Init(this, UTF8ToUTF16("en,ja,hi,zh")); + // There should have been 25 of the 29 urls accepted during filtering. + EXPECT_EQ(25U, url_index_->history_item_count_); + // The resulting indexes should account for: + // 37 characters + // 88 words + EXPECT_EQ(37U, url_index_->char_word_map_.size()); + EXPECT_EQ(88U, url_index_->word_map_.size()); +} + } // namespace history |