diff options
author | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-21 02:03:41 +0000 |
---|---|---|
committer | mrossetti@chromium.org <mrossetti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-21 02:03:41 +0000 |
commit | 7689af9a4483fd4908bce9e3c59b4273fb184ec6 (patch) | |
tree | 2615c797da5b6350220a99d1b20cbc4e755b584c /chrome/browser/history | |
parent | 0c4ea7914810f9f937ef43744e4b1fb2c5491422 (diff) | |
download | chromium_src-7689af9a4483fd4908bce9e3c59b4273fb184ec6.zip chromium_src-7689af9a4483fd4908bce9e3c59b4273fb184ec6.tar.gz chromium_src-7689af9a4483fd4908bce9e3c59b4273fb184ec6.tar.bz2 |
Revert 56962 - Next step integrating the HistoryQuickProvider: Implement index initialization and population and provide unit test with test data.
BUG=None
TEST=None
Review URL: http://codereview.chromium.org/3138006
TBR=mrossetti@chromium.org
Review URL: http://codereview.chromium.org/3197008
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@56968 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history')
-rw-r--r-- | chrome/browser/history/in_memory_history_backend.cc | 9 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.cc | 164 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index.h | 144 | ||||
-rw-r--r-- | chrome/browser/history/in_memory_url_index_unittest.cc | 100 |
4 files changed, 11 insertions, 406 deletions
diff --git a/chrome/browser/history/in_memory_history_backend.cc b/chrome/browser/history/in_memory_history_backend.cc index 47b3a43..1350667 100644 --- a/chrome/browser/history/in_memory_history_backend.cc +++ b/chrome/browser/history/in_memory_history_backend.cc @@ -7,17 +7,14 @@ #include "base/command_line.h" #include "base/histogram.h" #include "base/time.h" -#include "base/utf_string_conversions.h" #include "chrome/browser/browser_process.h" #include "chrome/browser/history/history_notifications.h" #include "chrome/browser/history/in_memory_database.h" #include "chrome/browser/history/in_memory_url_index.h" #include "chrome/browser/history/url_database.h" -#include "chrome/browser/pref_service.h" #include "chrome/browser/profile.h" #include "chrome/common/chrome_switches.h" #include "chrome/common/notification_service.h" -#include "chrome/common/pref_names.h" namespace history { @@ -39,11 +36,9 @@ bool InMemoryHistoryBackend::Init(const FilePath& history_filename, if (CommandLine::ForCurrentProcess()->HasSwitch( switches::kEnableInMemoryURLIndex)) { - index_.reset(new InMemoryURLIndex()); + index_.reset(new InMemoryURLIndex); base::TimeTicks beginning_time = base::TimeTicks::Now(); - string16 languages = - UTF8ToUTF16(profile_->GetPrefs()->GetString(prefs::kAcceptLanguages)); - index_->Init(db, languages); + index_->Init(db); UMA_HISTOGRAM_TIMES("Autocomplete.HistoryDatabaseIndexingTime", base::TimeTicks::Now() - beginning_time); } diff --git a/chrome/browser/history/in_memory_url_index.cc b/chrome/browser/history/in_memory_url_index.cc index c69ec470..7f249df2 100644 --- a/chrome/browser/history/in_memory_url_index.cc +++ b/chrome/browser/history/in_memory_url_index.cc @@ -4,172 +4,16 @@ #include "chrome/browser/history/in_memory_url_index.h" -#include <algorithm> -#include <limits> - -#include "app/l10n_util.h" -#include "base/i18n/word_iterator.h" -#include "base/string_util.h" -#include "base/time.h" -#include "base/utf_string_conversions.h" #include "chrome/browser/history/url_database.h" -#include "net/base/escape.h" -#include "net/base/net_util.h" - -using base::Time; -using base::TimeDelta; namespace history { -InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} - -InMemoryURLIndex::~InMemoryURLIndex() {} - // Indexing -bool InMemoryURLIndex::Init(history::URLDatabase* history_db, - const string16& languages) { - // TODO(mrossetti): Register for profile/language change notifications. - languages_ = languages; - // Reset our indexes. - char_word_map_.clear(); - word_id_history_map_.clear(); - if (!history_db) - return false; - URLDatabase::URLEnumerator history_enum; - if (history_db->InitURLEnumeratorForEverything(&history_enum)) { - URLRow row; - Time recent_threshold = InMemoryURLIndex::RecentThreshold(); - while (history_enum.GetNextURL(&row)) { - // Do some filtering so that we only get history items which could - // possibly pass the HistoryURLProvider::CullPoorMatches filter later. - if ((row.typed_count() > kLowQualityMatchTypedLimit) || - (row.visit_count() > kLowQualityMatchVisitLimit) || - (row.last_visit() >= recent_threshold)) { - if (!IndexRow(row)) - return false; - } - } - } - return true; -} - -bool InMemoryURLIndex::IndexRow(URLRow row) { - const GURL& gurl(row.url()); - string16 url(WideToUTF16(net::FormatUrl(gurl, UTF16ToWide(languages_), - net::kFormatUrlOmitUsernamePassword, - UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, NULL, NULL, - NULL))); - - // TODO(mrossetti): Find or implement a ConvertPercentEncoding and use it - // on the url. - - // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). - HistoryID history_id = static_cast<HistoryID>(row.id()); - - // Add the row for quick lookup in the history info store. - url = l10n_util::ToLower(url); - URLRow new_row(GURL(url), row.id()); - new_row.set_visit_count(row.visit_count()); - new_row.set_typed_count(row.typed_count()); - new_row.set_last_visit(row.last_visit()); - new_row.set_title(row.title()); - history_info_map_.insert(std::make_pair(history_id, new_row)); - - // Split into individual, unique words. - String16Set words = WordsFromString16(url); - - // For each word, add a new entry into the word index referring to the - // associated history item. - for (String16Set::iterator iter = words.begin(); - iter != words.end(); ++iter) { - String16Set::value_type uni_word = *iter; - AddWordToIndex(uni_word, history_id); - } - ++history_item_count_; - return true; -} - -// Utility Functions - -InMemoryURLIndex::String16Set InMemoryURLIndex::WordsFromString16( - const string16& uni_string) { - String16Set words; - - // TODO(mrossetti): Replace all | and _'s with a space, all % quoted - // characters with real characters, and break into words, using - // appropriate string16 functions. - WordIterator iter(&uni_string, WordIterator::BREAK_WORD); - if (iter.Init()) { - while (iter.Advance()) { - if (iter.IsWord()) { - words.insert(iter.GetWord()); - } - } - } - return words; -} - -InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( - const string16& uni_word) { - Char16Set characters; - for (string16::const_iterator iter = uni_word.begin(); - iter != uni_word.end(); ++iter) - characters.insert(*iter); - return characters; -} - -void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, - HistoryID history_id) { - WordMap::iterator word_pos = word_map_.find(uni_word); - if (word_pos != word_map_.end()) - UpdateWordHistory(word_pos->second, history_id); - else - AddWordHistory(uni_word, history_id); -} - -void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { - WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); - DCHECK(history_pos != word_id_history_map_.end()); - HistoryIDSet& history_id_set(history_pos->second); - history_id_set.insert(history_id); -} - -// Add a new word to the word list and the word map, and then create a -// new entry in the word/history map. -void InMemoryURLIndex::AddWordHistory(const string16& uni_word, - HistoryID history_id) { - word_list_.push_back(uni_word); - WordID word_id = word_list_.size() - 1; - DCHECK(word_map_.insert(std::make_pair(uni_word, word_id)).second); - HistoryIDSet history_id_set; - history_id_set.insert(history_id); - DCHECK(word_id_history_map_.insert( - std::make_pair(word_id, history_id_set)).second); - // For each character in the newly added word (i.e. a word that is not - // already in the word index), add the word to the character index. - Char16Set characters = CharactersFromString16(uni_word); - for (Char16Set::iterator uni_char_iter = characters.begin(); - uni_char_iter != characters.end(); ++uni_char_iter) { - Char16Set::value_type uni_string = *uni_char_iter; - CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); - if (char_iter != char_word_map_.end()) { - // Update existing entry in the char/word index. - WordIDSet& word_id_set(char_iter->second); - word_id_set.insert(word_id); - } else { - // Create a new entry in the char/word index. - WordIDSet word_id_set; - word_id_set.insert(word_id); - DCHECK(char_word_map_.insert(std::make_pair(uni_string, - word_id_set)).second); - } - } -} - -// static -Time InMemoryURLIndex::RecentThreshold() { - return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); +bool InMemoryURLIndex::Init(history::URLDatabase* history_db) { + bool success = true; + // TODO(mrossetti): Implement. + return success; } } // namespace history diff --git a/chrome/browser/history/in_memory_url_index.h b/chrome/browser/history/in_memory_url_index.h index ca270b0..5b7861a 100644 --- a/chrome/browser/history/in_memory_url_index.h +++ b/chrome/browser/history/in_memory_url_index.h @@ -6,22 +6,6 @@ #define CHROME_BROWSER_HISTORY_IN_MEMORY_URL_INDEX_H_ #pragma once -#include <map> -#include <set> -#include <vector> - -#include "app/sql/connection.h" -#include "base/basictypes.h" -#include "base/linked_ptr.h" -#include "base/scoped_ptr.h" -#include "base/string16.h" -#include "chrome/browser/history/history_types.h" -#include "testing/gtest/include/gtest/gtest_prod.h" - -namespace base { -class Time; -} - namespace history { class URLDatabase; @@ -31,135 +15,13 @@ class URLDatabase; // quickly look up matching URLs for a given query string. Used by // the HistoryURLProvider for inline autocomplete and to provide URL // matches to the omnibox. -// -// Note about multi-byte codepoints and the data structures in the -// InMemoryURLIndex class: One will quickly notice that no effort is made to -// insure that multi-byte character boundaries are detected when indexing the -// words and characters in the URL history database except when converting -// URL strings to lowercase. Multi-byte-edness makes no difference when -// indexing or when searching the index as the final filtering of results -// is dependent on the comparison of a string of bytes, not individual -// characters. While the lookup of those bytes during a search in the -// |char_word_map_| could serve up words in which the individual char16 -// occurs as a portion of a composite character the next filtering step -// will eliminate such words except in the case where a single character -// is being searched on and which character occurs as the second char16 of a -// multi-char16 instance. class InMemoryURLIndex { public: - InMemoryURLIndex(); - ~InMemoryURLIndex(); - - // Convenience types - typedef std::vector<string16> String16Vector; + InMemoryURLIndex() {} + ~InMemoryURLIndex() {} // Open and index the URL history database. - bool Init(URLDatabase* history_db, const string16& languages); - - // Reset the history index. - void Reset(); - - // Given a vector containing one or more words as string16s, scan the - // history index and return a vector with all scored, matching history items. - // Each term must occur somewhere in the history item for the item to - // qualify; however, the terms do not necessarily have to be adjacent. - HistoryMatches HistoryItemsForTerms(const String16Vector& terms); - - // Returns the date threshold for considering an history item as significant. - static base::Time RecentThreshold(); - - private: - friend class InMemoryURLIndexTest; - FRIEND_TEST(InMemoryURLIndexTest, Initialization); - - // Convenience types - typedef std::set<string16> String16Set; - typedef std::set<char16> Char16Set; - - // An index into list of all of the words we have indexed. - typedef int16 WordID; - - // A map allowing a WordID to be determined given a word. - typedef std::map<string16, WordID> WordMap; - - // A map from character to word_ids. - typedef std::set<WordID> WordIDSet; // An index into the WordList. - typedef std::map<char16, WordIDSet> CharWordIDMap; - - // A map from word_id to history item. - // TODO(mrossetti): URLID is 64 bit: a memory bloat and performance hit. - // Consider using a smaller type. - typedef URLID HistoryID; - typedef std::set<HistoryID> HistoryIDSet; - typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap; - - // Support caching of term character intersections so that we can optimize - // searches which build upon a previous search. - struct TermCharWordSet { - TermCharWordSet(Char16Set char_set, WordIDSet word_id_set, bool used) - : char_set_(char_set), - word_id_set_(word_id_set), - used_(used) {} - - Char16Set char_set_; - WordIDSet word_id_set_; - bool used_; // true if this set has been used for the current term search. - }; - typedef std::vector<TermCharWordSet> TermCharWordSetVector; - - // TODO(rohitrao): Probably replace this with QueryResults. - typedef std::vector<URLRow> URLRowVector; - - // A map from history_id to the history's URL and title. - typedef std::map<HistoryID, URLRow> HistoryInfoMap; - - // Break a string down into individual words. - String16Set WordsFromString16(const string16& uni_string); - - // URL History indexing support functions. - - // Index one URL history item. - bool IndexRow(URLRow row); - - // Break a string down into its individual characters. - // Note that this is temporarily intended to work on a single word, but - // _will_ work on a string of words, perhaps with unexpected results. - // TODO(mrossetti): Lots of optimizations possible here for not restarting - // a search if the user is just typing along. Also, change this to uniString - // and properly handle substring matches, scoring and sorting the results - // by score. Also, provide the metrics for where the matches occur so that - // the UI can highlight the matched sections. - Char16Set CharactersFromString16(const string16& uni_word); - - // Given a single word, add a reference to the containing history item - // to the index. - void AddWordToIndex(const string16& uni_word, HistoryID history_id); - - // Update an existing entry in the word/history index by adding the - // |history_id| to set for |word_id| in the word_id_history_map_. - void UpdateWordHistory(WordID word_id, HistoryID history_id); - - // Create a new entry in the word/history map for |word_id| and add - // |history_id| as the initial element of the word's set. - void AddWordHistory(const string16& uni_word, HistoryID history_id); - - // A list of all of indexed words. The index of a word in this list is the - // ID of the word in the word_map_. It reduces the memory overhead by - // replacing a potentially long and repeated string with a simple index. - // NOTE: A word will _never_ be removed from this vector thus insuring - // the immutability of the word_id throughout the session, reducing - // maintenance complexity. - String16Vector word_list_; - - uint64 history_item_count_; - WordMap word_map_; - CharWordIDMap char_word_map_; - WordIDHistoryMap word_id_history_map_; - TermCharWordSetVector term_char_word_set_cache_; - HistoryInfoMap history_info_map_; - string16 languages_; - - DISALLOW_COPY_AND_ASSIGN(InMemoryURLIndex); + bool Init(URLDatabase* history_db); }; } // namespace history diff --git a/chrome/browser/history/in_memory_url_index_unittest.cc b/chrome/browser/history/in_memory_url_index_unittest.cc index 2459053..f5932ef 100644 --- a/chrome/browser/history/in_memory_url_index_unittest.cc +++ b/chrome/browser/history/in_memory_url_index_unittest.cc @@ -2,100 +2,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include <stdio.h> - -#include <fstream> -#include <string> -#include <vector> +#include "chrome/browser/history/in_memory_url_index.h" -#include "app/sql/statement.h" -#include "base/file_util.h" -#include "base/path_service.h" #include "base/scoped_ptr.h" -#include "base/time.h" -#include "base/utf_string_conversions.h" -#include "chrome/browser/history/in_memory_url_index.h" -#include "chrome/browser/history/url_database.h" -#include "chrome/common/chrome_paths.h" #include "testing/gtest/include/gtest/gtest.h" -// The test version of the history url database table ('url') is contained in -// a database file created from a text file('url_history_provider_test.db.txt'). -// The only difference between this table and a live 'urls' table from a -// profile is that the last_visit_time column in the test table contains a -// number specifying the number of days relative to 'today' to which the -// absolute time should be set during the test setup stage. -// -// The format of the test database text file is of a SQLite .dump file. -// Note that only lines whose first character is an upper-case letter are -// processed when creating the test database. - -using base::Time; -using base::TimeDelta; - namespace history { -class InMemoryURLIndexTest : public testing::Test, - public URLDatabase { - public: - InMemoryURLIndexTest() {} - +class InMemoryURLIndexTest : public testing::Test { protected: - // Test setup. - void SetUp() { - // Create and populate a working copy of the URL history database. - FilePath history_proto_path; - PathService::Get(chrome::DIR_TEST_DATA, &history_proto_path); - history_proto_path = history_proto_path.Append( - FILE_PATH_LITERAL("History")); - history_proto_path = history_proto_path.Append( - FILE_PATH_LITERAL("url_history_provider_test.db.txt")); - EXPECT_TRUE(file_util::PathExists(history_proto_path)); - EXPECT_TRUE(db_.OpenInMemory()); - - std::ifstream proto_file(history_proto_path.value().c_str()); - static const size_t kCommandBufferMaxSize = 2048; - char sql_cmd_line[kCommandBufferMaxSize]; - while (!proto_file.eof()) { - proto_file.getline(sql_cmd_line, kCommandBufferMaxSize); - if (!proto_file.eof()) { - // We only process lines which begin with a upper-case letter. - // TODO(mrossetti): Can iswupper() be used here? - if (sql_cmd_line[0] >= 'A' && sql_cmd_line[0] <= 'Z') { - std::string sql_cmd(sql_cmd_line); - sql::Statement sql_stmt(db_.GetUniqueStatement(sql_cmd_line)); - EXPECT_TRUE(sql_stmt.Run()); - } - } - } - proto_file.close(); - - // Update the last_visit_time table column - // such that it represents a time relative to 'now'. - sql::Statement statement(db_.GetUniqueStatement( - "SELECT" HISTORY_URL_ROW_FIELDS "FROM urls;")); - EXPECT_TRUE(statement); - Time time_right_now = Time::NowFromSystemTime(); - TimeDelta day_delta = TimeDelta::FromDays(1); - while (statement.Step()) { - URLRow row; - FillURLRow(statement, &row); - Time last_visit = time_right_now; - for (int64 i = row.last_visit().ToInternalValue(); i > 0; --i) - last_visit -= day_delta; - row.set_last_visit(last_visit); - UpdateURLRow(row.id(), row); - } - } - - void TearDown() { - db_.Close(); - } - - virtual sql::Connection& GetDB() { return db_; } - scoped_ptr<InMemoryURLIndex> url_index_; - sql::Connection db_; }; TEST_F(InMemoryURLIndexTest, Construction) { @@ -103,16 +19,4 @@ TEST_F(InMemoryURLIndexTest, Construction) { EXPECT_TRUE(url_index_.get()); } -TEST_F(InMemoryURLIndexTest, Initialization) { - url_index_.reset(new InMemoryURLIndex); - url_index_->Init(this, UTF8ToUTF16("en,ja,hi,zh")); - // There should have been 25 of the 29 urls accepted during filtering. - EXPECT_EQ(25U, url_index_->history_item_count_); - // The resulting indexes should account for: - // 37 characters - // 88 words - EXPECT_EQ(37U, url_index_->char_word_map_.size()); - EXPECT_EQ(88U, url_index_->word_map_.size()); -} - } // namespace history |