diff options
author | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-10-30 02:36:20 +0000 |
---|---|---|
committer | brettw@google.com <brettw@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-10-30 02:36:20 +0000 |
commit | 5ffd5e9b703d122dbac2dce9425a88b414f6de97 (patch) | |
tree | a9654890ca1178c026e1707a7bc29387eca03b45 | |
parent | a6604d90563ed00e56d4ffe5116d3c7726a893f9 (diff) | |
download | chromium_src-5ffd5e9b703d122dbac2dce9425a88b414f6de97.zip chromium_src-5ffd5e9b703d122dbac2dce9425a88b414f6de97.tar.gz chromium_src-5ffd5e9b703d122dbac2dce9425a88b414f6de97.tar.bz2 |
Make sure titles and bodies of web pages that come into the history system very
late are indexed. Very slow pages (>20 seconds) would previously not get indexed.
BUG=3835
Review URL: http://codereview.chromium.org/8899
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@4201 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | chrome/browser/history/expire_history_backend_unittest.cc | 7 | ||||
-rw-r--r-- | chrome/browser/history/history_backend.cc | 3 | ||||
-rw-r--r-- | chrome/browser/history/text_database_manager.cc | 56 | ||||
-rw-r--r-- | chrome/browser/history/text_database_manager.h | 3 | ||||
-rw-r--r-- | chrome/browser/history/text_database_manager_unittest.cc | 102 |
5 files changed, 139 insertions, 32 deletions
diff --git a/chrome/browser/history/expire_history_backend_unittest.cc b/chrome/browser/history/expire_history_backend_unittest.cc index fab52e5..7195cdb 100644 --- a/chrome/browser/history/expire_history_backend_unittest.cc +++ b/chrome/browser/history/expire_history_backend_unittest.cc @@ -118,7 +118,8 @@ class ExpireHistoryTest : public testing::Test, if (thumb_db_->Init(thumb_name) != INIT_OK) thumb_db_.reset(); - text_db_.reset(new TextDatabaseManager(dir_, main_db_.get())); + text_db_.reset(new TextDatabaseManager(dir_, + main_db_.get(), main_db_.get())); if (!text_db_->Init()) text_db_.reset(); @@ -398,7 +399,7 @@ TEST_F(ExpireHistoryTest, DeleteURLAndFavicon) { // it just like the test set-up did. text_db_.reset(); EXPECT_TRUE(IsStringInFile(fts_filename, "goats")); - text_db_.reset(new TextDatabaseManager(dir_, main_db_.get())); + text_db_.reset(new TextDatabaseManager(dir_, main_db_.get(), main_db_.get())); ASSERT_TRUE(text_db_->Init()); expirer_.SetDatabases(main_db_.get(), archived_db_.get(), thumb_db_.get(), text_db_.get()); @@ -410,7 +411,7 @@ TEST_F(ExpireHistoryTest, DeleteURLAndFavicon) { // doesn't remove it from the file, we want to be sure we're doing the latter. text_db_.reset(); EXPECT_FALSE(IsStringInFile(fts_filename, "goats")); - text_db_.reset(new TextDatabaseManager(dir_, main_db_.get())); + text_db_.reset(new TextDatabaseManager(dir_, main_db_.get(), main_db_.get())); ASSERT_TRUE(text_db_->Init()); expirer_.SetDatabases(main_db_.get(), archived_db_.get(), thumb_db_.get(), text_db_.get()); diff --git a/chrome/browser/history/history_backend.cc b/chrome/browser/history/history_backend.cc index afaee0a..c3bdd4a 100644 --- a/chrome/browser/history/history_backend.cc +++ b/chrome/browser/history/history_backend.cc @@ -533,7 +533,8 @@ void HistoryBackend::InitImpl() { // Full-text database. This has to be first so we can pass it to the // HistoryDatabase for migration. - text_database_.reset(new TextDatabaseManager(history_dir_, db_.get())); + text_database_.reset(new TextDatabaseManager(history_dir_, + db_.get(), db_.get())); if (!text_database_->Init()) { LOG(WARNING) << "Text database initialization failed, running without it."; text_database_.reset(); diff --git a/chrome/browser/history/text_database_manager.cc b/chrome/browser/history/text_database_manager.cc index 21a7436..483befa 100644 --- a/chrome/browser/history/text_database_manager.cc +++ b/chrome/browser/history/text_database_manager.cc @@ -67,9 +67,11 @@ bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const { // TextDatabaseManager --------------------------------------------------------- TextDatabaseManager::TextDatabaseManager(const std::wstring& dir, + URLDatabase* url_database, VisitDatabase* visit_database) : dir_(dir), db_(NULL), + url_database_(url_database), visit_database_(visit_database), recent_changes_(RecentChangeList::NO_AUTO_EVICT), transaction_nesting_(0), @@ -170,8 +172,38 @@ void TextDatabaseManager::AddPageURL(const GURL& url, void TextDatabaseManager::AddPageTitle(const GURL& url, const std::wstring& title) { RecentChangeList::iterator found = recent_changes_.Peek(url); - if (found == recent_changes_.end()) + if (found == recent_changes_.end()) { + // This page is not in our cache of recent pages. This is very much an edge + // case as normally a title will come in <20 seconds after the page commits, + // and WebContents will avoid spamming us with >1 title per page. However, + // it could come up if your connection is unhappy, and we don't want to + // miss anything. + // + // To solve this problem, we'll just associate the most recent visit with + // the new title and index that using the regular code path. + URLRow url_row; + if (!url_database_->GetRowForURL(url, &url_row)) + return; // URL is unknown, give up. + VisitRow visit; + if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) + return; // No recent visit, give up. + + if (visit.is_indexed) { + // If this page was already indexed, we could have a body that came in + // first and we don't want to overwrite it. We could go query for the + // current body, or have a special setter for only the title, but this is + // not worth it for this edge case. + // + // It will be almost impossible for the title to take longer than + // kExpirationSec yet we got a body in less than that time, since the + // title should always come in first. + return; + } + + AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, + title, std::wstring()); return; // We don't know about this page, give up. + } PageInfo& info = found->second; if (info.has_body()) { @@ -188,8 +220,26 @@ void TextDatabaseManager::AddPageTitle(const GURL& url, void TextDatabaseManager::AddPageContents(const GURL& url, const std::wstring& body) { RecentChangeList::iterator found = recent_changes_.Peek(url); - if (found == recent_changes_.end()) - return; // We don't know about this page, give up. + if (found == recent_changes_.end()) { + // This page is not in our cache of recent pages. This means that the page + // took more than kExpirationSec to load. Often, this will be the result of + // a very slow iframe or other resource on the page that makes us think its + // still loading. + // + // As a fallback, set the most recent visit's contents using the input, and + // use the last set title in the URL table as the title to index. + URLRow url_row; + if (!url_database_->GetRowForURL(url, &url_row)) + return; // URL is unknown, give up. + VisitRow visit; + if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) + return; // No recent visit, give up. + + // Use the title from the URL row as the title for the indexing. + AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, + url_row.title(), body); + return; + } PageInfo& info = found->second; if (info.has_title()) { diff --git a/chrome/browser/history/text_database_manager.h b/chrome/browser/history/text_database_manager.h index fc5bdae..f1195ef 100644 --- a/chrome/browser/history/text_database_manager.h +++ b/chrome/browser/history/text_database_manager.h @@ -71,6 +71,7 @@ class TextDatabaseManager { // (of recent visits). The visit database will be updated to refer to the // added text database entries. explicit TextDatabaseManager(const std::wstring& dir, + URLDatabase* url_database, VisitDatabase* visit_database); ~TextDatabaseManager(); @@ -154,6 +155,7 @@ class TextDatabaseManager { private: // These tests call ExpireRecentChangesForTime to force expiration. FRIEND_TEST(TextDatabaseManagerTest, InsertPartial); + FRIEND_TEST(TextDatabaseManagerTest, PartialComplete); FRIEND_TEST(ExpireHistoryTest, DeleteURLAndFavicon); FRIEND_TEST(ExpireHistoryTest, FlushRecentURLsUnstarred); @@ -244,6 +246,7 @@ class TextDatabaseManager { DBCloseScoper db_close_scoper_; // Non-owning pointers to the recent history databases for URLs and visits. + URLDatabase* url_database_; VisitDatabase* visit_database_; // Lists recent additions that we have not yet filled out with the title and diff --git a/chrome/browser/history/text_database_manager_unittest.cc b/chrome/browser/history/text_database_manager_unittest.cc index fd9a46d..6781a76 100644 --- a/chrome/browser/history/text_database_manager_unittest.cc +++ b/chrome/browser/history/text_database_manager_unittest.cc @@ -56,17 +56,18 @@ class TextDatabaseManagerTest : public testing::Test { std::wstring dir_; }; -// This provides a simple implementation of a VisitDatabase using an in-memory -// sqlite connection. The text database manager expects to be able to update -// the visit database to keep in sync. -class InMemVisitDB : public VisitDatabase { +// This provides a simple implementation of a URL+VisitDatabase using an +// in-memory sqlite connection. The text database manager expects to be able to +// update the visit database to keep in sync. +class InMemDB : public URLDatabase, public VisitDatabase { public: - InMemVisitDB() { + InMemDB() { sqlite3_open(":memory:", &db_); statement_cache_ = new SqliteStatementCache(db_); + CreateURLTable(false); InitVisitTable(); } - ~InMemVisitDB() { + ~InMemDB() { delete statement_cache_; sqlite3_close(db_); } @@ -80,7 +81,7 @@ class InMemVisitDB : public VisitDatabase { sqlite3* db_; SqliteStatementCache* statement_cache_; - DISALLOW_EVIL_CONSTRUCTORS(InMemVisitDB); + DISALLOW_EVIL_CONSTRUCTORS(InMemDB); }; // Adds all the pages once, and the first page once more in the next month. @@ -166,8 +167,8 @@ bool ResultsHaveURL(const std::vector<TextDatabase::Match>& results, // Tests basic querying. TEST_F(TextDatabaseManagerTest, InsertQuery) { ASSERT_TRUE(Init()); - InMemVisitDB visit_db; - TextDatabaseManager manager(dir_, &visit_db); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); std::vector<Time> times; @@ -198,8 +199,8 @@ TEST_F(TextDatabaseManagerTest, InsertQuery) { // tests right now, but we test it anyway. TEST_F(TextDatabaseManagerTest, InsertCompleteNoVisit) { ASSERT_TRUE(Init()); - InMemVisitDB visit_db; - TextDatabaseManager manager(dir_, &visit_db); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); // First add one without a visit. @@ -222,8 +223,8 @@ TEST_F(TextDatabaseManagerTest, InsertCompleteNoVisit) { // visit was updated properly. TEST_F(TextDatabaseManagerTest, InsertCompleteVisit) { ASSERT_TRUE(Init()); - InMemVisitDB visit_db; - TextDatabaseManager manager(dir_, &visit_db); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); // First add a visit to a page. We can just make up a URL ID since there is @@ -261,8 +262,8 @@ TEST_F(TextDatabaseManagerTest, InsertCompleteVisit) { // Tests that partial inserts that expire are added to the database. TEST_F(TextDatabaseManagerTest, InsertPartial) { ASSERT_TRUE(Init()); - InMemVisitDB visit_db; - TextDatabaseManager manager(dir_, &visit_db); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); // Add the first one with just a URL. @@ -305,6 +306,57 @@ TEST_F(TextDatabaseManagerTest, InsertPartial) { EXPECT_TRUE(ResultsHaveURL(results, kURL3)); } +// Tests that partial inserts (due to timeouts) will still get updated if the +// data comes in later. +TEST_F(TextDatabaseManagerTest, PartialComplete) { + ASSERT_TRUE(Init()); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); + ASSERT_TRUE(manager.Init()); + + Time added_time = Time::Now(); + GURL url(kURL1); + + // We have to have the URL in the URL and visit databases for this test to + // work. + URLRow url_row(url); + url_row.set_title(L"chocolate"); + URLID url_id = visit_db.AddURL(url_row); + ASSERT_TRUE(url_id); + VisitRow visit_row; + visit_row.url_id = url_id; + visit_row.visit_time = added_time; + visit_db.AddVisit(&visit_row); + + // Add a URL with no title or body, and say that it expired. + manager.AddPageURL(url, 0, 0, added_time); + TimeTicks expire_time = TimeTicks::Now() + TimeDelta::FromDays(1); + manager.FlushOldChangesForTime(expire_time); + + // Add the title. We should be able to query based on that. The title in the + // URL row we set above should not come into the picture. + manager.AddPageTitle(url, L"Some unique title"); + Time first_time_searched; + QueryOptions options; + std::vector<TextDatabase::Match> results; + manager.GetTextMatches(L"unique", options, &results, &first_time_searched); + EXPECT_EQ(1, results.size()); + manager.GetTextMatches(L"chocolate", options, &results, &first_time_searched); + EXPECT_EQ(0, results.size()); + + // Now add the body, which should be queryable. + manager.AddPageContents(url, L"Very awesome body"); + manager.GetTextMatches(L"awesome", options, &results, &first_time_searched); + EXPECT_EQ(1, results.size()); + + // Adding the body will actually copy the title from the URL table rather + // than the previously indexed row (we made them not match above). This isn't + // necessarily what we want, but it's how it's implemented, and we don't want + // to regress it. + manager.GetTextMatches(L"chocolate", options, &results, &first_time_searched); + EXPECT_EQ(1, results.size()); +} + // Tests that changes get properly committed to disk. TEST_F(TextDatabaseManagerTest, Writing) { ASSERT_TRUE(Init()); @@ -313,11 +365,11 @@ TEST_F(TextDatabaseManagerTest, Writing) { std::vector<TextDatabase::Match> results; Time first_time_searched; - InMemVisitDB visit_db; + InMemDB visit_db; // Create the manager and write some stuff to it. { - TextDatabaseManager manager(dir_, &visit_db); + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); std::vector<Time> times; @@ -331,7 +383,7 @@ TEST_F(TextDatabaseManagerTest, Writing) { // Recreate the manager and make sure it finds the written stuff. { - TextDatabaseManager manager(dir_, &visit_db); + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); // We should have matched every page again. @@ -349,11 +401,11 @@ TEST_F(TextDatabaseManagerTest, WritingTransaction) { std::vector<TextDatabase::Match> results; Time first_time_searched; - InMemVisitDB visit_db; + InMemDB visit_db; // Create the manager and write some stuff to it. { - TextDatabaseManager manager(dir_, &visit_db); + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); std::vector<Time> times; @@ -369,7 +421,7 @@ TEST_F(TextDatabaseManagerTest, WritingTransaction) { // Recreate the manager and make sure it finds the written stuff. { - TextDatabaseManager manager(dir_, &visit_db); + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); // We should have matched every page again. @@ -381,8 +433,8 @@ TEST_F(TextDatabaseManagerTest, WritingTransaction) { // Tests querying where the maximum number of items is met. TEST_F(TextDatabaseManagerTest, QueryMax) { ASSERT_TRUE(Init()); - InMemVisitDB visit_db; - TextDatabaseManager manager(dir_, &visit_db); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); std::vector<Time> times; @@ -418,8 +470,8 @@ TEST_F(TextDatabaseManagerTest, QueryMax) { // Tests querying backwards in time in chunks. TEST_F(TextDatabaseManagerTest, QueryBackwards) { ASSERT_TRUE(Init()); - InMemVisitDB visit_db; - TextDatabaseManager manager(dir_, &visit_db); + InMemDB visit_db; + TextDatabaseManager manager(dir_, &visit_db, &visit_db); ASSERT_TRUE(manager.Init()); std::vector<Time> times; |