// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/browser/history/text_database_manager.h" #include #include #include "base/bind.h" #include "base/compiler_specific.h" #include "base/files/file_enumerator.h" #include "base/logging.h" #include "base/message_loop.h" #include "base/metrics/histogram.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" #include "chrome/browser/history/history_publisher.h" #include "chrome/browser/history/visit_database.h" using base::Time; using base::TimeDelta; using base::TimeTicks; namespace history { namespace { // The number of database files we will be attached to at once. const int kCacheDBSize = 5; std::string ConvertStringForIndexer(const string16& input) { // TODO(evanm): other transformations here? return UTF16ToUTF8(CollapseWhitespace(input, false)); } // Data older than this will be committed to the full text index even if we // haven't gotten a title and/or body. const int kExpirationSeconds = 20; } // namespace // TextDatabaseManager::ChangeSet ---------------------------------------------- TextDatabaseManager::ChangeSet::ChangeSet() {} TextDatabaseManager::ChangeSet::~ChangeSet() {} // TextDatabaseManager::PageInfo ----------------------------------------------- TextDatabaseManager::PageInfo::PageInfo(URLID url_id, VisitID visit_id, Time visit_time) : url_id_(url_id), visit_id_(visit_id), visit_time_(visit_time) { added_time_ = TimeTicks::Now(); } TextDatabaseManager::PageInfo::~PageInfo() {} void TextDatabaseManager::PageInfo::set_title(const string16& ttl) { if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet. title_ = ASCIIToUTF16(" "); else title_ = ttl; } void TextDatabaseManager::PageInfo::set_body(const string16& bdy) { if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet. body_ = ASCIIToUTF16(" "); else body_ = bdy; } bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const { return now - added_time_ > base::TimeDelta::FromSeconds(kExpirationSeconds); } // TextDatabaseManager --------------------------------------------------------- TextDatabaseManager::TextDatabaseManager(const base::FilePath& dir, URLDatabase* url_database, VisitDatabase* visit_database) : dir_(dir), url_database_(url_database), visit_database_(visit_database), recent_changes_(RecentChangeList::NO_AUTO_EVICT), transaction_nesting_(0), db_cache_(DBCache::NO_AUTO_EVICT), present_databases_loaded_(false), weak_factory_(this), history_publisher_(NULL) { } TextDatabaseManager::~TextDatabaseManager() { if (transaction_nesting_) CommitTransaction(); } // static TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) { Time::Exploded exploded; time.UTCExplode(&exploded); // We combine the month and year into a 6-digit number (200801 for // January, 2008). The month is 1-based. return exploded.year * 100 + exploded.month; } // static Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) { Time::Exploded exploded; memset(&exploded, 0, sizeof(Time::Exploded)); exploded.year = id / 100; exploded.month = id % 100; return Time::FromUTCExploded(exploded); } bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) { history_publisher_ = history_publisher; // Start checking recent changes and committing them. ScheduleFlushOldChanges(); return true; } void TextDatabaseManager::BeginTransaction() { transaction_nesting_++; } void TextDatabaseManager::CommitTransaction() { DCHECK(transaction_nesting_); transaction_nesting_--; if (transaction_nesting_) return; // Still more nesting of transactions before committing. // Commit all databases with open transactions on them. for (DBIdentSet::const_iterator i = open_transactions_.begin(); i != open_transactions_.end(); ++i) { DBCache::iterator iter = db_cache_.Get(*i); if (iter == db_cache_.end()) { NOTREACHED() << "All open transactions should be cached."; continue; } iter->second->CommitTransaction(); } open_transactions_.clear(); // Now that the transaction is over, we can expire old connections. db_cache_.ShrinkToSize(kCacheDBSize); } void TextDatabaseManager::InitDBList() { if (present_databases_loaded_) return; present_databases_loaded_ = true; // Find files on disk matching our pattern so we can quickly test for them. base::FilePath::StringType filepattern(TextDatabase::file_base()); filepattern.append(FILE_PATH_LITERAL("*")); base::FileEnumerator enumerator( dir_, false, base::FileEnumerator::FILES, filepattern); base::FilePath cur_file; while (!(cur_file = enumerator.Next()).empty()) { // Convert to the number representing this file. TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file); if (id) // Will be 0 on error. present_databases_.insert(id); } } void TextDatabaseManager::AddPageURL(const GURL& url, URLID url_id, VisitID visit_id, Time time) { // Delete any existing page info. RecentChangeList::iterator found = recent_changes_.Peek(url); if (found != recent_changes_.end()) recent_changes_.Erase(found); // Just save this info for later. We will save it when it expires or when all // the data is complete. recent_changes_.Put(url, PageInfo(url_id, visit_id, time)); } void TextDatabaseManager::AddPageTitle(const GURL& url, const string16& title) { RecentChangeList::iterator found = recent_changes_.Peek(url); if (found == recent_changes_.end()) { // This page is not in our cache of recent pages. This is very much an edge // case as normally a title will come in <20 seconds after the page commits, // and WebContents will avoid spamming us with >1 title per page. However, // it could come up if your connection is unhappy, and we don't want to // miss anything. // // To solve this problem, we'll just associate the most recent visit with // the new title and index that using the regular code path. URLRow url_row; if (!url_database_->GetRowForURL(url, &url_row)) return; // URL is unknown, give up. VisitRow visit; if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) return; // No recent visit, give up. if (visit.is_indexed) { // If this page was already indexed, we could have a body that came in // first and we don't want to overwrite it. We could go query for the // current body, or have a special setter for only the title, but this is // not worth it for this edge case. // // It will be almost impossible for the title to take longer than // kExpirationSeconds yet we got a body in less than that time, since // the title should always come in first. return; } AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, title, string16()); return; // We don't know about this page, give up. } PageInfo& info = found->second; if (info.has_body()) { // This info is complete, write to the database. AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(), title, info.body()); recent_changes_.Erase(found); return; } info.set_title(title); } void TextDatabaseManager::AddPageContents(const GURL& url, const string16& body) { RecentChangeList::iterator found = recent_changes_.Peek(url); if (found == recent_changes_.end()) { // This page is not in our cache of recent pages. This means that the page // took more than kExpirationSeconds to load. Often, this will be the result // of a very slow iframe or other resource on the page that makes us think // it's still loading. // // As a fallback, set the most recent visit's contents using the input, and // use the last set title in the URL table as the title to index. URLRow url_row; if (!url_database_->GetRowForURL(url, &url_row)) return; // URL is unknown, give up. VisitRow visit; if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) return; // No recent visit, give up. // Use the title from the URL row as the title for the indexing. AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, url_row.title(), body); return; } PageInfo& info = found->second; if (info.has_title()) { // This info is complete, write to the database. AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(), info.title(), body); recent_changes_.Erase(found); return; } info.set_body(body); } bool TextDatabaseManager::AddPageData(const GURL& url, URLID url_id, VisitID visit_id, Time visit_time, const string16& title, const string16& body) { TextDatabase* db = GetDBForTime(visit_time, true); if (!db) return false; TimeTicks beginning_time = TimeTicks::Now(); // First delete any recently-indexed data for this page. This will delete // anything in the main database, but we don't bother looking through the // archived database. VisitVector visits; visit_database_->GetIndexedVisitsForURL(url_id, &visits); for (size_t i = 0; i < visits.size(); i++) { visits[i].is_indexed = false; visit_database_->UpdateVisitRow(visits[i]); DeletePageData(visits[i].visit_time, url, NULL); } if (visit_id) { // We're supposed to update the visit database, so load the visit. VisitRow row; if (!visit_database_->GetRowForVisit(visit_id, &row)) { // This situation can occur if Chrome's history is in the process of // being updated, and then the browsing history is deleted before all // updates have been completely performed. In this case, a stale update // to the database is attempted, leading to the warning below. DLOG(WARNING) << "Could not find requested visit #" << visit_id; return false; } DCHECK(visit_time == row.visit_time); // Update the visit database to reference our addition. row.is_indexed = true; if (!visit_database_->UpdateVisitRow(row)) return false; } // Now index the data. std::string url_str = URLDatabase::GURLToDatabaseURL(url); bool success = db->AddPageData(visit_time, url_str, ConvertStringForIndexer(title), ConvertStringForIndexer(body)); UMA_HISTOGRAM_TIMES("History.AddFTSData", TimeTicks::Now() - beginning_time); if (history_publisher_) history_publisher_->PublishPageContent(visit_time, url, title, body); return success; } void TextDatabaseManager::DeletePageData(Time time, const GURL& url, ChangeSet* change_set) { TextDatabase::DBIdent db_ident = TimeToID(time); // We want to open the database for writing, but only if it exists. To // achieve this, we check whether it exists by saying we're not going to // write to it (avoiding the autocreation code normally called when writing) // and then access it for writing only if it succeeds. TextDatabase* db = GetDB(db_ident, false); if (!db) return; db = GetDB(db_ident, true); if (change_set) change_set->Add(db_ident); db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url)); } void TextDatabaseManager::DeleteFromUncommitted( const std::set& restrict_urls, Time begin, Time end) { // First find the beginning of the range to delete. Recall that the list // has the most recent item at the beginning. There won't normally be very // many items, so a brute-force search is fine. RecentChangeList::iterator cur = recent_changes_.begin(); if (!end.is_null()) { // Walk from the beginning of the list backwards in time to find the newest // entry that should be deleted. while (cur != recent_changes_.end() && cur->second.visit_time() >= end) ++cur; } // Now delete all visits up to the oldest one we were supposed to delete. // Note that if begin is_null, it will be less than or equal to any other // time. if (restrict_urls.empty()) { while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) cur = recent_changes_.Erase(cur); } else { while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) { if (restrict_urls.find(cur->first) != restrict_urls.end()) cur = recent_changes_.Erase(cur); else ++cur; } } } void TextDatabaseManager::DeleteFromUncommittedForTimes( const std::vector& times) { // |times| must be in reverse chronological order, i.e. each member // must be earlier than or the same as the one before it. DCHECK( std::adjacent_find( times.begin(), times.end(), std::less()) == times.end()); // Both |recent_changes_| and |times| are in reverse chronological order. RecentChangeList::iterator it = recent_changes_.begin(); std::vector::const_iterator time_it = times.begin(); while (it != recent_changes_.end() && time_it != times.end()) { base::Time visit_time = it->second.visit_time(); if (visit_time == *time_it) { it = recent_changes_.Erase(it); } else if (visit_time < *time_it) { ++time_it; } else /* if (visit_time > *time_it) */ { ++it; } } } void TextDatabaseManager::DeleteAll() { DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction."; InitDBList(); // Delete uncommitted entries. recent_changes_.Clear(); // Close all open databases. db_cache_.Clear(); // Now go through and delete all the files. for (DBIdentSet::iterator i = present_databases_.begin(); i != present_databases_.end(); ++i) { base::FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i)); sql::Connection::Delete(file_name); } } void TextDatabaseManager::OptimizeChangedDatabases( const ChangeSet& change_set) { for (ChangeSet::DBSet::const_iterator i = change_set.changed_databases_.begin(); i != change_set.changed_databases_.end(); ++i) { // We want to open the database for writing, but only if it exists. To // achieve this, we check whether it exists by saying we're not going to // write to it (avoiding the autocreation code normally called when writing) // and then access it for writing only if it succeeds. TextDatabase* db = GetDB(*i, false); if (!db) continue; db = GetDB(*i, true); if (!db) continue; // The file may have changed or something. db->Optimize(); } } void TextDatabaseManager::GetTextMatches( const string16& query, const QueryOptions& options, std::vector* results, Time* first_time_searched) { results->clear(); *first_time_searched = options.begin_time; InitDBList(); if (present_databases_.empty()) return; // Nothing to search. // Get the query into the proper format for the individual DBs. string16 fts_query16; query_parser_.ParseQuery(query, &fts_query16); std::string fts_query = UTF16ToUTF8(fts_query16); // Need a copy of the options so we can modify the max count for each call // to the individual databases. QueryOptions cur_options(options); // Compute the minimum and maximum values for the identifiers that could // encompass the input time range. TextDatabase::DBIdent min_ident = options.begin_time.is_null() ? *present_databases_.begin() : TimeToID(options.begin_time); TextDatabase::DBIdent max_ident = options.end_time.is_null() ? *present_databases_.rbegin() : TimeToID(options.end_time); // Iterate over the databases from the most recent backwards. TextDatabase::URLSet found_urls; for (DBIdentSet::reverse_iterator i = present_databases_.rbegin(); i != present_databases_.rend(); ++i) { // TODO(brettw) allow canceling the query in the middle. // if (canceled_or_something) // break; // This code is stupid, we just loop until we find the correct starting // time range rather than search in an intelligent way. Users will have a // few dozen files at most, so this should not be an issue. if (*i > max_ident) continue; // Haven't gotten to the time range yet. if (*i < min_ident) break; // Covered all the time range. TextDatabase* cur_db = GetDB(*i, false); if (!cur_db) continue; // Adjust the max count according to how many results we've already got. if (options.max_count) { cur_options.max_count = options.max_count - static_cast(results->size()); } bool has_more_results = cur_db->GetTextMatches( fts_query, cur_options, results, &found_urls); DCHECK(static_cast(results->size()) <= options.EffectiveMaxCount()); if (has_more_results || static_cast(results->size()) == options.EffectiveMaxCount()) { // Since the search proceeds backwards in time, the last result we have // gives the first time searched. *first_time_searched = results->back().time; break; } } } size_t TextDatabaseManager::GetUncommittedEntryCountForTest() const { return recent_changes_.size(); } TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id, bool for_writing) { DBCache::iterator found_db = db_cache_.Get(id); if (found_db != db_cache_.end()) { if (transaction_nesting_ && for_writing && open_transactions_.find(id) == open_transactions_.end()) { // If we currently have an open transaction, that database is not yet // part of the transaction, and the database will be written to, it needs // to be part of our transaction. found_db->second->BeginTransaction(); open_transactions_.insert(id); } return found_db->second; } // Need to make the database. TextDatabase* new_db = new TextDatabase(dir_, id, for_writing); if (!new_db->Init()) { delete new_db; return NULL; } db_cache_.Put(id, new_db); present_databases_.insert(id); if (transaction_nesting_ && for_writing) { // If we currently have an open transaction and the new database will be // written to, it needs to be part of our transaction. new_db->BeginTransaction(); open_transactions_.insert(id); } // When no transaction is open, allow this new one to kick out an old one. if (!transaction_nesting_) db_cache_.ShrinkToSize(kCacheDBSize); return new_db; } TextDatabase* TextDatabaseManager::GetDBForTime(Time time, bool create_if_necessary) { return GetDB(TimeToID(time), create_if_necessary); } void TextDatabaseManager::ScheduleFlushOldChanges() { weak_factory_.InvalidateWeakPtrs(); base::MessageLoop::current()->PostDelayedTask( FROM_HERE, base::Bind(&TextDatabaseManager::FlushOldChanges, weak_factory_.GetWeakPtr()), base::TimeDelta::FromSeconds(kExpirationSeconds)); } void TextDatabaseManager::FlushOldChanges() { FlushOldChangesForTime(TimeTicks::Now()); } void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) { // The end of the list is the oldest, so we just start from there committing // things until we get something too new. RecentChangeList::reverse_iterator i = recent_changes_.rbegin(); while (i != recent_changes_.rend() && i->second.Expired(now)) { AddPageData(i->first, i->second.url_id(), i->second.visit_id(), i->second.visit_time(), i->second.title(), i->second.body()); i = recent_changes_.Erase(i); } ScheduleFlushOldChanges(); } } // namespace history