// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/browser/history/visitsegment_database.h" #include #include #include #include #include "base/command_line.h" #include "base/logging.h" #include "base/stl_util.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" #include "chrome/browser/history/page_usage_data.h" #include "chrome/common/chrome_switches.h" #include "sql/statement.h" #include "sql/transaction.h" // The following tables are used to store url segment information. // // segments // id Primary key // name A unique string to represent that segment. (URL derived) // url_id ID of the url currently used to represent this segment. // // segment_usage // id Primary key // segment_id Corresponding segment id // time_slot time stamp identifying for what day this entry is about // visit_count Number of visit in the segment // // segment_duration // id Primary key // segment_id Corresponding segment id // time_slot time stamp identifying what day this entry is for // duration Total time during the time_slot the user has been on // the page. This is a serialized TimeDelta value. // segment_duration is only created if chrome::kTrackActiveVisitTime is set. namespace history { VisitSegmentDatabase::VisitSegmentDatabase() : has_duration_table_(CommandLine::ForCurrentProcess()->HasSwitch( switches::kTrackActiveVisitTime)) { } VisitSegmentDatabase::~VisitSegmentDatabase() { } bool VisitSegmentDatabase::InitSegmentTables() { // Segments table. if (!GetDB().DoesTableExist("segments")) { if (!GetDB().Execute("CREATE TABLE segments (" "id INTEGER PRIMARY KEY," "name VARCHAR," "url_id INTEGER NON NULL)")) { return false; } if (!GetDB().Execute( "CREATE INDEX segments_name ON segments(name)")) { return false; } } // This was added later, so we need to try to create it even if the table // already exists. if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_url_id ON " "segments(url_id)")) return false; // Segment usage table. if (!GetDB().DoesTableExist("segment_usage")) { if (!GetDB().Execute("CREATE TABLE segment_usage (" "id INTEGER PRIMARY KEY," "segment_id INTEGER NOT NULL," "time_slot INTEGER NOT NULL," "visit_count INTEGER DEFAULT 0 NOT NULL)")) { return false; } if (!GetDB().Execute( "CREATE INDEX segment_usage_time_slot_segment_id ON " "segment_usage(time_slot, segment_id)")) { return false; } } // Added in a later version, so we always need to try to creat this index. if (!GetDB().Execute("CREATE INDEX IF NOT EXISTS segments_usage_seg_id " "ON segment_usage(segment_id)")) return false; // TODO(sky): if we decide to keep this feature duration should be added to // segument_usage. if (has_duration_table_ && !GetDB().DoesTableExist("segment_duration")) { if (!GetDB().Execute("CREATE TABLE segment_duration (" "id INTEGER PRIMARY KEY," "segment_id INTEGER NOT NULL," "time_slot INTEGER NOT NULL," "duration INTEGER DEFAULT 0 NOT NULL)")) { return false; } if (!GetDB().Execute( "CREATE INDEX segment_duration_time_slot_segment_id ON " "segment_duration(time_slot, segment_id)")) { return false; } } else if (!has_duration_table_ && !GetDB().Execute("DROP TABLE IF EXISTS segment_duration")) { return false; } return true; } bool VisitSegmentDatabase::DropSegmentTables() { // Dropping the tables will implicitly delete the indices. return GetDB().Execute("DROP TABLE segments") && GetDB().Execute("DROP TABLE segment_usage") && GetDB().Execute("DROP TABLE IF EXISTS segment_duration"); } // Note: the segment name is derived from the URL but is not a URL. It is // a string that can be easily recreated from various URLS. Maybe this should // be an MD5 to limit the length. // // static std::string VisitSegmentDatabase::ComputeSegmentName(const GURL& url) { // TODO(brettw) this should probably use the registry controlled // domains service. GURL::Replacements r; const char kWWWDot[] = "www."; const int kWWWDotLen = arraysize(kWWWDot) - 1; std::string host = url.host(); const char* host_c = host.c_str(); // Remove www. to avoid some dups. if (static_cast(host.size()) > kWWWDotLen && LowerCaseEqualsASCII(host_c, host_c + kWWWDotLen, kWWWDot)) { r.SetHost(host.c_str(), url_parse::Component(kWWWDotLen, static_cast(host.size()) - kWWWDotLen)); } // Remove other stuff we don't want. r.ClearUsername(); r.ClearPassword(); r.ClearQuery(); r.ClearRef(); r.ClearPort(); return url.ReplaceComponents(r).spec(); } // static base::Time VisitSegmentDatabase::SegmentTime(base::Time time) { return time.LocalMidnight(); } SegmentID VisitSegmentDatabase::GetSegmentNamed( const std::string& segment_name) { sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT id FROM segments WHERE name = ?")); statement.BindString(0, segment_name); if (statement.Step()) return statement.ColumnInt64(0); return 0; } bool VisitSegmentDatabase::UpdateSegmentRepresentationURL(SegmentID segment_id, URLID url_id) { sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "UPDATE segments SET url_id = ? WHERE id = ?")); statement.BindInt64(0, url_id); statement.BindInt64(1, segment_id); return statement.Run(); } URLID VisitSegmentDatabase::GetSegmentRepresentationURL(SegmentID segment_id) { sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT url_id FROM segments WHERE id = ?")); statement.BindInt64(0, segment_id); if (statement.Step()) return statement.ColumnInt64(0); return 0; } SegmentID VisitSegmentDatabase::CreateSegment(URLID url_id, const std::string& segment_name) { sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "INSERT INTO segments (name, url_id) VALUES (?,?)")); statement.BindString(0, segment_name); statement.BindInt64(1, url_id); if (statement.Run()) return GetDB().GetLastInsertRowId(); return 0; } bool VisitSegmentDatabase::IncreaseSegmentVisitCount(SegmentID segment_id, base::Time ts, int amount) { base::Time t = SegmentTime(ts); sql::Statement select(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT id, visit_count FROM segment_usage " "WHERE time_slot = ? AND segment_id = ?")); select.BindInt64(0, t.ToInternalValue()); select.BindInt64(1, segment_id); if (!select.is_valid()) return false; if (select.Step()) { sql::Statement update(GetDB().GetCachedStatement(SQL_FROM_HERE, "UPDATE segment_usage SET visit_count = ? WHERE id = ?")); update.BindInt64(0, select.ColumnInt64(1) + static_cast(amount)); update.BindInt64(1, select.ColumnInt64(0)); return update.Run(); } else { sql::Statement insert(GetDB().GetCachedStatement(SQL_FROM_HERE, "INSERT INTO segment_usage " "(segment_id, time_slot, visit_count) VALUES (?, ?, ?)")); insert.BindInt64(0, segment_id); insert.BindInt64(1, t.ToInternalValue()); insert.BindInt64(2, static_cast(amount)); return insert.Run(); } } void VisitSegmentDatabase::QuerySegmentUsage( base::Time from_time, int max_result_count, std::vector* result) { // Gather all the segment scores. sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT segment_id, time_slot, visit_count " "FROM segment_usage WHERE time_slot >= ? " "ORDER BY segment_id")); if (!statement.is_valid()) return; QuerySegmentsCommon(&statement, from_time, max_result_count, QUERY_VISIT_COUNT, result); } bool VisitSegmentDatabase::DeleteSegmentData(base::Time older_than) { sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "DELETE FROM segment_usage WHERE time_slot < ?")); statement.BindInt64(0, SegmentTime(older_than).ToInternalValue()); if (!statement.Run()) return false; if (!has_duration_table_) return true; sql::Statement duration_statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "DELETE FROM segment_duration WHERE time_slot < ?")); duration_statement.BindInt64(0, SegmentTime(older_than).ToInternalValue()); return duration_statement.Run(); } bool VisitSegmentDatabase::DeleteSegmentForURL(URLID url_id) { sql::Statement delete_usage(GetDB().GetCachedStatement(SQL_FROM_HERE, "DELETE FROM segment_usage WHERE segment_id IN " "(SELECT id FROM segments WHERE url_id = ?)")); delete_usage.BindInt64(0, url_id); if (!delete_usage.Run()) return false; if (has_duration_table_) { sql::Statement delete_duration(GetDB().GetCachedStatement(SQL_FROM_HERE, "DELETE FROM segment_duration WHERE segment_id IN " "(SELECT id FROM segments WHERE url_id = ?)")); delete_duration.BindInt64(0, url_id); if (!delete_duration.Run()) return false; } sql::Statement delete_seg(GetDB().GetCachedStatement(SQL_FROM_HERE, "DELETE FROM segments WHERE url_id = ?")); delete_seg.BindInt64(0, url_id); return delete_seg.Run(); } SegmentDurationID VisitSegmentDatabase::CreateSegmentDuration( SegmentID segment_id, base::Time time, base::TimeDelta delta) { if (!has_duration_table_) return 0; sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "INSERT INTO segment_duration (segment_id, time_slot, duration) " "VALUES (?,?,?)")); statement.BindInt64(0, segment_id); statement.BindInt64(1, SegmentTime(time).ToInternalValue()); statement.BindInt64(2, delta.ToInternalValue()); return statement.Run() ? GetDB().GetLastInsertRowId() : 0; } bool VisitSegmentDatabase::SetSegmentDuration(SegmentDurationID duration_id, base::TimeDelta time_delta) { if (!has_duration_table_) return false; sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "UPDATE segment_duration SET duration = ? WHERE id = ?")); statement.BindInt64(0, time_delta.ToInternalValue()); statement.BindInt64(1, duration_id); return statement.Run(); } bool VisitSegmentDatabase::GetSegmentDuration(SegmentID segment_id, base::Time time, SegmentDurationID* duration_id, base::TimeDelta* time_delta) { if (!has_duration_table_) return false; sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT id, duration FROM segment_duration " "WHERE segment_id = ? AND time_slot = ? ")); if (!statement.is_valid()) return false; statement.BindInt64(0, segment_id); statement.BindInt64(1, SegmentTime(time).ToInternalValue()); if (!statement.Step()) return false; *duration_id = statement.ColumnInt64(0); *time_delta = base::TimeDelta::FromInternalValue(statement.ColumnInt64(1)); return true; } void VisitSegmentDatabase::QuerySegmentDuration( base::Time from_time, int max_result_count, std::vector* result) { if (!has_duration_table_) return; // Gather all the segment scores. sql::Statement statement(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT segment_id, time_slot, duration " "FROM segment_duration WHERE time_slot >= ? " "ORDER BY segment_id")); if (!statement.is_valid()) return; QuerySegmentsCommon(&statement, from_time, max_result_count, QUERY_DURATION, result); } bool VisitSegmentDatabase::MigratePresentationIndex() { sql::Transaction transaction(&GetDB()); return transaction.Begin() && GetDB().Execute("DROP TABLE presentation") && GetDB().Execute("CREATE TABLE segments_tmp (" "id INTEGER PRIMARY KEY," "name VARCHAR," "url_id INTEGER NON NULL)") && GetDB().Execute("INSERT INTO segments_tmp SELECT " "id, name, url_id FROM segments") && GetDB().Execute("DROP TABLE segments") && GetDB().Execute("ALTER TABLE segments_tmp RENAME TO segments") && transaction.Commit(); } void VisitSegmentDatabase::QuerySegmentsCommon( sql::Statement* statement, base::Time from_time, int max_result_count, QueryType query_type, std::vector* result) { // This function gathers the highest-ranked segments in two queries. // The first gathers scores for all segments. // The second gathers segment data (url, title, etc.) for the highest-ranked // segments. base::Time ts = SegmentTime(from_time); statement->BindInt64(0, ts.ToInternalValue()); base::Time now = base::Time::Now(); SegmentID last_segment_id = 0; PageUsageData* pud = NULL; float score = 0; base::TimeDelta duration; while (statement->Step()) { SegmentID segment_id = statement->ColumnInt64(0); if (segment_id != last_segment_id) { if (pud) { pud->SetScore(score); pud->SetDuration(duration); result->push_back(pud); } pud = new PageUsageData(segment_id); score = 0; last_segment_id = segment_id; duration = base::TimeDelta(); } base::Time timeslot = base::Time::FromInternalValue(statement->ColumnInt64(1)); int count; if (query_type == QUERY_VISIT_COUNT) { count = statement->ColumnInt(2); } else { base::TimeDelta current_duration( base::TimeDelta::FromInternalValue(statement->ColumnInt64(2))); duration += current_duration; // Souldn't overflow since we group by day. count = static_cast(current_duration.InSeconds()); } float day_score = 1.0f + log(static_cast(count)); // Recent visits count more than historical ones, so we multiply in a boost // related to how long ago this day was. // This boost is a curve that smoothly goes through these values: // Today gets 3x, a week ago 2x, three weeks ago 1.5x, falling off to 1x // at the limit of how far we reach into the past. int days_ago = (now - timeslot).InDays(); float recency_boost = 1.0f + (2.0f * (1.0f / (1.0f + days_ago/7.0f))); score += recency_boost * day_score; } if (pud) { pud->SetScore(score); pud->SetDuration(duration); result->push_back(pud); } // Limit to the top kResultCount results. std::sort(result->begin(), result->end(), PageUsageData::Predicate); if (static_cast(result->size()) > max_result_count) { STLDeleteContainerPointers(result->begin() + max_result_count, result->end()); result->resize(max_result_count); } // Now fetch the details about the entries we care about. sql::Statement statement2(GetDB().GetCachedStatement(SQL_FROM_HERE, "SELECT urls.url, urls.title FROM urls " "JOIN segments ON segments.url_id = urls.id " "WHERE segments.id = ?")); if (!statement2.is_valid()) return; for (size_t i = 0; i < result->size(); ++i) { PageUsageData* pud = (*result)[i]; statement2.BindInt64(0, pud->GetID()); if (statement2.Step()) { pud->SetURL(GURL(statement2.ColumnString(0))); pud->SetTitle(statement2.ColumnString16(1)); } statement2.Reset(true); } } } // namespace history