summaryrefslogtreecommitdiffstats
path: root/chrome/browser/history/text_database_manager.cc
blob: 2feea81223a3c2463bce384af0448fd90c4f0389 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/history/text_database_manager.h"

#include "base/compiler_specific.h"
#include "base/file_util.h"
#include "base/histogram.h"
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/string_util.h"
#include "chrome/browser/history/history_publisher.h"
#include "chrome/common/mru_cache.h"

using base::Time;
using base::TimeDelta;
using base::TimeTicks;

namespace history {

namespace {

// The number of database files we will be attached to at once.
const int kCacheDBSize = 5;

std::string ConvertStringForIndexer(
    const std::wstring& input) {
  // TODO(evanm): other transformations here?
  return WideToUTF8(CollapseWhitespace(input, false));
}

// Data older than this will be committed to the full text index even if we
// haven't gotten a title and/or body.
const int kExpirationSec = 20;

}  // namespace

// TextDatabaseManager::PageInfo -----------------------------------------------

TextDatabaseManager::PageInfo::PageInfo(URLID url_id,
                                        VisitID visit_id,
                                        Time visit_time)
    : url_id_(url_id),
      visit_id_(visit_id),
      visit_time_(visit_time) {
  added_time_ = TimeTicks::Now();
}

void TextDatabaseManager::PageInfo::set_title(const std::wstring& ttl) {
  if (ttl.empty())  // Make the title nonempty when we set it for EverybodySet.
    title_ = L" ";
  else
    title_ = ttl;
}

void TextDatabaseManager::PageInfo::set_body(const std::wstring& bdy) {
  if (bdy.empty())  // Make the body nonempty when we set it for EverybodySet.
    body_ = L" ";
  else
    body_ = bdy;
}

bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const {
  return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec);
}

// TextDatabaseManager ---------------------------------------------------------

TextDatabaseManager::TextDatabaseManager(const std::wstring& dir,
                                         URLDatabase* url_database,
                                         VisitDatabase* visit_database)
    : dir_(dir),
      db_(NULL),
      url_database_(url_database),
      visit_database_(visit_database),
      recent_changes_(RecentChangeList::NO_AUTO_EVICT),
      transaction_nesting_(0),
      db_cache_(DBCache::NO_AUTO_EVICT),
      present_databases_loaded_(false),
      ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)),
      history_publisher_(NULL) {
}

TextDatabaseManager::~TextDatabaseManager() {
  if (transaction_nesting_)
    CommitTransaction();
}

// static
TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) {
  Time::Exploded exploded;
  time.UTCExplode(&exploded);

  // We combine the month and year into a 6-digit number (200801 for
  // January, 2008). The month is 1-based.
  return exploded.year * 100 + exploded.month;
}

// static
Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) {
  Time::Exploded exploded;
  memset(&exploded, 0, sizeof(Time::Exploded));
  exploded.year = id / 100;
  exploded.month = id % 100;
  return Time::FromUTCExploded(exploded);
}

bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) {
  history_publisher_ = history_publisher;

  // Start checking recent changes and committing them.
  ScheduleFlushOldChanges();
  return true;
}

void TextDatabaseManager::BeginTransaction() {
  transaction_nesting_++;
}

void TextDatabaseManager::CommitTransaction() {
  DCHECK(transaction_nesting_);
  transaction_nesting_--;
  if (transaction_nesting_)
    return;  // Still more nesting of transactions before committing.

  // Commit all databases with open transactions on them.
  for (DBIdentSet::const_iterator i = open_transactions_.begin();
       i != open_transactions_.end(); ++i) {
    DBCache::iterator iter = db_cache_.Get(*i);
    if (iter == db_cache_.end()) {
      NOTREACHED() << "All open transactions should be cached.";
      continue;
    }
    iter->second->CommitTransaction();
  }
  open_transactions_.clear();

  // Now that the transaction is over, we can expire old connections.
  db_cache_.ShrinkToSize(kCacheDBSize);
}

void TextDatabaseManager::InitDBList() {
  if (present_databases_loaded_)
    return;

  present_databases_loaded_ = true;

  // Find files on disk matching our pattern so we can quickly test for them.
  file_util::FileEnumerator enumerator(FilePath::FromWStringHack(dir_), false,
      file_util::FileEnumerator::FILES,
      FilePath::FromWStringHack(
          std::wstring(TextDatabase::file_base()) + L"*").value());
  std::wstring cur_file;
  while (!(cur_file = enumerator.Next().ToWStringHack()).empty()) {
    // Convert to the number representing this file.
    TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file);
    if (id)  // Will be 0 on error.
      present_databases_.insert(id);
  }
}

void TextDatabaseManager::AddPageURL(const GURL& url,
                                     URLID url_id,
                                     VisitID visit_id,
                                     Time time) {
  // Delete any existing page info.
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found != recent_changes_.end())
    recent_changes_.Erase(found);

  // Just save this info for later. We will save it when it expires or when all
  // the data is complete.
  recent_changes_.Put(url, PageInfo(url_id, visit_id, time));
}

void TextDatabaseManager::AddPageTitle(const GURL& url,
                                       const std::wstring& title) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This is very much an edge
    // case as normally a title will come in <20 seconds after the page commits,
    // and WebContents will avoid spamming us with >1 title per page. However,
    // it could come up if your connection is unhappy, and we don't want to
    // miss anything.
    //
    // To solve this problem, we'll just associate the most recent visit with
    // the new title and index that using the regular code path.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    if (visit.is_indexed) {
      // If this page was already indexed, we could have a body that came in
      // first and we don't want to overwrite it. We could go query for the
      // current body, or have a special setter for only the title, but this is
      // not worth it for this edge case.
      //
      // It will be almost impossible for the title to take longer than
      // kExpirationSec yet we got a body in less than that time, since the
      // title should always come in first.
      return;
    }

    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                title, std::wstring());
    return;  // We don't know about this page, give up.
  }

  PageInfo& info = found->second;
  if (info.has_body()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                title, info.body());
    recent_changes_.Erase(found);
    return;
  }

  info.set_title(title);
}

void TextDatabaseManager::AddPageContents(const GURL& url,
                                          const std::wstring& body) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end()) {
    // This page is not in our cache of recent pages. This means that the page
    // took more than kExpirationSec to load. Often, this will be the result of
    // a very slow iframe or other resource on the page that makes us think its
    // still loading.
    //
    // As a fallback, set the most recent visit's contents using the input, and
    // use the last set title in the URL table as the title to index.
    URLRow url_row;
    if (!url_database_->GetRowForURL(url, &url_row))
      return;  // URL is unknown, give up.
    VisitRow visit;
    if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit))
      return;  // No recent visit, give up.

    // Use the title from the URL row as the title for the indexing.
    AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time,
                url_row.title(), body);
    return;
  }

  PageInfo& info = found->second;
  if (info.has_title()) {
    // This info is complete, write to the database.
    AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(),
                info.title(), body);
    recent_changes_.Erase(found);
    return;
  }

  info.set_body(body);
}

bool TextDatabaseManager::AddPageData(const GURL& url,
                                      URLID url_id,
                                      VisitID visit_id,
                                      Time visit_time,
                                      const std::wstring& title,
                                      const std::wstring& body) {
  TextDatabase* db = GetDBForTime(visit_time, true);
  if (!db)
    return false;

  TimeTicks beginning_time = TimeTicks::Now();

  // First delete any recently-indexed data for this page. This will delete
  // anything in the main database, but we don't bother looking through the
  // archived database.
  VisitVector visits;
  visit_database_->GetVisitsForURL(url_id, &visits);
  size_t our_visit_row_index = visits.size();
  for (size_t i = 0; i < visits.size(); i++) {
    // While we're going trough all the visits, also find our row so we can
    // avoid another DB query.
    if (visits[i].visit_id == visit_id) {
      our_visit_row_index = i;
    } else if (visits[i].is_indexed) {
      visits[i].is_indexed = false;
      visit_database_->UpdateVisitRow(visits[i]);
      DeletePageData(visits[i].visit_time, url, NULL);
    }
  }

  if (visit_id) {
    // We're supposed to update the visit database.
    if (our_visit_row_index >= visits.size()) {
      NOTREACHED() << "We should always have found a visit when given an ID.";
      return false;
    }

    DCHECK(visit_time == visits[our_visit_row_index].visit_time);

    // Update the visit database to reference our addition.
    visits[our_visit_row_index].is_indexed = true;
    if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index]))
      return false;
  }

  // Now index the data.
  std::string url_str = URLDatabase::GURLToDatabaseURL(url);
  bool success = db->AddPageData(visit_time, url_str,
                                 ConvertStringForIndexer(title),
                                 ConvertStringForIndexer(body));

  HISTOGRAM_TIMES("History.AddFTSData",
                  TimeTicks::Now() - beginning_time);

  if (history_publisher_)
    history_publisher_->PublishPageContent(visit_time, url, title, body);

  return success;
}

void TextDatabaseManager::DeletePageData(Time time, const GURL& url,
                                         ChangeSet* change_set) {
  TextDatabase::DBIdent db_ident = TimeToID(time);

  // We want to open the database for writing, but only if it exists. To
  // achieve this, we check whether it exists by saying we're not going to
  // write to it (avoiding the autocreation code normally called when writing)
  // and then access it for writing only if it succeeds.
  TextDatabase* db = GetDB(db_ident, false);
  if (!db)
    return;
  db = GetDB(db_ident, true);

  if (change_set)
    change_set->Add(db_ident);

  db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url));
}

void TextDatabaseManager::DeleteFromUncommitted(Time begin, Time end) {
  // First find the beginning of the range to delete. Recall that the list
  // has the most recent item at the beginning. There won't normally be very
  // many items, so a brute-force search is fine.
  RecentChangeList::iterator cur = recent_changes_.begin();
  if (!end.is_null()) {
    // Walk from the beginning of the list backwards in time to find the newest
    // entry that should be deleted.
    while (cur != recent_changes_.end() && cur->second.visit_time() >= end)
      ++cur;
  }

  // Now delete all visits up to the oldest one we were supposed to delete.
  // Note that if begin is_null, it will be less than or equal to any other
  // time.
  while (cur != recent_changes_.end() && cur->second.visit_time() >= begin)
    cur = recent_changes_.Erase(cur);
}

void TextDatabaseManager::DeleteURLFromUncommitted(const GURL& url) {
  RecentChangeList::iterator found = recent_changes_.Peek(url);
  if (found == recent_changes_.end())
    return;  // We don't know about this page, give up.
  recent_changes_.Erase(found);
}

void TextDatabaseManager::DeleteAll() {
  DCHECK(transaction_nesting_ == 0) << "Calling deleteAll in a transaction.";

  InitDBList();

  // Close all open databases.
  db_cache_.ShrinkToSize(0);

  // Now go through and delete all the files.
  for (DBIdentSet::iterator i = present_databases_.begin();
       i != present_databases_.end(); ++i) {
    std::wstring file_name(dir_);
    file_util::AppendToPath(&file_name, TextDatabase::IDToFileName(*i));
    file_util::Delete(file_name, false);
  }
}

void TextDatabaseManager::OptimizeChangedDatabases(
    const ChangeSet& change_set) {
  for (ChangeSet::DBSet::const_iterator i =
           change_set.changed_databases_.begin();
       i != change_set.changed_databases_.end(); ++i) {
    // We want to open the database for writing, but only if it exists. To
    // achieve this, we check whether it exists by saying we're not going to
    // write to it (avoiding the autocreation code normally called when writing)
    // and then access it for writing only if it succeeds.
    TextDatabase* db = GetDB(*i, false);
    if (!db)
      continue;
    db = GetDB(*i, true);
    if (!db)
      continue;  // The file may have changed or something.
    db->Optimize();
  }
}

void TextDatabaseManager::GetTextMatches(
    const std::wstring& query,
    const QueryOptions& options,
    std::vector<TextDatabase::Match>* results,
    Time* first_time_searched) {
  results->clear();

  InitDBList();
  if (present_databases_.empty()) {
    // Nothing to search.
    *first_time_searched = options.begin_time;
    return;
  }

  // Get the query into the proper format for the individual DBs.
  std::wstring fts_query_wide;
  query_parser_.ParseQuery(query, &fts_query_wide);
  std::string fts_query = WideToUTF8(fts_query_wide);

  // Need a copy of the options so we can modify the max count for each call
  // to the individual databases.
  QueryOptions cur_options(options);

  // Compute the minimum and maximum values for the identifiers that could
  // encompass the input time range.
  TextDatabase::DBIdent min_ident = options.begin_time.is_null() ?
      *present_databases_.begin() :
      TimeToID(options.begin_time);
  TextDatabase::DBIdent max_ident = options.end_time.is_null() ?
      *present_databases_.rbegin() :
      TimeToID(options.end_time);

  // Iterate over the databases from the most recent backwards.
  bool checked_one = false;
  TextDatabase::URLSet found_urls;
  for (DBIdentSet::reverse_iterator i = present_databases_.rbegin();
       i != present_databases_.rend();
       ++i) {
    // TODO(brettw) allow canceling the query in the middle.
    // if (canceled_or_something)
    //   break;

    // This code is stupid, we just loop until we find the correct starting
    // time range rather than search in an intelligent way. Users will have a
    // few dozen files at most, so this should not be an issue.
    if (*i > max_ident)
      continue;  // Haven't gotten to the time range yet.
    if (*i < min_ident)
      break;  // Covered all the time range.

    TextDatabase* cur_db = GetDB(*i, false);
    if (!cur_db)
      continue;

    // Adjust the max count according to how many results we've already got.
    if (options.max_count) {
      cur_options.max_count = options.max_count -
          static_cast<int>(results->size());
    }

    // Since we are going backwards in time, it is always OK to pass the
    // current first_time_searched, since it will always be smaller than
    // any previous set.
    cur_db->GetTextMatches(fts_query, cur_options,
                           results, &found_urls, first_time_searched);
    checked_one = true;

    DCHECK(options.max_count == 0 ||
           static_cast<int>(results->size()) <= options.max_count);
    if (options.max_count &&
        static_cast<int>(results->size()) >= options.max_count)
      break;  // Got the max number of results.
  }

  // When there were no databases in the range, we need to fix up the min time.
  if (!checked_one)
    *first_time_searched = options.begin_time;
}

TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id,
                                         bool for_writing) {
  DBCache::iterator found_db = db_cache_.Get(id);
  if (found_db != db_cache_.end()) {
    if (transaction_nesting_ && for_writing &&
        open_transactions_.find(id) == open_transactions_.end()) {
      // If we currently have an open transaction, that database is not yet
      // part of the transaction, and the database will be written to, it needs
      // to be part of our transaction.
      found_db->second->BeginTransaction();
      open_transactions_.insert(id);
    }
    return found_db->second;
  }

  // Need to make the database.
  TextDatabase* new_db = new TextDatabase(dir_, id, for_writing);
  if (!new_db->Init()) {
    delete new_db;
    return NULL;
  }
  db_cache_.Put(id, new_db);
  present_databases_.insert(id);

  if (transaction_nesting_ && for_writing) {
    // If we currently have an open transaction and the new database will be
    // written to, it needs to be part of our transaction.
    new_db->BeginTransaction();
    open_transactions_.insert(id);
  }

  // When no transaction is open, allow this new one to kick out an old one.
  if (!transaction_nesting_)
    db_cache_.ShrinkToSize(kCacheDBSize);

  return new_db;
}

TextDatabase* TextDatabaseManager::GetDBForTime(Time time,
                                                bool create_if_necessary) {
  return GetDB(TimeToID(time), create_if_necessary);
}

void TextDatabaseManager::ScheduleFlushOldChanges() {
  factory_.RevokeAll();
  MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod(
          &TextDatabaseManager::FlushOldChanges),
      kExpirationSec * Time::kMillisecondsPerSecond);
}

void TextDatabaseManager::FlushOldChanges() {
  FlushOldChangesForTime(TimeTicks::Now());
}

void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) {
  // The end of the list is the oldest, so we just start from there committing
  // things until we get something too new.
  RecentChangeList::reverse_iterator i = recent_changes_.rbegin();
  while (i != recent_changes_.rend() && i->second.Expired(now)) {
    AddPageData(i->first, i->second.url_id(), i->second.visit_id(),
                i->second.visit_time(), i->second.title(), i->second.body());
    i = recent_changes_.Erase(i);
  }

  ScheduleFlushOldChanges();
}

}  // namespace history