SafeBrowsingStore storage abstraction for SafeBrowsing database.

First bit of refactoring safe-browsing to use a flat file format. SafeBrowsingStore implements just what is needed for SafeBrowsingDatabase using straight-forward read/modify/write code. There will be a follow-on change to layer in on-the-fly format migration and integrate with SafeBrowsingDatabase. This CL only adds new classes and tests for same. BUG=none TEST=none Review URL: http://codereview.chromium.org/545053 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@36615 0039d316-1c4b-4281-b951-d872f2087c98
author: shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-20 06:29:28 +0000
committer: shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-20 06:29:28 +0000
commit: 5332aa894ad01d22aeb01107db6d82ccee648604 (patch)
tree: 2c5186162c3a0cff848751dad8f0dba472ef5b21 /chrome/browser/safe_browsing/safe_browsing_store.h
parent: 86fdd8723d9f3e185eb946781ed160d4ec122fff (diff)
download: chromium_src-5332aa894ad01d22aeb01107db6d82ccee648604.zip
chromium_src-5332aa894ad01d22aeb01107db6d82ccee648604.tar.gz
chromium_src-5332aa894ad01d22aeb01107db6d82ccee648604.tar.bz2
1 files changed, 211 insertions, 0 deletions
diff --git a/chrome/browser/safe_browsing/safe_browsing_store.h b/chrome/browser/safe_browsing/safe_browsing_store.h
new file mode 100644
index 0000000..0d13e88
--- /dev/null
+++ b/chrome/browser/safe_browsing/safe_browsing_store.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
+#define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
+
+#include <set>
+#include <vector>
+
+#include "base/basictypes.h"
+#include "base/file_path.h"
+#include "base/task.h"
+#include "base/time.h"
+#include "chrome/browser/safe_browsing/safe_browsing_util.h"
+
+// SafeBrowsingStore provides a storage abstraction for the
+// safe-browsing data used to build the bloom filter.  The items
+// stored are:
+//   The set of add and sub chunks seen.
+//   List of SBAddPrefix (chunk_id and SBPrefix).
+//   List of SBSubPrefix (chunk_id and the target SBAddPrefix).
+//   List of SBAddFullHash (SBAddPrefix, time received and an SBFullHash).
+//   List of SBSubFullHash (chunk_id, target SBAddPrefix, and an SBFullHash).
+//
+// The store is geared towards updating the data, not runtime access
+// to the data (that is handled by SafeBrowsingDatabase).  Updates are
+// handled similarly to a SQL transaction cycle, with the new data being
+// returned from FinishUpdate() (the COMMIT).  Data is not persistent
+// until FinishUpdate() returns successfully.
+//
+// FinishUpdate() also handles dropping items whose chunk has been
+// deleted, and netting out the add/sub lists (when a sub matches an
+// add, both are dropped).
+
+// GetAddChunkId() and GetAddPrefix() are exposed so that these items
+// can be generically compared with each other by SBAddPrefixLess() and
+// SBAddPrefixHashLess(); the latter reads |full_hash| directly.
+
+struct SBAddPrefix {
+  int32 chunk_id;
+  SBPrefix prefix;
+
+  SBAddPrefix(int32 id, SBPrefix p) : chunk_id(id), prefix(p) {}
+
+  int32 GetAddChunkId() const { return chunk_id; }
+  SBPrefix GetAddPrefix() const { return prefix; }
+};
+
+struct SBSubPrefix {
+  int32 chunk_id;
+  SBAddPrefix add_prefix;
+
+  SBSubPrefix(int32 id, int32 add_id, int prefix)
+      : chunk_id(id), add_prefix(add_id, prefix) {}
+
+  int32 GetAddChunkId() const { return add_prefix.chunk_id; }
+  SBPrefix GetAddPrefix() const { return add_prefix.prefix; }
+};
+
+// TODO(shess): The full_hash includes the prefix, so the prefix could
+// be dropped.  But SBAddPrefix is convenient for comparing across
+// different structs, and there aren't many full hashes.  Hmm.
+struct SBAddFullHash {
+  SBAddPrefix add_prefix;
+  base::Time received;
+  SBFullHash full_hash;
+
+  SBAddFullHash(int32 id, SBPrefix p, base::Time r, SBFullHash h)
+      : add_prefix(id, p), received(r), full_hash(h) {}
+
+  int32 GetAddChunkId() const { return add_prefix.chunk_id; }
+  SBPrefix GetAddPrefix() const { return add_prefix.prefix; }
+};
+
+struct SBSubFullHash {
+  int32 chunk_id;
+  SBAddPrefix add_prefix;
+  SBFullHash full_hash;
+
+  SBSubFullHash(int32 id, int32 add_id, SBPrefix p, SBFullHash h)
+      : chunk_id(id), add_prefix(add_id, p), full_hash(h) {}
+
+  int32 GetAddChunkId() const { return add_prefix.chunk_id; }
+  SBPrefix GetAddPrefix() const { return add_prefix.prefix; }
+};
+
+// Determine less-than based on add chunk and prefix.
+template <class T, class U>
+bool SBAddPrefixLess(const T& a, const U& b) {
+  if (a.GetAddChunkId() != b.GetAddChunkId())
+    return a.GetAddChunkId() < b.GetAddChunkId();
+
+  return a.GetAddPrefix() < b.GetAddPrefix();
+}
+
+// Determine less-than based on add chunk, prefix, and full hash.
+// Prefix can compare differently than hash due to byte ordering,
+// so it must take precedence.
+template <class T, class U>
+bool SBAddPrefixHashLess(const T& a, const U& b) {
+  if (SBAddPrefixLess(a, b))
+    return true;
+
+  if (SBAddPrefixLess(b, a))
+    return false;
+
+  return memcmp(a.full_hash.full_hash, b.full_hash.full_hash,
+                sizeof(a.full_hash.full_hash)) < 0;
+}
+
+// Process the lists for subs which knock out adds.  For any item in
+// |sub_prefixes| which has a match in |add_prefixes|, knock out the
+// matched items from all vectors.
+//
+// TODO(shess): Since the prefixes are uniformly-distributed hashes,
+// there aren't many ways to organize the inputs for efficient
+// processing.  For this reason, the vectors are sorted and processed
+// in parallel.  At this time this code does the sorting internally,
+// but it might make sense to make sorting an API requirement so that
+// the storage can optimize for it.
+//
+// TODO(shess): The original code did not process |sub_full_hashes|
+// for matches in |add_full_hashes|, so this code doesn't, either.  I
+// think this is probably a bug.
+void SBProcessSubs(std::vector<SBAddPrefix>* add_prefixes,
+                   std::vector<SBSubPrefix>* sub_prefixes,
+                   std::vector<SBAddFullHash>* add_full_hashes,
+                   std::vector<SBSubFullHash>* sub_full_hashes);
+
+// TODO(shess): This uses int32 rather than int because it's writing
+// specifically-sized items to files.  SBPrefix should likewise be
+// explicitly sized.
+
+// Abstract interface for storing data.
+class SafeBrowsingStore {
+ public:
+  SafeBrowsingStore() {}
+  virtual ~SafeBrowsingStore() {}
+
+  // Sets up the information for later use, but does not necessarily
+  // check whether the underlying file exists, or is valid.  If
+  // |corruption_callback| is non-NULL it will be called if corruption
+  // is detected, which could happen as part of any call other than
+  // Delete().  The appropriate action is to use Delete() to clear the
+  // store.
+  virtual void Init(const FilePath& filename,
+                    Callback0::Type* corruption_callback) = 0;
+
+  // Deletes the files which back the store, returning true if
+  // successful.
+  virtual bool Delete() = 0;
+
+  // Start an update.  None of the following methods should be called
+  // unless this returns true.  If this returns true, the update
+  // should be terminated by FinishUpdate() or CancelUpdate().
+  virtual bool BeginUpdate() = 0;
+
+  // Start a chunk of data.  None of the methods through FinishChunk()
+  // should be called unless this returns true.
+  // TODO(shess): Would it make sense for this to accept |chunk_id|?
+  // Possibly not, because of possible confusion between sub_chunk_id
+  // and add_chunk_id.
+  virtual bool BeginChunk() = 0;
+
+  virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) = 0;
+  virtual bool WriteAddHash(int32 chunk_id, SBPrefix prefix,
+                            base::Time receive_time, SBFullHash full_hash) = 0;
+  virtual bool WriteSubPrefix(int32 chunk_id,
+                              int32 add_chunk_id, SBPrefix prefix) = 0;
+  virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id,
+                            SBPrefix prefix, SBFullHash full_hash) = 0;
+
+  // Collect the chunk data and preferably store it on disk to
+  // release memory.  Should not modify the data in-place.
+  virtual bool FinishChunk() = 0;
+
+  // Track the chunks which have been seen.
+  virtual void SetAddChunk(int32 chunk_id) = 0;
+  virtual bool CheckAddChunk(int32 chunk_id) = 0;
+  virtual void GetAddChunks(std::vector<int32>* out) = 0;
+  virtual void SetSubChunk(int32 chunk_id) = 0;
+  virtual bool CheckSubChunk(int32 chunk_id) = 0;
+  virtual void GetSubChunks(std::vector<int32>* out) = 0;
+
+  // Delete the indicated chunk_id.  The chunk will continue to be
+  // visible until the end of the transaction.
+  virtual void DeleteAddChunk(int32 chunk_id) = 0;
+  virtual void DeleteSubChunk(int32 chunk_id) = 0;
+
+  // Pass the collected chunks through SBProcessSubs() and commit to
+  // permanent storage.  The resulting add prefixes and hashes will be
+  // stored in |add_prefixes_result| and |add_full_hashes_result|.
+  // |pending_adds| is the set of full hashes which have been received
+  // since the previous update, and is provided as a convenience
+  // (could be written via WriteAddHash(), but that would flush the
+  // chunk to disk).
+  virtual bool FinishUpdate(
+      const std::vector<SBAddFullHash>& pending_adds,
+      std::vector<SBAddPrefix>* add_prefixes_result,
+      std::vector<SBAddFullHash>* add_full_hashes_result) = 0;
+
+  // Cancel the update in process and remove any temporary disk
+  // storage, leaving the original data unmodified.
+  virtual bool CancelUpdate() = 0;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStore);
+};
+
+#endif  // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_H_
author	shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-01-20 06:29:28 +0000
committer	shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-01-20 06:29:28 +0000
commit	5332aa894ad01d22aeb01107db6d82ccee648604 (patch)
tree	2c5186162c3a0cff848751dad8f0dba472ef5b21 /chrome/browser/safe_browsing/safe_browsing_store.h
parent	86fdd8723d9f3e185eb946781ed160d4ec122fff (diff)
download	chromium_src-5332aa894ad01d22aeb01107db6d82ccee648604.zip chromium_src-5332aa894ad01d22aeb01107db6d82ccee648604.tar.gz chromium_src-5332aa894ad01d22aeb01107db6d82ccee648604.tar.bz2