author     shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2014-03-27 05:27:17 +0000
committer  shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2014-03-27 05:27:17 +0000
commit     d3dd0715afc50f2e999f5a28478e783519170a29 (patch)
tree       ac638131304a9a2524d10e08dee9b7929e55b239 /chrome/browser/safe_browsing
parent     f42a5a827ca1c591f1f9e918fce5ce6d485d0e3d (diff)
Refactor safe-browsing file storage to allow incremental processing.
Previously the entire file was read into memory, processed, and written
out to a new file. Modify the format to shard the data by prefix, so that
only a portion of the data needs to be held in memory at any given time.
The sharding is dynamic, based on a target memory footprint (currently
100 KB), so small files have fewer shards than large files.
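
As an illustrative sketch only (not part of this change), the stride selection
can be reduced to a few lines. The constants and the halving behavior follow
kUpdateStorageBytes, kMinShardStride, and kMaxShardStride in the new
safe_browsing_store_file.cc; the standalone function name and form here are
hypothetical:

```cpp
#include <stdint.h>

// Sketch of choosing the output shard stride from the estimated file size.
// Each shard should need less than ~100 KB of memory during an update, and
// the stride never drops below 1<<24 to avoid excessive sharding.
const int64_t kUpdateStorageBytes = 100 * 1024;
const uint32_t kMinShardStride = 1 << 24;
const uint64_t kMaxShardStride = 1ULL << 32;  // Entire SBPrefix space.

uint64_t ComputeOutStride(int64_t estimated_file_size) {
  uint64_t out_stride = kMaxShardStride;  // Start with a single shard.
  int64_t shard_size = estimated_file_size;
  // Halve the prefix range covered per shard until one shard's worth of
  // data is expected to fit the target footprint.
  while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
    out_stride >>= 1;
    shard_size >>= 1;
  }
  return out_stride;
}
```

Keeping the stride a power of two lets shard boundaries stay aligned when an
input file written with one stride is re-written with another.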
Additionally, add a checksum over the header and chunk table-of-contents
area to prevent corruption from causing broken update requests.
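
The header checksum relies on digesting the running MD5 context without
finalizing it, so the same context can keep accumulating the shard data for
the whole-file checksum. A minimal sketch of that helper (it mirrors
MD5IntermediateDigest() added in safe_browsing_store_file.cc and assumes
Chromium's base/md5.h):

```cpp
#include <string.h>

#include "base/md5.h"

// Digest everything folded into |context| so far, without finalizing
// |context| itself: base::MD5Final() is destructive, so run it on a copy.
void IntermediateDigest(base::MD5Digest* digest,
                        const base::MD5Context& context) {
  base::MD5Context temp_context;
  memcpy(&temp_context, &context, sizeof(temp_context));
  base::MD5Final(digest, &temp_context);
}
```

The writer emits this intermediate digest immediately after the header and the
chunk table-of-contents, keeps folding the shard data into the same context,
and writes the full-file digest at the end; the reader verifies both digests
the same way.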
BUG=120219,351448
Review URL: https://codereview.chromium.org/208583005
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@259791 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/safe_browsing')
6 files changed, 1015 insertions, 292 deletions
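
For orientation before the diff, a hedged, standalone sketch of how a v8 store
is then walked shard by shard during reads and updates (simplified from
ReadDbStateHelper() and the DoUpdate() loop below; the |read_shard| callback is
a hypothetical stand-in for reading one ShardHeader plus its prefix and hash
arrays):

```cpp
#include <stdint.h>

#include <functional>

typedef uint32_t SBPrefix;
const uint64_t kMaxShardStride = 1ULL << 32;
const SBPrefix kMaxSBPrefix = ~0u;

// Walk shards covering [in_min, in_min + in_stride) until the 32-bit prefix
// space wraps around. A shard_stride of 0 in the header means the entire
// prefix space is stored as a single shard.
bool WalkShards(uint64_t shard_stride,
                const std::function<bool(uint64_t min, uint64_t stride)>&
                    read_shard) {
  uint64_t in_stride = shard_stride ? shard_stride : kMaxShardStride;
  uint64_t in_min = 0;
  do {
    if (!read_shard(in_min, in_stride))
      return false;
    in_min += in_stride;
  } while (in_min <= kMaxSBPrefix);  // Stop once past the last prefix.
  return true;
}
```

Small files use a stride of 0 (one shard for the whole space), so they avoid
per-shard overhead while large files are split into multiple passes.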
diff --git a/chrome/browser/safe_browsing/safe_browsing_store.cc b/chrome/browser/safe_browsing/safe_browsing_store.cc index 2638e58..a942b9a 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store.cc +++ b/chrome/browser/safe_browsing/safe_browsing_store.cc @@ -10,6 +10,17 @@ namespace { +// Return |true| if the range is sorted by the given comparator. +template <typename CTI, typename LESS> +bool sorted(CTI beg, CTI end, LESS less) { + while ((end - beg) > 2) { + CTI n = beg++; + if (less(*beg, *n)) + return false; + } + return true; +} + // Find items matching between |subs| and |adds|, and remove them, // recording the item from |adds| in |adds_removed|. To minimize // copies, the inputs are processing in parallel, so |subs| and |adds| @@ -133,15 +144,15 @@ void SBProcessSubs(SBAddPrefixes* add_prefixes, // to qualify things. It becomes very arbitrary, though, and less // clear how things are working. - // Sort the inputs by the SBAddPrefix bits. - std::sort(add_prefixes->begin(), add_prefixes->end(), - SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); - std::sort(sub_prefixes->begin(), sub_prefixes->end(), - SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); - std::sort(add_full_hashes->begin(), add_full_hashes->end(), - SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); - std::sort(sub_full_hashes->begin(), sub_full_hashes->end(), - SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); + // Make sure things are sorted appropriately. + DCHECK(sorted(add_prefixes->begin(), add_prefixes->end(), + SBAddPrefixLess<SBAddPrefix,SBAddPrefix>)); + DCHECK(sorted(sub_prefixes->begin(), sub_prefixes->end(), + SBAddPrefixLess<SBSubPrefix,SBSubPrefix>)); + DCHECK(sorted(add_full_hashes->begin(), add_full_hashes->end(), + SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>)); + DCHECK(sorted(sub_full_hashes->begin(), sub_full_hashes->end(), + SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>)); // Factor out the prefix subs. SBAddPrefixes removed_adds; diff --git a/chrome/browser/safe_browsing/safe_browsing_store.h b/chrome/browser/safe_browsing/safe_browsing_store.h index 76cf28a..b381743 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store.h +++ b/chrome/browser/safe_browsing/safe_browsing_store.h @@ -53,6 +53,8 @@ struct SBAddPrefix { SBPrefix GetAddPrefix() const { return prefix; } }; +// TODO(shess): Measure the performance impact of switching this back to +// std::vector<> once the v8 file format dominates. Also SBSubPrefixes. typedef std::deque<SBAddPrefix> SBAddPrefixes; struct SBSubPrefix { @@ -134,12 +136,7 @@ bool SBAddPrefixHashLess(const T& a, const U& b) { // matched items from all vectors. Additionally remove items from // deleted chunks. // -// TODO(shess): Since the prefixes are uniformly-distributed hashes, -// there aren't many ways to organize the inputs for efficient -// processing. For this reason, the vectors are sorted and processed -// in parallel. At this time this code does the sorting internally, -// but it might make sense to make sorting an API requirement so that -// the storage can optimize for it. +// The inputs must be sorted by SBAddPrefixLess or SBAddPrefixHashLess. void SBProcessSubs(SBAddPrefixes* add_prefixes, SBSubPrefixes* sub_prefixes, std::vector<SBAddFullHash>* add_full_hashes, @@ -147,10 +144,6 @@ void SBProcessSubs(SBAddPrefixes* add_prefixes, const base::hash_set<int32>& add_chunks_deleted, const base::hash_set<int32>& sub_chunks_deleted); -// TODO(shess): This uses int32 rather than int because it's writing -// specifically-sized items to files. 
SBPrefix should likewise be -// explicitly sized. - // Abstract interface for storing data. class SafeBrowsingStore { public: diff --git a/chrome/browser/safe_browsing/safe_browsing_store_file.cc b/chrome/browser/safe_browsing/safe_browsing_store_file.cc index a879096..b58d961 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_file.cc +++ b/chrome/browser/safe_browsing/safe_browsing_store_file.cc @@ -14,22 +14,68 @@ namespace { // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so // that byte-order changes force corruption. const int32 kFileMagic = 0x600D71FE; -const int32 kFileVersion = 7; // SQLite storage was 6... + +// Version history: +// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite) +// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28 +// Version 8: ????????/r????? by shess@chromium.org on 2014-03-?? +const int32 kFileVersion = 8; + +// ReadAndVerifyHeader() returns this in case of error. +const int32 kInvalidVersion = -1; // Header at the front of the main database file. -struct FileHeader { +struct FileHeaderV7 { int32 magic, version; uint32 add_chunk_count, sub_chunk_count; uint32 add_prefix_count, sub_prefix_count; uint32 add_hash_count, sub_hash_count; }; +// Starting with version 8, the storage is sorted and can be sharded to allow +// updates to be done with lower memory requirements. Newly written files will +// be sharded to need less than this amount of memory during update. Larger +// values are preferred to minimize looping overhead during processing. +const int64 kUpdateStorageBytes = 100 * 1024; + +// Prevent excessive sharding by setting a lower limit on the shard stride. +// Smaller values should work fine, but very small values will probably lead to +// poor performance. Shard stride is indirectly related to +// |kUpdateStorageBytes|, setting that very small will bump against this. +const uint32 kMinShardStride = 1 << 24; + +// Strides over the entire SBPrefix space. +const uint64 kMaxShardStride = GG_LONGLONG(1u) << 32; + +// Maximum SBPrefix value. +const SBPrefix kMaxSBPrefix = ~0; + +// Header at the front of the main database file. +struct FileHeaderV8 { + int32 magic, version; + uint32 add_chunk_count, sub_chunk_count; + uint32 shard_stride; + // TODO(shess): Is this where 64-bit will bite me? Perhaps write a + // specialized read/write? +}; + +union FileHeader { + struct FileHeaderV7 v7; + struct FileHeaderV8 v8; +}; + // Header for each chunk in the chunk-accumulation file. struct ChunkHeader { uint32 add_prefix_count, sub_prefix_count; uint32 add_hash_count, sub_hash_count; }; +// Header for each shard of data in the main database file. +struct ShardHeader { + uint32 add_prefix_count, sub_prefix_count; + uint32 add_hash_count, sub_hash_count; +}; + // Rewind the file. Using fseek(2) because rewind(3) errors are // weird. bool FileRewind(FILE* fp) { @@ -38,18 +84,6 @@ bool FileRewind(FILE* fp) { return rv == 0; } -// Move file read pointer forward by |bytes| relative to current position. -bool FileSkip(size_t bytes, FILE* fp) { - // Although fseek takes negative values, for this case, we only want - // to skip forward. - DCHECK(static_cast<long>(bytes) >= 0); - if (static_cast<long>(bytes) < 0) - return false; - int rv = fseek(fp, static_cast<long>(bytes), SEEK_CUR); - DCHECK_EQ(rv, 0); - return rv == 0; -} - // Read from |fp| into |item|, and fold the input data into the // checksum in |context|, if non-NULL. Return true on success. 
template <class T> @@ -104,22 +138,26 @@ bool ReadToContainer(CT* values, size_t count, FILE* fp, return true; } -// Write all of |values| to |fp|, and fold the data into the checksum -// in |context|, if non-NULL. Returns true on succsess. -template <typename CT> -bool WriteContainer(const CT& values, FILE* fp, - base::MD5Context* context) { - if (values.empty()) - return true; - - for (typename CT::const_iterator iter = values.begin(); - iter != values.end(); ++iter) { +// Write values between |beg| and |end| to |fp|, and fold the data into the +// checksum in |context|, if non-NULL. Returns true if all items successful. +template <typename CTI> +bool WriteRange(const CTI& beg, const CTI& end, + FILE* fp, base::MD5Context* context) { + for (CTI iter = beg; iter != end; ++iter) { if (!WriteItem(*iter, fp, context)) return false; } return true; } +// Write all of |values| to |fp|, and fold the data into the checksum +// in |context|, if non-NULL. Returns true if all items successful. +template <typename CT> +bool WriteContainer(const CT& values, FILE* fp, + base::MD5Context* context) { + return WriteRange(values.begin(), values.end(), fp, context); +} + // Delete the chunks in |deleted| from |chunks|. void DeleteChunksFromSet(const base::hash_set<int32>& deleted, std::set<int32>* chunks) { @@ -131,16 +169,37 @@ void DeleteChunksFromSet(const base::hash_set<int32>& deleted, } } +// base::MD5Final() modifies |context| in generating |digest|. This wrapper +// generates an intermediate digest without modifying the context. +void MD5IntermediateDigest(base::MD5Digest* digest, base::MD5Context* context) { + base::MD5Context temp_context; + memcpy(&temp_context, context, sizeof(temp_context)); + base::MD5Final(digest, &temp_context); +} + +bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) { + base::MD5Digest calculated_digest; + MD5IntermediateDigest(&calculated_digest, context); + + base::MD5Digest file_digest; + if (!ReadItem(&file_digest, fp, context)) + return false; + + return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0; +} + // Sanity-check the header against the file's size to make sure our // vectors aren't gigantic. This doubles as a cheap way to detect // corruption without having to checksum the entire file. -bool FileHeaderSanityCheck(const base::FilePath& filename, - const FileHeader& header) { +bool FileHeaderV7SanityCheck(const base::FilePath& filename, + const FileHeaderV7& header) { + DCHECK_EQ(header.version, 7); + int64 size = 0; if (!base::GetFileSize(filename, &size)) return false; - int64 expected_size = sizeof(FileHeader); + int64 expected_size = sizeof(FileHeaderV7); expected_size += header.add_chunk_count * sizeof(int32); expected_size += header.sub_chunk_count * sizeof(int32); expected_size += header.add_prefix_count * sizeof(SBAddPrefix); @@ -154,21 +213,387 @@ bool FileHeaderSanityCheck(const base::FilePath& filename, return true; } -// This a helper function that reads header to |header|. Returns true if the -// magic number is correct and santiy check passes. -bool ReadAndVerifyHeader(const base::FilePath& filename, - FILE* fp, - FileHeader* header, - base::MD5Context* context) { - if (!ReadItem(header, fp, context)) +// Helper function to read the file header and chunk TOC. Rewinds |fp| and +// initializes |context|. The header is left in |header|, with the version +// returned. kInvalidVersion is returned for sanity check or checksum failure. 
+int ReadAndVerifyHeader(const base::FilePath& filename, + FileHeader* header, + std::set<int32>* add_chunks, + std::set<int32>* sub_chunks, + FILE* fp, + base::MD5Context* context) { + DCHECK(header); + DCHECK(add_chunks); + DCHECK(sub_chunks); + DCHECK(fp); + DCHECK(context); + + int version = kInvalidVersion; + + base::MD5Init(context); + if (!FileRewind(fp)) + return kInvalidVersion; + if (!ReadItem(&header->v8, fp, context)) + return kInvalidVersion; + if (header->v8.magic != kFileMagic) + return kInvalidVersion; + + size_t add_chunks_count = 0; + size_t sub_chunks_count = 0; + + if (header->v8.version == 7) { + version = 7; + + // Reset the context and re-read the v7 header. + base::MD5Init(context); + if (!FileRewind(fp)) + return kInvalidVersion; + if (!ReadItem(&header->v7, fp, context)) + return kInvalidVersion; + if (header->v7.magic != kFileMagic || header->v7.version != 7) + return kInvalidVersion; + if (!FileHeaderV7SanityCheck(filename, header->v7)) + return kInvalidVersion; + + add_chunks_count = header->v7.add_chunk_count; + sub_chunks_count = header->v7.sub_chunk_count; + } else if (header->v8.version == kFileVersion) { + version = 8; + add_chunks_count = header->v8.add_chunk_count; + sub_chunks_count = header->v8.sub_chunk_count; + } else { + return kInvalidVersion; + } + + if (!ReadToContainer(add_chunks, add_chunks_count, fp, context) || + !ReadToContainer(sub_chunks, sub_chunks_count, fp, context)) { + return kInvalidVersion; + } + + // v8 includes a checksum to validate the header. + if (version > 7 && !ReadAndVerifyChecksum(fp, context)) + return kInvalidVersion; + + return version; +} + +// Helper function to write out the initial header and chunks-contained data. +// Rewinds |fp|, initializes |context|, then writes a file header and +// |add_chunks| and |sub_chunks|. +bool WriteHeader(uint32 out_stride, + const std::set<int32>& add_chunks, + const std::set<int32>& sub_chunks, + FILE* fp, + base::MD5Context* context) { + if (!FileRewind(fp)) + return false; + + base::MD5Init(context); + FileHeaderV8 header; + header.magic = kFileMagic; + header.version = kFileVersion; + header.add_chunk_count = add_chunks.size(); + header.sub_chunk_count = sub_chunks.size(); + header.shard_stride = out_stride; + if (!WriteItem(header, fp, context)) return false; - if (header->magic != kFileMagic || header->version != kFileVersion) + + if (!WriteContainer(add_chunks, fp, context) || + !WriteContainer(sub_chunks, fp, context)) return false; - if (!FileHeaderSanityCheck(filename, *header)) + + // Write out the header digest. + base::MD5Digest header_digest; + MD5IntermediateDigest(&header_digest, context); + if (!WriteItem(header_digest, fp, context)) return false; + + return true; +} + +// Return |true| if the range is sorted by the given comparator. +template <typename CTI, typename LESS> +bool sorted(CTI beg, CTI end, LESS less) { + while ((end - beg) > 2) { + CTI n = beg++; + DCHECK(!less(*beg, *n)); + if (less(*beg, *n)) + return false; + } return true; } +// Merge |beg|..|end| into |container|. Both should be sorted by the given +// comparator, and the range iterators should not be derived from |container|. +// Differs from std::inplace_merge() in that additional memory is not required +// for linear performance. +template <typename CT, typename CTI, typename COMP> +void container_merge(CT* container, CTI beg, CTI end, const COMP& less) { + DCHECK(sorted(container->begin(), container->end(), less)); + DCHECK(sorted(beg, end, less)); + + // Size the container to fit the results. 
+ const size_t c_size = container->size(); + container->resize(c_size + (end - beg)); + + // |c_end| points to the original endpoint, while |c_out| points to the + // endpoint that will scan from end to beginning while merging. + typename CT::iterator c_end = container->begin() + c_size; + typename CT::iterator c_out = container->end(); + + // While both inputs have data, move the greater to |c_out|. + while (c_end != container->begin() && end != beg) { + if (less(*(c_end - 1), *(end - 1))) { + *(--c_out) = *(--end); + } else { + *(--c_out) = *(--c_end); + } + } + + // Copy any data remaining in the new range. + if (end != beg) { + // The original container data has been fully shifted. + DCHECK(c_end == container->begin()); + + // There is exactly the correct amount of space left. + DCHECK_EQ(c_out - c_end, end - beg); + + std::copy(beg, end, container->begin()); + } + + DCHECK(sorted(container->begin(), container->end(), less)); +} + +// Collection of iterators used while stepping through StateInternal (see +// below). +class StateInternalPos { + public: + StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter, + SBSubPrefixes::iterator sub_prefixes_iter, + std::vector<SBAddFullHash>::iterator add_hashes_iter, + std::vector<SBSubFullHash>::iterator sub_hashes_iter) + : add_prefixes_iter_(add_prefixes_iter), + sub_prefixes_iter_(sub_prefixes_iter), + add_hashes_iter_(add_hashes_iter), + sub_hashes_iter_(sub_hashes_iter) { + } + + SBAddPrefixes::iterator add_prefixes_iter_; + SBSubPrefixes::iterator sub_prefixes_iter_; + std::vector<SBAddFullHash>::iterator add_hashes_iter_; + std::vector<SBSubFullHash>::iterator sub_hashes_iter_; +}; + +// Helper to find the next shard boundary. +template <class T> +bool prefix_bounder(SBPrefix val, const T& elt) { + return val < elt.GetAddPrefix(); +} + +// Container for partial database state. Includes add/sub prefixes/hashes, plus +// aggregate operations on same. +class StateInternal { + public: + explicit StateInternal(const std::vector<SBAddFullHash>& pending_adds) + : add_full_hashes_(pending_adds.begin(), pending_adds.end()) { + } + + StateInternal() {} + + // Append indicated amount of data from |fp|. + bool AppendData(size_t add_prefix_count, size_t sub_prefix_count, + size_t add_hash_count, size_t sub_hash_count, + FILE* fp, base::MD5Context* context) { + return + ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) && + ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) && + ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) && + ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context); + } + + void ClearData() { + add_prefixes_.clear(); + sub_prefixes_.clear(); + add_full_hashes_.clear(); + sub_full_hashes_.clear(); + } + + // Merge data from |beg|..|end| into receiver's state, then process the state. + // The current state and the range given should corrospond to the same sorted + // shard of data from different sources. |add_del_cache| and |sub_del_cache| + // indicate the chunk ids which should be deleted during processing (see + // SBProcessSubs). 
+ void MergeDataAndProcess(const StateInternalPos& beg, + const StateInternalPos& end, + const base::hash_set<int32>& add_del_cache, + const base::hash_set<int32>& sub_del_cache) { + container_merge(&add_prefixes_, + beg.add_prefixes_iter_, + end.add_prefixes_iter_, + SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); + + container_merge(&sub_prefixes_, + beg.sub_prefixes_iter_, + end.sub_prefixes_iter_, + SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); + + container_merge(&add_full_hashes_, + beg.add_hashes_iter_, + end.add_hashes_iter_, + SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); + + container_merge(&sub_full_hashes_, + beg.sub_hashes_iter_, + end.sub_hashes_iter_, + SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>); + + SBProcessSubs(&add_prefixes_, &sub_prefixes_, + &add_full_hashes_, &sub_full_hashes_, + add_del_cache, sub_del_cache); + } + + // Sort the data appropriately for the sharding, merging, and processing + // operations. + void SortData() { + std::sort(add_prefixes_.begin(), add_prefixes_.end(), + SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); + std::sort(sub_prefixes_.begin(), sub_prefixes_.end(), + SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); + std::sort(add_full_hashes_.begin(), add_full_hashes_.end(), + SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); + std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(), + SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); + } + + // Iterator from the beginning of the state's data. + StateInternalPos StateBegin() { + return StateInternalPos(add_prefixes_.begin(), + sub_prefixes_.begin(), + add_full_hashes_.begin(), + sub_full_hashes_.begin()); + } + + // An iterator pointing just after the last possible element of the shard + // indicated by |shard_max|. Used to step through the state by shard. + // TODO(shess): Verify whether binary search really improves over linear. + // Merging or writing will immediately touch all of these elements. + StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) { + return StateInternalPos( + std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(), + shard_max, prefix_bounder<SBAddPrefix>), + std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(), + shard_max, prefix_bounder<SBSubPrefix>), + std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(), + shard_max, prefix_bounder<SBAddFullHash>), + std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(), + shard_max, prefix_bounder<SBSubFullHash>)); + } + + // Write a shard header and data for the shard starting at |beg| and ending at + // the element before |end|. 
+ bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end, + FILE* fp, base::MD5Context* context) { + ShardHeader shard_header; + shard_header.add_prefix_count = + end.add_prefixes_iter_ - beg.add_prefixes_iter_; + shard_header.sub_prefix_count = + end.sub_prefixes_iter_ - beg.sub_prefixes_iter_; + shard_header.add_hash_count = + end.add_hashes_iter_ - beg.add_hashes_iter_; + shard_header.sub_hash_count = + end.sub_hashes_iter_ - beg.sub_hashes_iter_; + + return + WriteItem(shard_header, fp, context) && + WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_, + fp, context) && + WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_, + fp, context) && + WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_, + fp, context) && + WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_, + fp, context); + } + + SBAddPrefixes add_prefixes_; + SBSubPrefixes sub_prefixes_; + std::vector<SBAddFullHash> add_full_hashes_; + std::vector<SBSubFullHash> sub_full_hashes_; +}; + +// True if |val| is an even power of two. +template <typename T> +bool IsPowerOfTwo(const T& val) { + return val && (val & (val - 1)) == 0; +} + +// Helper to read the entire database state, used by GetAddPrefixes() and +// GetAddFullHashes(). Those functions are generally used only for smaller +// files. Returns false in case of errors reading the data. +bool ReadDbStateHelper(const base::FilePath& filename, + StateInternal* db_state) { + file_util::ScopedFILE file(base::OpenFile(filename, "rb")); + if (file.get() == NULL) + return false; + + std::set<int32> add_chunks; + std::set<int32> sub_chunks; + + base::MD5Context context; + FileHeader header; + const int version = + ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks, + file.get(), &context); + if (version == kInvalidVersion) + return false; + + if (version == 7) { + if (!db_state->AppendData(header.v7.add_prefix_count, + header.v7.sub_prefix_count, + header.v7.add_hash_count, + header.v7.sub_hash_count, + file.get(), &context)) { + return false; + } + + // v7 data was not stored sorted. + db_state->SortData(); + } else { + // Read until the shard start overflows, always at least one pass. + uint64 in_min = 0; + uint64 in_stride = header.v8.shard_stride; + if (!in_stride) + in_stride = kMaxShardStride; + if (!IsPowerOfTwo(in_stride)) + return false; + + do { + ShardHeader shard_header; + if (!ReadItem(&shard_header, file.get(), &context)) + return false; + + if (!db_state->AppendData(shard_header.add_prefix_count, + shard_header.sub_prefix_count, + shard_header.add_hash_count, + shard_header.sub_hash_count, + file.get(), &context)) { + return false; + } + + in_min += in_stride; + } while (in_min <= kMaxSBPrefix); + } + + if (!ReadAndVerifyChecksum(file.get(), &context)) + return false; + + int64 size = 0; + if (!base::GetFileSize(filename, &size)) + return false; + + return static_cast<int64>(ftell(file.get())) == size; +} + } // namespace // static @@ -258,15 +683,7 @@ bool SafeBrowsingStoreFile::CheckValidity() { bytes_left -= c; } - // Calculate the digest to this point. - base::MD5Digest calculated_digest; - base::MD5Final(&calculated_digest, &context); - - // Read the stored digest and verify it. 
- base::MD5Digest file_digest; - if (!ReadItem(&file_digest, file_.get(), NULL)) - return OnCorruptDatabase(); - if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) { + if (!ReadAndVerifyChecksum(file_.get(), &context)) { RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE); return OnCorruptDatabase(); } @@ -293,48 +710,29 @@ bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) { bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) { add_prefixes->clear(); + if (!base::PathExists(filename_)) + return true; - base::ScopedFILE file(base::OpenFile(filename_, "rb")); - if (file.get() == NULL) return false; - - FileHeader header; - if (!ReadAndVerifyHeader(filename_, file.get(), &header, NULL)) + StateInternal db_state; + if (!ReadDbStateHelper(filename_, &db_state)) return OnCorruptDatabase(); - size_t add_prefix_offset = header.add_chunk_count * sizeof(int32) + - header.sub_chunk_count * sizeof(int32); - if (!FileSkip(add_prefix_offset, file.get())) - return false; - - if (!ReadToContainer(add_prefixes, header.add_prefix_count, file.get(), NULL)) - return false; - + add_prefixes->swap(db_state.add_prefixes_); return true; } bool SafeBrowsingStoreFile::GetAddFullHashes( std::vector<SBAddFullHash>* add_full_hashes) { add_full_hashes->clear(); + if (!base::PathExists(filename_)) + return true; - base::ScopedFILE file(base::OpenFile(filename_, "rb")); - if (file.get() == NULL) return false; - - FileHeader header; - if (!ReadAndVerifyHeader(filename_, file.get(), &header, NULL)) + StateInternal db_state; + if (!ReadDbStateHelper(filename_, &db_state)) return OnCorruptDatabase(); - size_t offset = - header.add_chunk_count * sizeof(int32) + - header.sub_chunk_count * sizeof(int32) + - header.add_prefix_count * sizeof(SBAddPrefix) + - header.sub_prefix_count * sizeof(SBSubPrefix); - if (!FileSkip(offset, file.get())) - return false; - - return ReadToContainer(add_full_hashes, - header.add_hash_count, - file.get(), - NULL); + add_full_hashes->swap(db_state.add_full_hashes_); + return true; } bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id, @@ -415,15 +813,26 @@ bool SafeBrowsingStoreFile::BeginUpdate() { return true; } + base::MD5Context context; FileHeader header; - if (!ReadItem(&header, file.get(), NULL)) - return OnCorruptDatabase(); - - if (header.magic != kFileMagic || header.version != kFileVersion) { - if (!strcmp(reinterpret_cast<char*>(&header.magic), "SQLite format 3")) { - RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE); - } else { - RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN); + const int version = + ReadAndVerifyHeader(filename_, &header, + &add_chunks_cache_, &sub_chunks_cache_, + file.get(), &context); + if (version == kInvalidVersion) { + FileHeaderV8 retry_header; + if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL) && + (retry_header.magic != kFileMagic || + (retry_header.version != 8 && retry_header.version != 7))) { + // TODO(shess): Think on whether these histograms are generating any + // actionable data. I kid you not, SQLITE happens many thousands of times + // per day, UNKNOWN about 3x higher than that. + if (!strcmp(reinterpret_cast<char*>(&retry_header.magic), + "SQLite format 3")) { + RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE); + } else { + RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN); + } } // Close the file so that it can be deleted. 
@@ -432,20 +841,6 @@ bool SafeBrowsingStoreFile::BeginUpdate() { return OnCorruptDatabase(); } - // TODO(shess): Under POSIX it is possible that this could size a - // file different from the file which was opened. - if (!FileHeaderSanityCheck(filename_, header)) - return OnCorruptDatabase(); - - // Pull in the chunks-seen data for purposes of implementing - // |GetAddChunks()| and |GetSubChunks()|. This data is sent up to - // the server at the beginning of an update. - if (!ReadToContainer(&add_chunks_cache_, header.add_chunk_count, - file.get(), NULL) || - !ReadToContainer(&sub_chunks_cache_, header.sub_chunk_count, - file.get(), NULL)) - return OnCorruptDatabase(); - file_.swap(file); new_file_.swap(new_file); return true; @@ -485,80 +880,25 @@ bool SafeBrowsingStoreFile::DoUpdate( CHECK(builder); CHECK(add_full_hashes_result); - SBAddPrefixes add_prefixes; - SBSubPrefixes sub_prefixes; - std::vector<SBAddFullHash> add_full_hashes; - std::vector<SBSubFullHash> sub_full_hashes; - - // Read original data into the vectors. - if (!empty_) { - DCHECK(file_.get()); - - if (!FileRewind(file_.get())) - return OnCorruptDatabase(); - - base::MD5Context context; - base::MD5Init(&context); - - // Read the file header and make sure it looks right. - FileHeader header; - if (!ReadAndVerifyHeader(filename_, file_.get(), &header, &context)) - return OnCorruptDatabase(); - - // Re-read the chunks-seen data to get to the later data in the - // file and calculate the checksum. No new elements should be - // added to the sets. - if (!ReadToContainer(&add_chunks_cache_, header.add_chunk_count, - file_.get(), &context) || - !ReadToContainer(&sub_chunks_cache_, header.sub_chunk_count, - file_.get(), &context)) - return OnCorruptDatabase(); - - if (!ReadToContainer(&add_prefixes, header.add_prefix_count, - file_.get(), &context) || - !ReadToContainer(&sub_prefixes, header.sub_prefix_count, - file_.get(), &context) || - !ReadToContainer(&add_full_hashes, header.add_hash_count, - file_.get(), &context) || - !ReadToContainer(&sub_full_hashes, header.sub_hash_count, - file_.get(), &context)) - return OnCorruptDatabase(); - - // Calculate the digest to this point. - base::MD5Digest calculated_digest; - base::MD5Final(&calculated_digest, &context); - - // Read the stored checksum and verify it. - base::MD5Digest file_digest; - if (!ReadItem(&file_digest, file_.get(), NULL)) - return OnCorruptDatabase(); - - if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) { - RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE); - return OnCorruptDatabase(); - } - - // Close the file so we can later rename over it. - file_.reset(); - } - DCHECK(!file_.get()); - // Rewind the temporary storage. if (!FileRewind(new_file_.get())) return false; // Get chunk file's size for validating counts. - int64 size = 0; - if (!base::GetFileSize(TemporaryFileForFilename(filename_), &size)) + int64 update_size = 0; + if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size)) return OnCorruptDatabase(); // Track update size to answer questions at http://crbug.com/72216 . // Log small updates as 1k so that the 0 (underflow) bucket can be // used for "empty" in SafeBrowsingDatabase. UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes", - std::max(static_cast<int>(size / 1024), 1)); + std::max(static_cast<int>(update_size / 1024), 1)); - // Append the accumulated chunks onto the vectors read from |file_|. + // Chunk updates to integrate. + StateInternal new_state(pending_adds); + + // Read update chunks. 
for (int i = 0; i < chunks_written_; ++i) { ChunkHeader header; @@ -576,73 +916,202 @@ bool SafeBrowsingStoreFile::DoUpdate( expected_size += header.sub_prefix_count * sizeof(SBSubPrefix); expected_size += header.add_hash_count * sizeof(SBAddFullHash); expected_size += header.sub_hash_count * sizeof(SBSubFullHash); - if (expected_size > size) + if (expected_size > update_size) return false; - // TODO(shess): If the vectors were kept sorted, then this code - // could use std::inplace_merge() to merge everything together in - // sorted order. That might still be slower than just sorting at - // the end if there were a large number of chunks. In that case - // some sort of recursive binary merge might be in order (merge - // chunks pairwise, merge those chunks pairwise, and so on, then - // merge the result with the main list). - if (!ReadToContainer(&add_prefixes, header.add_prefix_count, - new_file_.get(), NULL) || - !ReadToContainer(&sub_prefixes, header.sub_prefix_count, - new_file_.get(), NULL) || - !ReadToContainer(&add_full_hashes, header.add_hash_count, - new_file_.get(), NULL) || - !ReadToContainer(&sub_full_hashes, header.sub_hash_count, - new_file_.get(), NULL)) + if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count, + header.add_hash_count, header.sub_hash_count, + new_file_.get(), NULL)) { return false; + } } - // Append items from |pending_adds|. - add_full_hashes.insert(add_full_hashes.end(), - pending_adds.begin(), pending_adds.end()); + // The state was accumulated by chunk, sort by prefix. + new_state.SortData(); + + // These strides control how much data is loaded into memory per pass. + // Strides must be an even power of two. |in_stride| will be derived from the + // input file. |out_stride| will be derived from an estimate of the resulting + // file's size. |process_stride| will be the max of both. + uint64 in_stride = kMaxShardStride; + uint64 out_stride = kMaxShardStride; + uint64 process_stride = 0; + + // The header info is only used later if |!empty_|. The v8 read loop only + // needs |in_stride|, while v7 needs to refer to header information. + base::MD5Context in_context; + int version = kInvalidVersion; + FileHeader header; - // Knock the subs from the adds and process deleted chunks. - SBProcessSubs(&add_prefixes, &sub_prefixes, - &add_full_hashes, &sub_full_hashes, - add_del_cache_, sub_del_cache_); + if (!empty_) { + DCHECK(file_.get()); + + version = ReadAndVerifyHeader(filename_, &header, + &add_chunks_cache_, &sub_chunks_cache_, + file_.get(), &in_context); + if (version == kInvalidVersion) + return OnCorruptDatabase(); + + if (version == 8 && header.v8.shard_stride) + in_stride = header.v8.shard_stride; + + // The header checksum should have prevented this case, but the code will be + // broken if this is not correct. + if (!IsPowerOfTwo(in_stride)) + return OnCorruptDatabase(); + } // We no longer need to track deleted chunks. DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_); DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_); - // Write the new data to new_file_. - if (!FileRewind(new_file_.get())) - return false; + // Calculate |out_stride| to break the file down into reasonable shards. + { + int64 original_size = 0; + if (!empty_ && !base::GetFileSize(filename_, &original_size)) + return OnCorruptDatabase(); - base::MD5Context context; - base::MD5Init(&context); + // Approximate the final size as everything. Subs and deletes will reduce + // the size, but modest over-sharding won't hurt much. 
+ int64 shard_size = original_size + update_size; - // Write a file header. - FileHeader header; - header.magic = kFileMagic; - header.version = kFileVersion; - header.add_chunk_count = add_chunks_cache_.size(); - header.sub_chunk_count = sub_chunks_cache_.size(); - header.add_prefix_count = add_prefixes.size(); - header.sub_prefix_count = sub_prefixes.size(); - header.add_hash_count = add_full_hashes.size(); - header.sub_hash_count = sub_full_hashes.size(); - if (!WriteItem(header, new_file_.get(), &context)) - return false; + // Keep splitting until a single stride of data fits the target. + size_t shifts = 0; + while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) { + out_stride >>= 1; + shard_size >>= 1; + ++shifts; + } + UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts); + + DCHECK(IsPowerOfTwo(out_stride)); + } - // Write all the chunk data. - if (!WriteContainer(add_chunks_cache_, new_file_.get(), &context) || - !WriteContainer(sub_chunks_cache_, new_file_.get(), &context) || - !WriteContainer(add_prefixes, new_file_.get(), &context) || - !WriteContainer(sub_prefixes, new_file_.get(), &context) || - !WriteContainer(add_full_hashes, new_file_.get(), &context) || - !WriteContainer(sub_full_hashes, new_file_.get(), &context)) + // Outer loop strides by the max of the input stride (to read integral shards) + // and the output stride (to write integral shards). + process_stride = std::max(in_stride, out_stride); + DCHECK(IsPowerOfTwo(process_stride)); + DCHECK_EQ(0u, process_stride % in_stride); + DCHECK_EQ(0u, process_stride % out_stride); + + // Start writing the new data to |new_file_|. + base::MD5Context out_context; + if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_, + new_file_.get(), &out_context)) { return false; + } + + // Start at the beginning of the SBPrefix space. + uint64 in_min = 0; + uint64 out_min = 0; + uint64 process_min = 0; + + // Start at the beginning of the updates. + StateInternalPos new_pos = new_state.StateBegin(); + + // Re-usable container for shard processing. + StateInternal db_state; + + // Track aggregate counts for histograms. + size_t add_prefix_count = 0; + size_t sub_prefix_count = 0; + + do { + // Maximum element in the current shard. + SBPrefix process_max = + static_cast<SBPrefix>(process_min + process_stride - 1); + DCHECK_GT(process_max, process_min); + + // Drop the data from previous pass. + db_state.ClearData(); + + // Fill the processing shard with one or more input shards. + if (!empty_) { + if (version == 7) { + // Treat v7 as a single-shard file. + DCHECK_EQ(in_min, 0u); + DCHECK_EQ(in_stride, kMaxShardStride); + DCHECK_EQ(process_stride, kMaxShardStride); + if (!db_state.AppendData(header.v7.add_prefix_count, + header.v7.sub_prefix_count, + header.v7.add_hash_count, + header.v7.sub_hash_count, + file_.get(), &in_context)) + return OnCorruptDatabase(); + + // v7 data is not sorted correctly. + db_state.SortData(); + } else { + do { + ShardHeader shard_header; + if (!ReadItem(&shard_header, file_.get(), &in_context)) + return OnCorruptDatabase(); + + if (!db_state.AppendData(shard_header.add_prefix_count, + shard_header.sub_prefix_count, + shard_header.add_hash_count, + shard_header.sub_hash_count, + file_.get(), &in_context)) + return OnCorruptDatabase(); + + in_min += in_stride; + } while (in_min <= kMaxSBPrefix && in_min < process_max); + } + } - // Write the checksum at the end. 
- base::MD5Digest digest; - base::MD5Final(&digest, &context); - if (!WriteItem(digest, new_file_.get(), NULL)) + // Shard the update data to match the database data, then merge the update + // data and process the results. + { + StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max); + db_state.MergeDataAndProcess(new_pos, new_end, + add_del_cache_, sub_del_cache_); + new_pos = new_end; + } + + // Collect the processed data for return to caller. + for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) { + builder->AddPrefix(db_state.add_prefixes_[i].prefix); + } + add_full_hashes_result->insert(add_full_hashes_result->end(), + db_state.add_full_hashes_.begin(), + db_state.add_full_hashes_.end()); + add_prefix_count += db_state.add_prefixes_.size(); + sub_prefix_count += db_state.sub_prefixes_.size(); + + // Write one or more shards of processed output. + StateInternalPos out_pos = db_state.StateBegin(); + do { + SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1); + DCHECK_GT(out_max, out_min); + + StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max); + if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context)) + return false; + out_pos = out_end; + + out_min += out_stride; + } while (out_min == static_cast<SBPrefix>(out_min) && + out_min < process_max); + + process_min += process_stride; + } while (process_min <= kMaxSBPrefix); + + // Verify the overall checksum. + if (!empty_) { + if (!ReadAndVerifyChecksum(file_.get(), &in_context)) + return OnCorruptDatabase(); + + // TODO(shess): Verify EOF? + + // Close the input file so the new file can be renamed over it. + file_.reset(); + } + DCHECK(!file_.get()); + + // Write the overall checksum. + base::MD5Digest out_digest; + base::MD5Final(&out_digest, &out_context); + if (!WriteItem(out_digest, new_file_.get(), NULL)) return false; // Trim any excess left over from the temporary chunk data. @@ -660,15 +1129,8 @@ bool SafeBrowsingStoreFile::DoUpdate( return false; // Record counts before swapping to caller. - UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefixes.size()); - UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefixes.size()); - - // Pass the resulting data off to the caller. - for (SBAddPrefixes::const_iterator iter = add_prefixes.begin(); - iter != add_prefixes.end(); ++iter) { - builder->AddPrefix(iter->prefix); - } - add_full_hashes_result->swap(add_full_hashes); + UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count); + UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count); return true; } diff --git a/chrome/browser/safe_browsing/safe_browsing_store_file.h b/chrome/browser/safe_browsing/safe_browsing_store_file.h index 6b821b0..c8a2429 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_file.h +++ b/chrome/browser/safe_browsing/safe_browsing_store_file.h @@ -21,38 +21,60 @@ // int32 version; // format version // // // Counts for the various data which follows the header. -// uint32 add_chunk_count; // Chunks seen, including empties. -// uint32 sub_chunk_count; // Ditto. -// uint32 add_prefix_count; -// uint32 sub_prefix_count; -// uint32 add_hash_count; -// uint32 sub_hash_count; -// +// uint32 add_chunk_count; // Chunks seen, including empties. +// uint32 sub_chunk_count; // Ditto. +// uint32 shard_stride; // SBPrefix space covered per shard. +// // 0==entire space in one shard. +// // Sorted by chunk_id. // array[add_chunk_count] { // int32 chunk_id; // } +// // Sorted by chunk_id. 
// array[sub_chunk_count] { // int32 chunk_id; // } -// array[add_prefix_count] { -// int32 chunk_id; -// uint32 prefix; -// } -// array[sub_prefix_count] { -// int32 chunk_id; -// int32 add_chunk_id; -// uint32 add_prefix; -// } -// array[add_hash_count] { -// int32 chunk_id; -// int32 received_time; // From base::Time::ToTimeT(). -// char[32] full_hash; -// array[sub_hash_count] { -// int32 chunk_id; -// int32 add_chunk_id; -// char[32] add_full_hash; +// MD5Digest header_checksum; // Checksum over preceeding data. +// +// // Sorted by prefix, then add chunk_id, then hash, both within shards and +// // overall. +// array[from 0 to wraparound to 0 by shard_stride] { +// uint32 add_prefix_count; +// uint32 sub_prefix_count; +// uint32 add_hash_count; +// uint32 sub_hash_count; +// array[add_prefix_count] { +// int32 chunk_id; +// uint32 prefix; +// } +// array[sub_prefix_count] { +// int32 chunk_id; +// int32 add_chunk_id; +// uint32 add_prefix; +// } +// array[add_hash_count] { +// int32 chunk_id; +// int32 received_time; // From base::Time::ToTimeT(). +// char[32] full_hash; +// } +// array[sub_hash_count] { +// int32 chunk_id; +// int32 add_chunk_id; +// char[32] add_full_hash; +// } // } -// MD5Digest checksum; // Checksum over preceeding data. +// MD5Digest checksum; // Checksum over entire file. +// +// The checksums are used to allow writing the file without doing an expensive +// fsync(). Since the data can be re-fetched, failing the checksum is not +// catastrophic. Histograms indicate that file corruption here is pretty +// uncommon. +// +// The |header_checksum| is present to guarantee valid header and chunk data for +// updates. Only that part of the file needs to be read to post the update. +// +// |shard_stride| breaks the file into approximately-equal portions, allowing +// updates to stream from one file to another with modest memory usage. It is +// dynamic to adjust to different file sizes without adding excessive overhead. // // During the course of an update, uncommitted data is stored in a // temporary file (which is later re-used to commit). This is an @@ -91,19 +113,15 @@ // - Open a temp file for storing new chunk info. // - Write new chunks to the temp file. // - When the transaction is finished: -// - Read the rest of the original file's data into buffers. -// - Rewind the temp file and merge the new data into buffers. -// - Process buffers for deletions and apply subs. -// - Rewind and write the buffers out to temp file. +// - Read the update data from the temp file into memory. +// - Overwrite the temp file with new header data. +// - Until done: +// - Read shards of the original file's data into memory. +// - Merge from the update data. +// - Write shards to the temp file. // - Delete original file. // - Rename temp file to original filename. -// TODO(shess): By using a checksum, this code can avoid doing an -// fsync(), at the possible cost of more frequently retrieving the -// full dataset. Measure how often this occurs, and if it occurs too -// often, consider retaining the last known-good file for recovery -// purposes, rather than deleting it. 
- class SafeBrowsingStoreFile : public SafeBrowsingStore { public: SafeBrowsingStoreFile(); diff --git a/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc b/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc index 4f52241..c29fa60 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc +++ b/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc @@ -28,6 +28,9 @@ const SBFullHash kHash3 = SBFullHashForString("three"); const SBFullHash kHash4 = SBFullHashForString("four"); const SBFullHash kHash5 = SBFullHashForString("five"); +const SBPrefix kMinSBPrefix = 0u; +const SBPrefix kMaxSBPrefix = ~kMinSBPrefix; + class SafeBrowsingStoreFileTest : public PlatformTest { public: virtual void SetUp() { @@ -58,7 +61,7 @@ class SafeBrowsingStoreFileTest : public PlatformTest { // Populate the store with some testing data. void PopulateStore(const base::Time& now) { - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); EXPECT_TRUE(store_->BeginChunk()); store_->SetAddChunk(kAddChunk1); @@ -85,7 +88,17 @@ class SafeBrowsingStoreFileTest : public PlatformTest { EXPECT_TRUE(store_->FinishUpdate(pending_adds, &builder, &add_full_hashes_result)); -} + } + + // Manually read the shard stride info from the file. + uint32 ReadStride() { + base::ScopedFILE file(base::OpenFile(filename_, "rb")); + const long kOffset = 4 * sizeof(uint32); + EXPECT_EQ(fseek(file.get(), kOffset, SEEK_SET), 0); + uint32 shard_stride = 0; + EXPECT_EQ(fread(&shard_stride, sizeof(shard_stride), 1, file.get()), 1U); + return shard_stride; + } base::ScopedTempDir temp_dir_; base::FilePath filename_; @@ -95,7 +108,7 @@ class SafeBrowsingStoreFileTest : public PlatformTest { // Test that the empty store looks empty. TEST_F(SafeBrowsingStoreFileTest, Empty) { - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); std::vector<int> chunks; store_->GetAddChunks(&chunks); @@ -132,7 +145,7 @@ TEST_F(SafeBrowsingStoreFileTest, StorePrefix) { const base::Time now = base::Time::Now(); PopulateStore(now); - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); std::vector<int> chunks; store_->GetAddChunks(&chunks); @@ -164,7 +177,7 @@ TEST_F(SafeBrowsingStoreFileTest, StorePrefix) { EXPECT_TRUE(SBFullHashEqual(kHash2, add_full_hashes_result[0].full_hash)); } - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); // Still has the chunks expected in the next update. store_->GetAddChunks(&chunks); @@ -200,9 +213,63 @@ TEST_F(SafeBrowsingStoreFileTest, StorePrefix) { } } +// Verify that the min and max prefixes are stored and operated on. 
+TEST_F(SafeBrowsingStoreFileTest, PrefixMinMax) { + const base::Time now = base::Time::Now(); + PopulateStore(now); + + ASSERT_TRUE(store_->BeginUpdate()); + + EXPECT_TRUE(store_->BeginChunk()); + store_->SetAddChunk(kAddChunk2); + EXPECT_TRUE(store_->WriteAddPrefix(kAddChunk2, kMinSBPrefix)); + EXPECT_TRUE(store_->WriteAddPrefix(kAddChunk2, kMaxSBPrefix)); + EXPECT_TRUE(store_->FinishChunk()); + + { + std::vector<SBAddFullHash> pending_adds; + safe_browsing::PrefixSetBuilder builder; + std::vector<SBAddFullHash> add_full_hashes_result; + EXPECT_TRUE(store_->FinishUpdate(pending_adds, + &builder, + &add_full_hashes_result)); + + std::vector<SBPrefix> prefixes_result; + builder.GetPrefixSet()->GetPrefixes(&prefixes_result); + ASSERT_EQ(4U, prefixes_result.size()); + EXPECT_EQ(kMinSBPrefix, prefixes_result[0]); + EXPECT_EQ(kHash1.prefix, prefixes_result[1]); + EXPECT_EQ(kHash2.prefix, prefixes_result[2]); + EXPECT_EQ(kMaxSBPrefix, prefixes_result[3]); + } + + ASSERT_TRUE(store_->BeginUpdate()); + + EXPECT_TRUE(store_->BeginChunk()); + store_->SetAddChunk(kSubChunk2); + EXPECT_TRUE(store_->WriteSubPrefix(kSubChunk2, kAddChunk2, kMinSBPrefix)); + EXPECT_TRUE(store_->WriteSubPrefix(kSubChunk2, kAddChunk2, kMaxSBPrefix)); + EXPECT_TRUE(store_->FinishChunk()); + + { + std::vector<SBAddFullHash> pending_adds; + safe_browsing::PrefixSetBuilder builder; + std::vector<SBAddFullHash> add_full_hashes_result; + EXPECT_TRUE(store_->FinishUpdate(pending_adds, + &builder, + &add_full_hashes_result)); + + std::vector<SBPrefix> prefixes_result; + builder.GetPrefixSet()->GetPrefixes(&prefixes_result); + ASSERT_EQ(2U, prefixes_result.size()); + EXPECT_EQ(kHash1.prefix, prefixes_result[0]); + EXPECT_EQ(kHash2.prefix, prefixes_result[1]); + } +} + // Test that subs knockout adds. TEST_F(SafeBrowsingStoreFileTest, SubKnockout) { - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); const base::Time now = base::Time::Now(); @@ -233,7 +300,7 @@ TEST_F(SafeBrowsingStoreFileTest, SubKnockout) { EXPECT_TRUE(add_full_hashes_result.empty()); } - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); // This add should be knocked out by an existing sub. EXPECT_TRUE(store_->BeginChunk()); @@ -256,7 +323,7 @@ TEST_F(SafeBrowsingStoreFileTest, SubKnockout) { EXPECT_TRUE(add_full_hashes_result.empty()); } - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); // But by here the sub should be gone, so it should stick this time. EXPECT_TRUE(store_->BeginChunk()); @@ -283,7 +350,7 @@ TEST_F(SafeBrowsingStoreFileTest, SubKnockout) { // Test that deletes delete the chunk's data. TEST_F(SafeBrowsingStoreFileTest, DeleteChunks) { - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); const base::Time now = base::Time::Now(); @@ -349,7 +416,7 @@ TEST_F(SafeBrowsingStoreFileTest, DeleteChunks) { } // Expected chunks are there in another update. - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); EXPECT_FALSE(store_->CheckAddChunk(kAddChunk1)); EXPECT_TRUE(store_->CheckAddChunk(kAddChunk2)); EXPECT_FALSE(store_->CheckSubChunk(kSubChunk1)); @@ -369,7 +436,7 @@ TEST_F(SafeBrowsingStoreFileTest, DeleteChunks) { } // Expect no more chunks. 
- EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); EXPECT_FALSE(store_->CheckAddChunk(kAddChunk1)); EXPECT_FALSE(store_->CheckAddChunk(kAddChunk2)); EXPECT_FALSE(store_->CheckSubChunk(kSubChunk1)); @@ -413,7 +480,7 @@ TEST_F(SafeBrowsingStoreFileTest, DeleteTemp) { EXPECT_FALSE(base::PathExists(temp_file)); // Starting a transaction creates a temporary file. - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); EXPECT_TRUE(base::PathExists(temp_file)); // Pull the rug out from under the existing store, simulating a @@ -440,7 +507,7 @@ TEST_F(SafeBrowsingStoreFileTest, DetectsCorruption) { std::vector<SBPrefix> orig_prefixes; std::vector<SBAddFullHash> orig_hashes; safe_browsing::PrefixSetBuilder builder; - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); EXPECT_TRUE(store_->FinishUpdate(pending_adds, &builder, &orig_hashes)); builder.GetPrefixSet()->GetPrefixes(&orig_prefixes); EXPECT_GT(orig_prefixes.size(), 0U); @@ -466,7 +533,7 @@ TEST_F(SafeBrowsingStoreFileTest, DetectsCorruption) { { std::vector<SBAddFullHash> pending_adds; safe_browsing::PrefixSetBuilder builder; - EXPECT_TRUE(store_->BeginUpdate()); + ASSERT_TRUE(store_->BeginUpdate()); EXPECT_FALSE(store_->FinishUpdate(pending_adds, &builder, &add_hashes)); EXPECT_TRUE(corruption_detected_); } @@ -505,14 +572,13 @@ TEST_F(SafeBrowsingStoreFileTest, CheckValidity) { EXPECT_TRUE(store_->CancelUpdate()); } -// Corrupt the payload. -TEST_F(SafeBrowsingStoreFileTest, CheckValidityPayload) { +// Corrupt the header. +TEST_F(SafeBrowsingStoreFileTest, CheckValidityHeader) { PopulateStore(base::Time::Now()); EXPECT_TRUE(base::PathExists(filename_)); - // 37 is the most random prime number. It's also past the header, - // as corrupting the header would fail BeginUpdate() in which case - // CheckValidity() cannot be called. + // 37 is the most random prime number. It's also past the initial header + // struct, somewhere in the chunk list. const size_t kOffset = 37; { @@ -520,6 +586,25 @@ TEST_F(SafeBrowsingStoreFileTest, CheckValidityPayload) { EXPECT_EQ(0, fseek(file.get(), kOffset, SEEK_SET)); EXPECT_GE(fputs("hello", file.get()), 0); } + ASSERT_FALSE(store_->BeginUpdate()); + EXPECT_TRUE(corruption_detected_); +} + +// Corrupt the prefix payload. +TEST_F(SafeBrowsingStoreFileTest, CheckValidityPayload) { + PopulateStore(base::Time::Now()); + EXPECT_TRUE(base::PathExists(filename_)); + + // 137 is the second most random prime number. It's also past the header and + // chunk-id area. Corrupting the header would fail BeginUpdate() in which + // case CheckValidity() cannot be called. 
+ const size_t kOffset = 137; + + { + base::ScopedFILE file(base::OpenFile(filename_, "rb+")); + EXPECT_EQ(0, fseek(file.get(), kOffset, SEEK_SET)); + EXPECT_GE(fputs("hello", file.get()), 0); + } ASSERT_TRUE(store_->BeginUpdate()); EXPECT_FALSE(corruption_detected_); EXPECT_FALSE(store_->CheckValidity()); @@ -547,4 +632,131 @@ TEST_F(SafeBrowsingStoreFileTest, CheckValidityChecksum) { EXPECT_TRUE(store_->CancelUpdate()); } +TEST_F(SafeBrowsingStoreFileTest, GetAddPrefixesAndHashes) { + ASSERT_TRUE(store_->BeginUpdate()); + + const base::Time now = base::Time::Now(); + + EXPECT_TRUE(store_->BeginChunk()); + store_->SetAddChunk(kAddChunk1); + EXPECT_TRUE(store_->CheckAddChunk(kAddChunk1)); + EXPECT_TRUE(store_->WriteAddPrefix(kAddChunk1, kHash1.prefix)); + EXPECT_TRUE(store_->WriteAddPrefix(kAddChunk1, kHash2.prefix)); + EXPECT_TRUE(store_->WriteAddHash(kAddChunk1, now, kHash2)); + + store_->SetSubChunk(kSubChunk1); + EXPECT_TRUE(store_->CheckSubChunk(kSubChunk1)); + EXPECT_TRUE(store_->WriteSubPrefix(kSubChunk1, kAddChunk3, kHash3.prefix)); + EXPECT_TRUE(store_->WriteSubHash(kSubChunk1, kAddChunk3, kHash3)); + EXPECT_TRUE(store_->FinishChunk()); + + // Chunk numbers shouldn't leak over. + EXPECT_FALSE(store_->CheckAddChunk(kSubChunk1)); + EXPECT_FALSE(store_->CheckAddChunk(kAddChunk3)); + EXPECT_FALSE(store_->CheckSubChunk(kAddChunk1)); + + std::vector<int> chunks; + store_->GetAddChunks(&chunks); + ASSERT_EQ(1U, chunks.size()); + EXPECT_EQ(kAddChunk1, chunks[0]); + + store_->GetSubChunks(&chunks); + ASSERT_EQ(1U, chunks.size()); + EXPECT_EQ(kSubChunk1, chunks[0]); + + safe_browsing::PrefixSetBuilder builder; + std::vector<SBAddFullHash> add_full_hashes_result; + EXPECT_TRUE(store_->FinishUpdate(std::vector<SBAddFullHash>(), + &builder, + &add_full_hashes_result)); + + SBAddPrefixes add_prefixes; + EXPECT_TRUE(store_->GetAddPrefixes(&add_prefixes)); + ASSERT_EQ(2U, add_prefixes.size()); + EXPECT_EQ(kAddChunk1, add_prefixes[0].chunk_id); + EXPECT_EQ(kHash1.prefix, add_prefixes[0].prefix); + EXPECT_EQ(kAddChunk1, add_prefixes[1].chunk_id); + EXPECT_EQ(kHash2.prefix, add_prefixes[1].prefix); + + std::vector<SBAddFullHash> add_hashes; + EXPECT_TRUE(store_->GetAddFullHashes(&add_hashes)); + ASSERT_EQ(1U, add_hashes.size()); + EXPECT_EQ(kAddChunk1, add_hashes[0].chunk_id); + EXPECT_TRUE(SBFullHashEqual(kHash2, add_hashes[0].full_hash)); +} + +// Test that the database handles resharding correctly, both when growing and +// which shrinking. +TEST_F(SafeBrowsingStoreFileTest, Resharding) { + // Loop through multiple stride boundaries (1<<32, 1<<31, 1<<30, 1<<29). + const uint32 kTargetStride = 1 << 29; + + // Each chunk will require 8 bytes per prefix, plus 4 bytes for chunk + // information. It should be less than |kTargetFootprint| in the + // implementation, but high enough to keep the number of rewrites modest (to + // keep the test fast). + const size_t kPrefixesPerChunk = 10000; + + uint32 shard_stride = 0; + int chunk_id = 1; + + // Add a series of chunks, tracking that the stride size changes in a + // direction appropriate to increasing file size. 
+ do { + ASSERT_TRUE(store_->BeginUpdate()); + + EXPECT_TRUE(store_->BeginChunk()); + store_->SetAddChunk(chunk_id); + EXPECT_TRUE(store_->CheckAddChunk(chunk_id)); + for (size_t i = 0; i < kPrefixesPerChunk; ++i) { + EXPECT_TRUE(store_->WriteAddPrefix(chunk_id, static_cast<SBPrefix>(i))); + } + EXPECT_TRUE(store_->FinishChunk()); + + std::vector<SBAddFullHash> pending_adds; + safe_browsing::PrefixSetBuilder builder; + std::vector<SBAddFullHash> add_full_hashes_result; + EXPECT_TRUE(store_->FinishUpdate(pending_adds, + &builder, + &add_full_hashes_result)); + + SBAddPrefixes add_prefixes; + EXPECT_TRUE(store_->GetAddPrefixes(&add_prefixes)); + ASSERT_EQ(chunk_id * kPrefixesPerChunk, add_prefixes.size()); + + // New stride should be the same, or shifted one right. + const uint32 new_shard_stride = ReadStride(); + EXPECT_TRUE((new_shard_stride == shard_stride) || + ((new_shard_stride << 1) == shard_stride)); + shard_stride = new_shard_stride; + ++chunk_id; + } while (!shard_stride || shard_stride > kTargetStride); + + // Guard against writing too many chunks. If this gets too big, adjust + // |kPrefixesPerChunk|. + EXPECT_LT(chunk_id, 20); + + // Remove each chunk and check that the stride goes back to 0. + while (--chunk_id) { + ASSERT_TRUE(store_->BeginUpdate()); + EXPECT_TRUE(store_->CheckAddChunk(chunk_id)); + EXPECT_FALSE(store_->CheckAddChunk(chunk_id + 1)); + store_->DeleteAddChunk(chunk_id); + + std::vector<SBAddFullHash> pending_adds; + safe_browsing::PrefixSetBuilder builder; + std::vector<SBAddFullHash> add_full_hashes_result; + EXPECT_TRUE(store_->FinishUpdate(pending_adds, + &builder, + &add_full_hashes_result)); + + // New stride should be the same, or shifted one left. + const uint32 new_shard_stride = ReadStride(); + EXPECT_TRUE((new_shard_stride == shard_stride) || + (new_shard_stride == (shard_stride << 1))); + shard_stride = new_shard_stride; + } + EXPECT_EQ(0u, shard_stride); +} + } // namespace diff --git a/chrome/browser/safe_browsing/safe_browsing_store_unittest.cc b/chrome/browser/safe_browsing/safe_browsing_store_unittest.cc index 199138e..420ed1c 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_unittest.cc +++ b/chrome/browser/safe_browsing/safe_browsing_store_unittest.cc @@ -178,6 +178,15 @@ TEST(SafeBrowsingStoreTest, SBProcessSubsKnockout) { add_hashes.push_back(SBAddFullHash(kAddChunk1, kNow, kHash4mod)); sub_hashes.push_back(SBSubFullHash(kSubChunk1, kAddChunk1, kHash4mod)); + std::sort(add_prefixes.begin(), add_prefixes.end(), + SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); + std::sort(sub_prefixes.begin(), sub_prefixes.end(), + SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); + std::sort(add_hashes.begin(), add_hashes.end(), + SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); + std::sort(sub_hashes.begin(), sub_hashes.end(), + SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); + const base::hash_set<int32> no_deletions; SBProcessSubs(&add_prefixes, &sub_prefixes, &add_hashes, &sub_hashes, no_deletions, no_deletions); @@ -249,6 +258,15 @@ TEST(SafeBrowsingStoreTest, SBProcessSubsDeleteChunk) { sub_hashes.push_back(SBSubFullHash(kSubChunk1, kAddChunk1, kHash3)); sub_prefixes.push_back(SBSubPrefix(kSubChunk1, kAddChunk1, kHash3.prefix)); + std::sort(add_prefixes.begin(), add_prefixes.end(), + SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); + std::sort(sub_prefixes.begin(), sub_prefixes.end(), + SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); + std::sort(add_hashes.begin(), add_hashes.end(), + SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); + 
std::sort(sub_hashes.begin(), sub_hashes.end(), + SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); + const base::hash_set<int32> no_deletions; base::hash_set<int32> add_deletions; add_deletions.insert(kAddChunk1); @@ -268,6 +286,15 @@ TEST(SafeBrowsingStoreTest, SBProcessSubsDeleteChunk) { EXPECT_EQ(kAddChunk1, sub_hashes[0].add_chunk_id); EXPECT_TRUE(SBFullHashEqual(kHash3, sub_hashes[0].full_hash)); + std::sort(add_prefixes.begin(), add_prefixes.end(), + SBAddPrefixLess<SBAddPrefix,SBAddPrefix>); + std::sort(sub_prefixes.begin(), sub_prefixes.end(), + SBAddPrefixLess<SBSubPrefix,SBSubPrefix>); + std::sort(add_hashes.begin(), add_hashes.end(), + SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>); + std::sort(sub_hashes.begin(), sub_hashes.end(), + SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>); + base::hash_set<int32> sub_deletions; sub_deletions.insert(kSubChunk1); SBProcessSubs(&add_prefixes, &sub_prefixes, &add_hashes, &sub_hashes, |