diff options
author | shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-02 23:32:41 +0000 |
---|---|---|
committer | shess@chromium.org <shess@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-02 23:32:41 +0000 |
commit | d5c3e6e4bce69022480c90a14600cc0a9c092a6a (patch) | |
tree | d48a4111d92233b895d8e791614686c39b318b39 /chrome/browser/safe_browsing | |
parent | 508b46914cf6c78923476e5f029408cde61b8163 (diff) | |
download | chromium_src-d5c3e6e4bce69022480c90a14600cc0a9c092a6a.zip chromium_src-d5c3e6e4bce69022480c90a14600cc0a9c092a6a.tar.gz chromium_src-d5c3e6e4bce69022480c90a14600cc0a9c092a6a.tar.bz2 |
Checksum SafeBrowsingStoreFile.
MD5 checksum verifies that the file read is the file that was written.
Also added sanity checks for the file header to prevent interpreting
random garbage as a valid count. Unit test corruption-detection.
BUG=none
TEST=none
Review URL: http://codereview.chromium.org/660334
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@40457 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/safe_browsing')
3 files changed, 209 insertions, 102 deletions
diff --git a/chrome/browser/safe_browsing/safe_browsing_store_file.cc b/chrome/browser/safe_browsing/safe_browsing_store_file.cc index ef58b44..7186388 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_file.cc +++ b/chrome/browser/safe_browsing/safe_browsing_store_file.cc @@ -5,6 +5,7 @@ #include "chrome/browser/safe_browsing/safe_browsing_store_file.h" #include "base/callback.h" +#include "base/md5.h" namespace { @@ -16,15 +17,15 @@ const int32 kFileVersion = 7; // SQLite storage was 6... // Header at the front of the main database file. struct FileHeader { int32 magic, version; - int32 add_chunk_count, sub_chunk_count; - int32 add_prefix_count, sub_prefix_count; - int32 add_hash_count, sub_hash_count; + uint32 add_chunk_count, sub_chunk_count; + uint32 add_prefix_count, sub_prefix_count; + uint32 add_hash_count, sub_hash_count; }; // Header for each chunk in the chunk-accumulation file. struct ChunkHeader { - int32 add_prefix_count, sub_prefix_count; - int32 add_hash_count, sub_hash_count; + uint32 add_prefix_count, sub_prefix_count; + uint32 add_hash_count, sub_hash_count; }; // Rewind the file. Using fseek(2) because rewind(3) errors are @@ -35,30 +36,41 @@ bool FileRewind(FILE* fp) { return rv == 0; } -// Read an array of |nmemb| items from |fp| into |ptr|. Return true -// on success. +// Read an array of |nmemb| items from |fp| into |ptr|, and fold the +// input data into the checksum in |context|, if non-NULL. Return +// true on success. template <class T> -bool ReadArray(T* ptr, size_t nmemb, FILE* fp) { +bool ReadArray(T* ptr, size_t nmemb, FILE* fp, MD5Context* context) { const size_t ret = fread(ptr, sizeof(T), nmemb, fp); if (ret != nmemb) return false; + + if (context) + MD5Update(context, ptr, sizeof(T) * nmemb); return true; } -// Write an array of |nmemb| items from |ptr| to |fp|. Return true on -// success. +// Write an array of |nmemb| items from |ptr| to |fp|, and fold the +// output data into the checksum in |context|, if non-NULL. 
Return +// true on success. template <class T> -bool WriteArray(const T* ptr, size_t nmemb, FILE* fp) { +bool WriteArray(const T* ptr, size_t nmemb, FILE* fp, MD5Context* context) { const size_t ret = fwrite(ptr, sizeof(T), nmemb, fp); if (ret != nmemb) return false; + + if (context) + MD5Update(context, ptr, sizeof(T) * nmemb); + return true; } -// Expand |values| to fit |count| new items, and read those items from -// |fp|. Returns true on success. +// Expand |values| to fit |count| new items, read those items from +// |fp| and fold them into the checksum in |context|. Returns true on +// success. template <class T> -bool ReadToVector(std::vector<T>* values, size_t count, FILE* fp) { +bool ReadToVector(std::vector<T>* values, size_t count, + FILE* fp, MD5Context* context) { // Pointers into an empty vector may not be valid. if (!count) return true; @@ -71,7 +83,7 @@ bool ReadToVector(std::vector<T>* values, size_t count, FILE* fp) { // Sayeth Herb Sutter: Vectors are guaranteed to be contiguous. So // get a pointer to where to read the data to. T* ptr = &((*values)[original_size]); - if (!ReadArray(ptr, count, fp)) { + if (!ReadArray(ptr, count, fp, context)) { values->resize(original_size); return false; } @@ -79,9 +91,10 @@ bool ReadToVector(std::vector<T>* values, size_t count, FILE* fp) { return true; } -// Write all of |values| to |fp|. Returns true on succsess. +// Write all of |values| to |fp|, and fold the data into the checksum +// in |context|, if non-NULL. Returns true on succsess. template <class T> -bool WriteVector(const std::vector<T>& values, FILE* fp) { +bool WriteVector(const std::vector<T>& values, FILE* fp, MD5Context* context) { // Pointers into empty vectors may not be valid. if (values.empty()) return true; @@ -89,7 +102,7 @@ bool WriteVector(const std::vector<T>& values, FILE* fp) { // Sayeth Herb Sutter: Vectors are guaranteed to be contiguous. So // get a pointer to where to write from. 
const T* ptr = &(values[0]); - return WriteArray(ptr, values.size(), fp); + return WriteArray(ptr, values.size(), fp, context); } // Remove deleted items (|chunk_id| in |del_set|) from the vector @@ -114,10 +127,11 @@ void RemoveDeleted(std::vector<T>* vec, size_t offset, // Combine |ReadToVector()| and |RemoveDeleted()|. Returns true on // success. template <class T> -bool ReadToVectorAndDelete(std::vector<T>* values, size_t count, FILE* fp, +bool ReadToVectorAndDelete(std::vector<T>* values, size_t count, + FILE* fp, MD5Context* context, const base::hash_set<int32>& del_set) { const size_t original_size = values->size(); - if (!ReadToVector(values, count, fp)) + if (!ReadToVector(values, count, fp, context)) return false; RemoveDeleted(values, original_size, del_set); @@ -126,12 +140,13 @@ bool ReadToVectorAndDelete(std::vector<T>* values, size_t count, FILE* fp, // Read an array of |count| integers and add them to |values|. // Returns true on success. -bool ReadToChunkSet(std::set<int32>* values, size_t count, FILE* fp) { +bool ReadToChunkSet(std::set<int32>* values, size_t count, + FILE* fp, MD5Context* context) { if (!count) return true; std::vector<int32> flat_values; - if (!ReadToVector(&flat_values, count, fp)) + if (!ReadToVector(&flat_values, count, fp, context)) return false; values->insert(flat_values.begin(), flat_values.end()); @@ -140,12 +155,13 @@ bool ReadToChunkSet(std::set<int32>* values, size_t count, FILE* fp) { // Write the contents of |values| as an array of integers. Returns // true on success. -bool WriteChunkSet(const std::set<int32>& values, FILE* fp) { +bool WriteChunkSet(const std::set<int32>& values, + FILE* fp, MD5Context* context) { if (values.empty()) return true; const std::vector<int32> flat_values(values.begin(), values.end()); - return WriteVector(flat_values, fp); + return WriteVector(flat_values, fp, context); } // Delete the chunks in |deleted| from |chunks|. 
@@ -248,14 +264,39 @@ bool SafeBrowsingStoreFile::BeginUpdate() { } FileHeader header; - if (!ReadArray(&header, 1, file.get())) - return OnCorruptDatabase(); + if (!ReadArray(&header, 1, file.get(), NULL)) + return OnCorruptDatabase(); if (header.magic != kFileMagic || header.version != kFileVersion) return OnCorruptDatabase(); - if (!ReadToChunkSet(&add_chunks_cache_, header.add_chunk_count, file.get()) || - !ReadToChunkSet(&sub_chunks_cache_, header.sub_chunk_count, file.get())) + // Check that the file size makes sense given the header. This is a + // cheap way to protect against header corruption while deferring + // the checksum calculation until the end of the update. + // TODO(shess): Under POSIX it is possible that this could size a + // file different from the file which was opened. + int64 size = 0; + if (!file_util::GetFileSize(filename_, &size)) + return OnCorruptDatabase(); + + int64 expected_size = sizeof(FileHeader); + expected_size += header.add_chunk_count * sizeof(int32); + expected_size += header.sub_chunk_count * sizeof(int32); + expected_size += header.add_prefix_count * sizeof(SBAddPrefix); + expected_size += header.sub_prefix_count * sizeof(SBSubPrefix); + expected_size += header.add_hash_count * sizeof(SBAddFullHash); + expected_size += header.sub_hash_count * sizeof(SBSubFullHash); + expected_size += sizeof(MD5Digest); + if (size != expected_size) + return OnCorruptDatabase(); + + // Pull in the chunks-seen data for purposes of implementing + // |GetAddChunks()| and |GetSubChunks()|. This data is sent up to + // the server at the beginning of an update. 
+ if (!ReadToChunkSet(&add_chunks_cache_, header.add_chunk_count, + file.get(), NULL) || + !ReadToChunkSet(&sub_chunks_cache_, header.sub_chunk_count, + file.get(), NULL)) return OnCorruptDatabase(); file_.swap(file); @@ -273,13 +314,13 @@ bool SafeBrowsingStoreFile::FinishChunk() { header.sub_prefix_count = sub_prefixes_.size(); header.add_hash_count = add_hashes_.size(); header.sub_hash_count = sub_hashes_.size(); - if (!WriteArray(&header, 1, new_file_.get())) + if (!WriteArray(&header, 1, new_file_.get(), NULL)) return false; - if (!WriteVector(add_prefixes_, new_file_.get()) || - !WriteVector(sub_prefixes_, new_file_.get()) || - !WriteVector(add_hashes_, new_file_.get()) || - !WriteVector(sub_hashes_, new_file_.get())) + if (!WriteVector(add_prefixes_, new_file_.get(), NULL) || + !WriteVector(sub_prefixes_, new_file_.get(), NULL) || + !WriteVector(add_hashes_, new_file_.get(), NULL) || + !WriteVector(sub_hashes_, new_file_.get(), NULL)) return false; ++chunks_written_; @@ -307,35 +348,45 @@ bool SafeBrowsingStoreFile::DoUpdate( if (!FileRewind(file_.get())) return OnCorruptDatabase(); + MD5Context context; + MD5Init(&context); + // Read the file header and make sure it looks right. FileHeader header; - if (!ReadArray(&header, 1, file_.get())) + if (!ReadArray(&header, 1, file_.get(), &context)) return OnCorruptDatabase(); if (header.magic != kFileMagic || header.version != kFileVersion) return OnCorruptDatabase(); // Re-read the chunks-seen data to get to the later data in the - // file. No new elements should be added to the sets. - // NOTE(shess): Reading rather than fseek() because calculating - // checksums (future CL) will need to scan all data. The code - // could just remember state from |BeginUpdate()|, but that call - // may be far removed from this call in time, so this seems like a - // reasonable trade-off. + // file and calculate the checksum. No new elements should be + // added to the sets. 
if (!ReadToChunkSet(&add_chunks_cache_, header.add_chunk_count, - file_.get()) || + file_.get(), &context) || !ReadToChunkSet(&sub_chunks_cache_, header.sub_chunk_count, - file_.get())) + file_.get(), &context)) return OnCorruptDatabase(); if (!ReadToVectorAndDelete(&add_prefixes, header.add_prefix_count, - file_.get(), add_del_cache_) || + file_.get(), &context, add_del_cache_) || !ReadToVectorAndDelete(&sub_prefixes, header.sub_prefix_count, - file_.get(), sub_del_cache_) || + file_.get(), &context, sub_del_cache_) || !ReadToVectorAndDelete(&add_full_hashes, header.add_hash_count, - file_.get(), add_del_cache_) || + file_.get(), &context, add_del_cache_) || !ReadToVectorAndDelete(&sub_full_hashes, header.sub_hash_count, - file_.get(), sub_del_cache_)) + file_.get(), &context, sub_del_cache_)) + return OnCorruptDatabase(); + + // Calculate the digest to this point. + MD5Digest calculated_digest; + MD5Final(&calculated_digest, &context); + + // Read the stored checksum and verify it. + MD5Digest file_digest; + if (!ReadArray(&file_digest, 1, file_.get(), NULL)) + return OnCorruptDatabase(); + if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest))) return OnCorruptDatabase(); // Close the file so we can later rename over it. @@ -347,11 +398,11 @@ bool SafeBrowsingStoreFile::DoUpdate( if (!FileRewind(new_file_.get())) return false; - // Append the accumulated chunks onto the vectors from file_. + // Append the accumulated chunks onto the vectors read from |file_|. for (int i = 0; i < chunks_written_; ++i) { ChunkHeader header; - if (!ReadArray(&header, 1, new_file_.get())) + if (!ReadArray(&header, 1, new_file_.get(), NULL)) return false; // TODO(shess): If the vectors were kept sorted, then this code @@ -362,13 +413,13 @@ bool SafeBrowsingStoreFile::DoUpdate( // chunks pairwise, merge those chunks pairwise, and so on, then // merge the result with the main list). 
if (!ReadToVectorAndDelete(&add_prefixes, header.add_prefix_count, - new_file_.get(), add_del_cache_) || + new_file_.get(), NULL, add_del_cache_) || !ReadToVectorAndDelete(&sub_prefixes, header.sub_prefix_count, - new_file_.get(), sub_del_cache_) || + new_file_.get(), NULL, sub_del_cache_) || !ReadToVectorAndDelete(&add_full_hashes, header.add_hash_count, - new_file_.get(), add_del_cache_) || + new_file_.get(), NULL, add_del_cache_) || !ReadToVectorAndDelete(&sub_full_hashes, header.sub_hash_count, - new_file_.get(), sub_del_cache_)) + new_file_.get(), NULL, sub_del_cache_)) return false; } @@ -388,15 +439,13 @@ bool SafeBrowsingStoreFile::DoUpdate( DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_); // Write the new data to new_file_. - // TODO(shess): If we receive a lot of subs relative to adds, - // overwriting the temporary chunk data in new_file_ with the - // permanent data could leave additional data at the end. Won't - // cause any problems, but does waste space. There is no truncate() - // for stdio. Could use ftruncate() or re-open the file. Or maybe - // ignore it, since we'll likely rewrite the file soon enough. if (!FileRewind(new_file_.get())) return false; + MD5Context context; + MD5Init(&context); + + // Write a file header. FileHeader header; header.magic = kFileMagic; header.version = kFileVersion; @@ -406,16 +455,26 @@ bool SafeBrowsingStoreFile::DoUpdate( header.sub_prefix_count = sub_prefixes.size(); header.add_hash_count = add_full_hashes.size(); header.sub_hash_count = sub_full_hashes.size(); - if (!WriteArray(&header, 1, new_file_.get())) + if (!WriteArray(&header, 1, new_file_.get(), &context)) return false; // Write all the chunk data. 
- if (!WriteChunkSet(add_chunks_cache_, new_file_.get()) || - !WriteChunkSet(sub_chunks_cache_, new_file_.get()) || - !WriteVector(add_prefixes, new_file_.get()) || - !WriteVector(sub_prefixes, new_file_.get()) || - !WriteVector(add_full_hashes, new_file_.get()) || - !WriteVector(sub_full_hashes, new_file_.get())) + if (!WriteChunkSet(add_chunks_cache_, new_file_.get(), &context) || + !WriteChunkSet(sub_chunks_cache_, new_file_.get(), &context) || + !WriteVector(add_prefixes, new_file_.get(), &context) || + !WriteVector(sub_prefixes, new_file_.get(), &context) || + !WriteVector(add_full_hashes, new_file_.get(), &context) || + !WriteVector(sub_full_hashes, new_file_.get(), &context)) + return false; + + // Write the checksum at the end. + MD5Digest digest; + MD5Final(&digest, &context); + if (!WriteArray(&digest, 1, new_file_.get(), NULL)) + return false; + + // Trim any excess left over from the temporary chunk data. + if (!file_util::TruncateFile(new_file_.get())) return false; // Close the file handle and swizzle the file into place. diff --git a/chrome/browser/safe_browsing/safe_browsing_store_file.h b/chrome/browser/safe_browsing/safe_browsing_store_file.h index 269067c..4e0cda9 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_file.h +++ b/chrome/browser/safe_browsing/safe_browsing_store_file.h @@ -20,12 +20,12 @@ // int32 version; // format version // // // Counts for the various data which follows the header. -// int32 add_chunk_count; // Chunks seen, including empties. -// int32 sub_chunk_count; // Ditto. -// int32 add_prefix_count; -// int32 sub_prefix_count; -// int32 add_hash_count; -// int32 sub_hash_count; +// uint32 add_chunk_count; // Chunks seen, including empties. +// uint32 sub_chunk_count; // Ditto. 
+// uint32 add_prefix_count; +// uint32 sub_prefix_count; +// uint32 add_hash_count; +// uint32 sub_hash_count; // // array[add_chunk_count] { // int32 chunk_id; @@ -44,17 +44,14 @@ // } // array[add_hash_count] { // int32 chunk_id; -// // From base::Time::ToTimeT(). This data should never last long -// // enough for 32 bits to be a problem. -// int32 received_time; +// int32 received_time; // From base::Time::ToTimeT(). // char[32] full_hash; // array[sub_hash_count] { // int32 chunk_id; // int32 add_chunk_id; // char[32] add_full_hash; // } -// TODO(shess): Would a checksum be worthwhile? If so, check at open, -// or at commit? +// MD5Digest checksum; // Checksum over preceeding data. // // During the course of an update, uncommitted data is stored in a // temporary file (which is later re-used to commit). This is an @@ -63,10 +60,10 @@ // the list of chunks seen omitted, as that data is tracked in-memory: // // array[] { -// int32 add_prefix_count; -// int32 sub_prefix_count; -// int32 add_hash_count; -// int32 sub_hash_count; +// uint32 add_prefix_count; +// uint32 sub_prefix_count; +// uint32 add_hash_count; +// uint32 sub_hash_count; // array[add_prefix_count] { // int32 chunk_id; // int32 prefix; @@ -78,13 +75,12 @@ // } // array[add_hash_count] { // int32 chunk_id; -// int32 prefix; -// int64 received_time; +// int32 received_time; // From base::Time::ToTimeT(). // char[32] full_hash; +// } // array[sub_hash_count] { // int32 chunk_id; // int32 add_chunk_id; -// int32 add_prefix; // char[32] add_full_hash; // } // } @@ -100,25 +96,12 @@ // - Rewind and write the buffers out to temp file. // - Delete original file. // - Rename temp file to original filename. -// -// TODO(shess): Does there need to be an fsync() before the rename? -// important_file_writer.h seems to think that -// http://valhenson.livejournal.com/37921.html means you don't, but I -// don't think it follows (and, besides, this needs to run on other -// operating systems). 
-// -// TODO(shess): Using a checksum to validate the file would allow -// correctness without fsync, at the cost of periodically needing to -// regenerate the database from scratch. -// TODO(shess): Regeneration could be moderated by saving the previous -// file, if valid, as a checkpoint. During update, if the current -// file is found to be invalid, rollback to the checkpoint and run the -// updat forward from there. This would require that the current file -// be validated at BeginUpdate() rather than FinishUpdate(), because -// the chunks-seen data may have changed. [Does this have -// implications for the pending_hashes, which were generated while -// using a newer bloom filter?] +// TODO(shess): By using a checksum, this code can avoid doing an +// fsync(), at the possible cost of more frequently retrieving the +// full dataset. Measure how often this occurs, and if it occurs too +// often, consider retaining the last known-good file for recovery +// purposes, rather than deleting it. 
class SafeBrowsingStoreFile : public SafeBrowsingStore { public: diff --git a/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc b/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc index 560a151..8e6ee61 100644 --- a/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc +++ b/chrome/browser/safe_browsing/safe_browsing_store_file_unittest.cc @@ -4,6 +4,7 @@ #include "chrome/browser/safe_browsing/safe_browsing_store_file.h" +#include "base/callback.h" #include "chrome/browser/safe_browsing/safe_browsing_store_unittest_helper.h" #include "chrome/test/file_test_utils.h" #include "testing/gtest/include/gtest/gtest.h" @@ -45,9 +46,14 @@ class SafeBrowsingStoreFileTest : public PlatformTest { PlatformTest::TearDown(); } + void OnCorruptionDetected() { + corruption_detected_ = true; + } + scoped_ptr<FileAutoDeleter> file_deleter_; FilePath filename_; scoped_ptr<SafeBrowsingStoreFile> store_; + bool corruption_detected_; }; TEST_STORE(SafeBrowsingStoreFileTest, store_.get(), filename_); @@ -77,6 +83,65 @@ TEST_F(SafeBrowsingStoreFileTest, DeleteTemp) { EXPECT_FALSE(file_util::PathExists(temp_file)); } -// TODO(shess): Test corruption-handling? +// Test basic corruption-handling. +TEST_F(SafeBrowsingStoreFileTest, DetectsCorruption) { + // Load a store with some data. + SafeBrowsingStoreTestStorePrefix(store_.get()); + + SafeBrowsingStoreFile test_store; + test_store.Init( + filename_, + NewCallback(static_cast<SafeBrowsingStoreFileTest*>(this), + &SafeBrowsingStoreFileTest::OnCorruptionDetected)); + + corruption_detected_ = false; + + // Can successfully open and read the store. 
+ std::vector<SBAddFullHash> pending_adds; + std::vector<SBAddPrefix> orig_prefixes; + std::vector<SBAddFullHash> orig_hashes; + EXPECT_TRUE(test_store.BeginUpdate()); + EXPECT_TRUE(test_store.FinishUpdate(pending_adds, + &orig_prefixes, &orig_hashes)); + EXPECT_GT(orig_prefixes.size(), 0U); + EXPECT_GT(orig_hashes.size(), 0U); + EXPECT_FALSE(corruption_detected_); + + // Corrupt the store. + file_util::ScopedFILE file(file_util::OpenFile(filename_, "rb+")); + const long kOffset = 60; + EXPECT_EQ(fseek(file.get(), kOffset, SEEK_SET), 0); + const int32 kZero = 0; + int32 previous = kZero; + EXPECT_EQ(fread(&previous, sizeof(previous), 1, file.get()), 1U); + EXPECT_NE(previous, kZero); + EXPECT_EQ(fseek(file.get(), kOffset, SEEK_SET), 0); + EXPECT_EQ(fwrite(&kZero, sizeof(kZero), 1, file.get()), 1U); + file.reset(); + + // Update fails and corruption callback is called. + std::vector<SBAddPrefix> add_prefixes; + std::vector<SBAddFullHash> add_hashes; + corruption_detected_ = false; + EXPECT_TRUE(test_store.BeginUpdate()); + EXPECT_FALSE(test_store.FinishUpdate(pending_adds, + &add_prefixes, &add_hashes)); + EXPECT_TRUE(corruption_detected_); + EXPECT_EQ(add_prefixes.size(), 0U); + EXPECT_EQ(add_hashes.size(), 0U); + + // Make it look like there is a lot of add-chunks-seen data. + const long kAddChunkCountOffset = 2 * sizeof(int32); + const int32 kLargeCount = 1000 * 1000 * 1000; + file.reset(file_util::OpenFile(filename_, "rb+")); + EXPECT_EQ(fseek(file.get(), kAddChunkCountOffset, SEEK_SET), 0); + EXPECT_EQ(fwrite(&kLargeCount, sizeof(kLargeCount), 1, file.get()), 1U); + file.reset(); + + // Detects corruption and fails to even begin the update. + corruption_detected_ = false; + EXPECT_FALSE(test_store.BeginUpdate()); + EXPECT_TRUE(corruption_detected_); +} } // namespace |