// Copyright (c) 2010 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_ #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_ #include #include #include "chrome/browser/safe_browsing/safe_browsing_store.h" #include "base/file_util.h" // Implement SafeBrowsingStore in terms of a flat file. The file // format is pretty literal: // // int32 magic; // magic number "validating" file // int32 version; // format version // // // Counts for the various data which follows the header. // int32 add_chunk_count; // Chunks seen, including empties. // int32 sub_chunk_count; // Ditto. // int32 add_prefix_count; // int32 sub_prefix_count; // int32 add_hash_count; // int32 sub_hash_count; // // array[add_chunk_count] { // int32 chunk_id; // } // array[sub_chunk_count] { // int32 chunk_id; // } // array[add_prefix_count] { // int32 chunk_id; // int32 prefix; // } // array[sub_prefix_count] { // int32 chunk_id; // int32 add_chunk_id; // int32 add_prefix; // } // array[add_hash_count] { // int32 chunk_id; // // TODO(shess): This duplicates first four bytes of full_hash! // int32 prefix; // // From base::Time::ToTimeT(). // // TODO(shess): an int32 probably has enough resolution. // int64 received_time; // char[32] full_hash; // array[sub_hash_count] { // int32 chunk_id; // int32 add_chunk_id; // int32 add_prefix; // char[32] add_full_hash; // } // TODO(shess): Would a checksum be worthwhile? If so, check at open, // or at commit? // // During the course of an update, uncommitted data is stored in a // temporary file (which is later re-used to commit). This is an // array of chunks, with the count kept in memory until the end of the // transaction. The format of this file is like the main file, with // the list of chunks seen omitted, as that data is tracked in-memory: // // array[] { // int32 add_prefix_count; // int32 sub_prefix_count; // int32 add_hash_count; // int32 sub_hash_count; // array[add_prefix_count] { // int32 chunk_id; // int32 prefix; // } // array[sub_prefix_count] { // int32 chunk_id; // int32 add_chunk_id; // int32 add_prefix; // } // array[add_hash_count] { // int32 chunk_id; // int32 prefix; // int64 received_time; // char[32] full_hash; // array[sub_hash_count] { // int32 chunk_id; // int32 add_chunk_id; // int32 add_prefix; // char[32] add_full_hash; // } // } // // The overall transaction works like this: // - Open the original file to get the chunks-seen data. // - Open a temp file for storing new chunk info. // - Write new chunks to the temp file. // - When the transaction is finished: // - Read the rest of the original file's data into buffers. // - Rewind the temp file and merge the new data into buffers. // - Process buffers for deletions and apply subs. // - Rewind and write the buffers out to temp file. // - Delete original file. // - Rename temp file to original filename. // // TODO(shess): Does there need to be an fsync() before the rename? // important_file_writer.h seems to think that // http://valhenson.livejournal.com/37921.html means you don't, but I // don't think it follows (and, besides, this needs to run on other // operating systems). // // TODO(shess): Using a checksum to validate the file would allow // correctness without fsync, at the cost of periodically needing to // regenerate the database from scratch. // TODO(shess): Regeneration could be moderated by saving the previous // file, if valid, as a checkpoint. During update, if the current // file is found to be invalid, rollback to the checkpoint and run the // updat forward from there. This would require that the current file // be validated at BeginUpdate() rather than FinishUpdate(), because // the chunks-seen data may have changed. [Does this have // implications for the pending_hashes, which were generated while // using a newer bloom filter?] class SafeBrowsingStoreFile : public SafeBrowsingStore { public: SafeBrowsingStoreFile(); virtual ~SafeBrowsingStoreFile(); virtual void Init(const FilePath& filename, Callback0::Type* corruption_callback); // Delete any on-disk files, including the permanent storage. virtual bool Delete(); virtual bool BeginChunk() { return ClearChunkBuffers(); } virtual bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) { add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix)); return true; } virtual bool WriteAddHash(int32 chunk_id, SBPrefix prefix, base::Time receive_time, SBFullHash full_hash) { add_hashes_.push_back( SBAddFullHash(chunk_id, prefix, receive_time, full_hash)); return true; } virtual bool WriteSubPrefix(int32 chunk_id, int32 add_chunk_id, SBPrefix prefix) { sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix)); return true; } virtual bool WriteSubHash(int32 chunk_id, int32 add_chunk_id, SBPrefix prefix, SBFullHash full_hash) { sub_hashes_.push_back( SBSubFullHash(chunk_id, add_chunk_id, prefix, full_hash)); return true; } virtual bool FinishChunk(); virtual bool BeginUpdate(); virtual bool DoUpdate(const std::vector& pending_adds, std::vector* add_prefixes_result, std::vector* add_full_hashes_result); virtual bool FinishUpdate(const std::vector& pending_adds, std::vector* add_prefixes_result, std::vector* add_full_hashes_result); virtual bool CancelUpdate(); virtual void SetAddChunk(int32 chunk_id) { add_chunks_cache_.insert(chunk_id); } virtual bool CheckAddChunk(int32 chunk_id) { return add_chunks_cache_.count(chunk_id) > 0; } virtual void GetAddChunks(std::vector* out) { out->clear(); out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end()); } virtual void SetSubChunk(int32 chunk_id) { sub_chunks_cache_.insert(chunk_id); } virtual bool CheckSubChunk(int32 chunk_id) { return sub_chunks_cache_.count(chunk_id) > 0; } virtual void GetSubChunks(std::vector* out) { out->clear(); out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end()); } virtual void DeleteAddChunk(int32 chunk_id) { add_del_cache_.insert(chunk_id); } virtual void DeleteSubChunk(int32 chunk_id) { sub_del_cache_.insert(chunk_id); } // Returns the name of the temporary file used to buffer data for // |filename|. Exported for unit tests. static const FilePath TemporaryFileForFilename(const FilePath& filename) { return FilePath(filename.value() + FILE_PATH_LITERAL("_new")); } private: // Close all files and clear all buffers. bool Close(); // Helpers to read/write the various data sets. Excepting // ReadChunksToSet(), which is called too early, the readers skip // items from deleted chunks (listed in add_del_cache_ and // sub_del_cache_). bool ReadChunksToSet(FILE* fp, std::set* chunks, int count); bool WriteChunksFromSet(const std::set& chunks); bool ReadAddPrefixes(FILE* fp, std::vector* add_prefixes, int count); bool WriteAddPrefixes(const std::vector& add_prefixes); bool ReadSubPrefixes(FILE* fp, std::vector* sub_prefixes, int count); bool WriteSubPrefixes(std::vector& sub_prefixes); bool ReadAddHashes(FILE* fp, std::vector* add_hashes, int count); bool WriteAddHashes(const std::vector& add_hashes); bool ReadSubHashes(FILE* fp, std::vector* sub_hashes, int count); bool WriteSubHashes(std::vector& sub_hashes); // Calls |corruption_callback_| if non-NULL, always returns false as // a convenience to the caller. bool OnCorruptDatabase(); // Clear temporary buffers used to accumulate chunk data. bool ClearChunkBuffers() { // NOTE: .clear() doesn't release memory. // TODO(shess): Figure out if this is overkill. Some amount of // pre-reserved space is probably reasonable between each chunk // collected. std::vector().swap(add_prefixes_); std::vector().swap(sub_prefixes_); std::vector().swap(add_hashes_); std::vector().swap(sub_hashes_); return true; } // Clear all buffers used during update. void ClearUpdateBuffers() { ClearChunkBuffers(); chunks_written_ = 0; std::set().swap(add_chunks_cache_); std::set().swap(sub_chunks_cache_); base::hash_set().swap(add_del_cache_); base::hash_set().swap(sub_del_cache_); } // Buffers for collecting data between BeginChunk() and // FinishChunk(). std::vector add_prefixes_; std::vector sub_prefixes_; std::vector add_hashes_; std::vector sub_hashes_; // Count of chunks collected in |new_file_|. int chunks_written_; // Name of the main database file. FilePath filename_; // Handles to the main and scratch files. |empty_| is true if the // main file didn't exist when the update was started. file_util::ScopedFILE file_; file_util::ScopedFILE new_file_; bool empty_; // Cache of chunks which have been seen. Loaded from the database // on BeginUpdate() so that it can be queried during the // transaction. std::set add_chunks_cache_; std::set sub_chunks_cache_; // Cache the set of deleted chunks during a transaction, applied on // FinishUpdate(). // TODO(shess): If the set is small enough, hash_set<> might be // slower than plain set<>. base::hash_set add_del_cache_; base::hash_set sub_del_cache_; scoped_ptr corruption_callback_; DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile); }; #endif // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_