Diffstat (limited to 'sdch/open-vcdiff/src/blockhash.cc')
-rw-r--r-- | sdch/open-vcdiff/src/blockhash.cc | 440
1 file changed, 440 insertions, 0 deletions
diff --git a/sdch/open-vcdiff/src/blockhash.cc b/sdch/open-vcdiff/src/blockhash.cc
new file mode 100644
index 0000000..76461cc
--- /dev/null
+++ b/sdch/open-vcdiff/src/blockhash.cc
@@ -0,0 +1,440 @@

// Copyright 2006, 2008 Google Inc.
// Authors: Chandra Chereddi, Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <config.h>
#include "blockhash.h"
#include "compile_assert.h"
#include <stdint.h>  // uint32_t
#include <string.h>  // memcpy, memcmp
#include <algorithm>  // std::min, used by FindBestMatch below
#include "logging.h"
#include "rolling_hash.h"

namespace open_vcdiff {

typedef unsigned long uword_t;  // a machine word  NOLINT

BlockHash::BlockHash(const char* source_data,
                     size_t source_size,
                     int starting_offset)
    : source_data_(source_data),
      source_size_(source_size),
      hash_table_mask_(0),
      starting_offset_(starting_offset),
      last_block_added_(-1) { }

BlockHash::~BlockHash() { }

// kBlockSize must be at least 2 to be meaningful.  Since it's a compile-time
// constant, check its value at compile time rather than wasting CPU cycles
// on runtime checks.
COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);

// kBlockSize is required to be a power of 2 because multiplication
// (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
// are commonly used operations.  If kBlockSize is a compile-time
// constant and a power of 2, the compiler can convert these three operations
// into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
// more efficient than executing full integer multiply, divide, or remainder
// instructions.
COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
               kBlockSize_must_be_a_power_of_2);
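The arithmetic claim in the comment above is easy to verify in isolation. The following is a standalone illustration (not part of blockhash.cc), assuming a block size of 16, whose log2 is 4:

#include <cassert>
#include <cstddef>

int main() {
  const size_t kBlockSize = 16;  // any power of 2; 16 = 1 << 4
  for (size_t n = 0; n < 1000; ++n) {
    // Multiplication, division and MOD by a power of 2 reduce to
    // shifts and a bitwise AND, which is the transformation the
    // compiler performs when kBlockSize is a compile-time constant.
    assert(n * kBlockSize == n << 4);
    assert(n / kBlockSize == n >> 4);
    assert(n % kBlockSize == (n & (kBlockSize - 1)));
  }
  return 0;
}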
bool BlockHash::Init(bool populate_hash_table) {
  if (!hash_table_.empty() ||
      !next_block_table_.empty() ||
      !last_block_table_.empty()) {
    LOG(DFATAL) << "Init() called twice for same BlockHash object"
                << LOG_ENDL;
    return false;
  }
  const size_t table_size = CalcTableSize(source_size_);
  if (table_size == 0) {
    LOG(DFATAL) << "Error finding table size for source size " << source_size_
                << LOG_ENDL;
    return false;
  }
  // Since table_size is a power of 2, (table_size - 1) is a bit mask
  // containing all the bits below table_size.
  hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
  hash_table_.resize(table_size, -1);
  next_block_table_.resize(GetNumberOfBlocks(), -1);
  last_block_table_.resize(GetNumberOfBlocks(), -1);
  if (populate_hash_table) {
    AddAllBlocks();
  }
  return true;
}

const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
                                                 size_t dictionary_size) {
  BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
                                                 dictionary_size,
                                                 0);
  if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
    delete new_dictionary_hash;
    return NULL;
  } else {
    return new_dictionary_hash;
  }
}

BlockHash* BlockHash::CreateTargetHash(const char* target_data,
                                       size_t target_size,
                                       size_t dictionary_size) {
  BlockHash* new_target_hash = new BlockHash(target_data,
                                             target_size,
                                             static_cast<int>(dictionary_size));
  if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
    delete new_target_hash;
    return NULL;
  } else {
    return new_target_hash;
  }
}

// Returns zero if an error occurs.
size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
  // Overallocate the hash table by making it the same size (in bytes)
  // as the source data.  This is a trade-off between space and time:
  // the empty entries in the hash table will reduce the
  // probability of a hash collision to (sizeof(int) / kBlockSize),
  // and so save time comparing false matches.
  const size_t min_size = (dictionary_size / sizeof(int)) + 1;  // NOLINT
  size_t table_size = 1;
  // Find the smallest power of 2 that is >= min_size, and assign
  // that value to table_size.
  while (table_size < min_size) {
    table_size <<= 1;
    // Guard against an infinite loop
    if (table_size <= 0) {
      LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
                  << dictionary_size
                  << "): resulting table_size " << table_size
                  << " is zero or negative" << LOG_ENDL;
      return 0;
    }
  }
  // Check size sanity
  if ((table_size & (table_size - 1)) != 0) {
    LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
                << dictionary_size
                << "): resulting table_size " << table_size
                << " is not a power of 2" << LOG_ENDL;
    return 0;
  }
  // The loop above tries to find the smallest power of 2 that is >= min_size.
  // That value must lie somewhere between min_size and (min_size * 2),
  // except for the case (dictionary_size == 0, table_size == 1).
  if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
    LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
                << dictionary_size
                << "): resulting table_size " << table_size
                << " is too large" << LOG_ENDL;
    return 0;
  }
  return table_size;
}
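A quick way to sanity-check the sizing rule is to restate it outside the class. The sketch below is illustrative only (calc_table_size is a name invented here, not the library's API), and the values in the comment assume 4-byte ints:

#include <cstdio>
#include <cstddef>

// Standalone restatement of CalcTableSize's sizing rule: the smallest
// power of 2 that is >= (dictionary_size / sizeof(int)) + 1.
static size_t calc_table_size(size_t dictionary_size) {
  const size_t min_size = (dictionary_size / sizeof(int)) + 1;
  size_t table_size = 1;
  while (table_size < min_size) {
    table_size <<= 1;
  }
  return table_size;
}

int main() {
  // With 4-byte ints: 0 -> 1, 100 -> 32, 1000 -> 256, 100000 -> 32768.
  const size_t sizes[] = { 0, 100, 1000, 100000 };
  for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
    printf("dictionary %6lu bytes -> %6lu hash buckets\n",
           (unsigned long)sizes[i], (unsigned long)calc_table_size(sizes[i]));
  }
  return 0;
}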
// If the hash value is already available from the rolling hash,
// call this function to save time.
void BlockHash::AddBlock(uint32_t hash_value) {
  if (hash_table_.empty()) {
    LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"
                << LOG_ENDL;
    return;
  }
  // The initial value of last_block_added_ is -1.
  int block_number = last_block_added_ + 1;
  const int total_blocks =
      static_cast<int>(source_size_ / kBlockSize);  // round down
  if (block_number >= total_blocks) {
    LOG(DFATAL) << "BlockHash::AddBlock() called"
                   " with block number " << block_number
                << " that is past last block " << (total_blocks - 1)
                << LOG_ENDL;
    return;
  }
  if (next_block_table_[block_number] != -1) {
    LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
                   "block number = " << block_number
                << ", next block should be -1 but is "
                << next_block_table_[block_number] << LOG_ENDL;
    return;
  }
  const uint32_t hash_table_index = GetHashTableIndex(hash_value);
  const int first_matching_block = hash_table_[hash_table_index];
  if (first_matching_block < 0) {
    // This is the first entry with this hash value
    hash_table_[hash_table_index] = block_number;
    last_block_table_[block_number] = block_number;
  } else {
    // Add this entry at the end of the chain of matching blocks
    const int last_matching_block = last_block_table_[first_matching_block];
    if (next_block_table_[last_matching_block] != -1) {
      LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
                     "first matching block = " << first_matching_block
                  << ", last matching block = " << last_matching_block
                  << ", next block should be -1 but is "
                  << next_block_table_[last_matching_block] << LOG_ENDL;
      return;
    }
    next_block_table_[last_matching_block] = block_number;
    last_block_table_[first_matching_block] = block_number;
  }
  last_block_added_ = block_number;
}

void BlockHash::AddAllBlocks() {
  AddAllBlocksThroughIndex(static_cast<int>(source_size_));
}

void BlockHash::AddAllBlocksThroughIndex(int end_index) {
  if (end_index > static_cast<int>(source_size_)) {
    LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
                   " with index " << end_index
                << " higher than end index " << source_size_ << LOG_ENDL;
    return;
  }
  const int last_index_added = last_block_added_ * kBlockSize;
  if (end_index <= last_index_added) {
    LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
                   " with index " << end_index
                << " <= last index added (" << last_index_added
                << ")" << LOG_ENDL;
    return;
  }
  int end_limit = end_index;
  // Don't allow reading any indices at or past source_size_.
  // The Hash function extends (kBlockSize - 1) bytes past the index,
  // so leave a margin of that size.
  int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
  if (end_limit > last_legal_hash_index) {
    end_limit = last_legal_hash_index + 1;
  }
  const char* block_ptr = source_data() + NextIndexToAdd();
  const char* const end_ptr = source_data() + end_limit;
  while (block_ptr < end_ptr) {
    AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
    block_ptr += kBlockSize;
  }
}
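AddBlock maintains, in effect, a hash table of linked lists: hash_table_ holds the head block of each bucket's chain, next_block_table_ holds the forward links, and last_block_table_ (indexed by the chain's head block) remembers the tail, so appending never walks the chain. A toy model of that structure, with all names invented here for illustration, behaves like this:

#include <cstdio>
#include <vector>

// Toy model (not the library's API) of the three parallel tables:
//   first_[bucket] -> first block number in that bucket's chain, or -1
//   next_[block]   -> next block in the same chain, or -1
//   last_[head]    -> last block of the chain starting at `head`,
//                     giving O(1) appends.
struct ToyChains {
  std::vector<int> first_, next_, last_;
  ToyChains(int buckets, int blocks)
      : first_(buckets, -1), next_(blocks, -1), last_(blocks, -1) { }

  void Add(int bucket, int block) {
    if (first_[bucket] < 0) {
      first_[bucket] = block;
      last_[block] = block;        // chain of one: head is also tail
    } else {
      const int head = first_[bucket];
      next_[last_[head]] = block;  // append after the current tail
      last_[head] = block;         // record the new tail at the head
    }
  }
};

int main() {
  ToyChains chains(4, 8);
  // Blocks 1, 3 and 6 all hash to bucket 2.
  chains.Add(2, 1);
  chains.Add(2, 3);
  chains.Add(2, 6);
  for (int b = chains.first_[2]; b >= 0; b = chains.next_[b]) {
    printf("candidate block %d\n", b);  // prints 1, 3, 6 in order
  }
  return 0;
}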
COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
               kBlockSize_must_be_a_multiple_of_machine_word_size);

// A recursive template to compare a fixed number
// of (possibly unaligned) machine words starting
// at addresses block1 and block2.  Returns true or false
// depending on whether an exact match was found.
template<int number_of_words>
inline bool CompareWholeWordValues(const char* block1,
                                   const char* block2) {
  return CompareWholeWordValues<1>(block1, block2) &&
         CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
                                                     block2 + sizeof(uword_t));
}

// The base of the recursive template: compare one pair of machine words.
template<>
inline bool CompareWholeWordValues<1>(const char* word1,
                                      const char* word2) {
  uword_t aligned_word1, aligned_word2;
  memcpy(&aligned_word1, word1, sizeof(aligned_word1));
  memcpy(&aligned_word2, word2, sizeof(aligned_word2));
  return aligned_word1 == aligned_word2;
}

// A block must be composed of an integral number of machine words
// (uword_t values).  This function takes advantage of that fact
// by comparing the blocks as series of (possibly unaligned) word values.
// A word-sized comparison can be performed as a single
// machine instruction.  Comparing words instead of bytes means that,
// on a 64-bit platform, this function will use 8 times fewer test-and-branch
// instructions than a byte-by-byte comparison.  Even with the extra
// cost of the calls to memcpy, this method is still at least twice as fast
// as memcmp (measured using gcc on a 64-bit platform, with a block size
// of 32).  For blocks with identical contents (a common case), this method
// is over six times faster than memcmp.
inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
  static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
  return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
}

bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
  return BlockCompareWordsInline(block1, block2);
}

inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
  // Optimize for mismatch in the first byte.  Since this function is called
  // only when the hash values of the two blocks match, it is very likely
  // that either the blocks are identical, or else the first byte does
  // not match.
  if (*block1 != *block2) {
    return false;
  }
#ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
  return BlockCompareWordsInline(block1, block2);
#else  // !VCDIFF_USE_BLOCK_COMPARE_WORDS
  return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
#endif  // VCDIFF_USE_BLOCK_COMPARE_WORDS
}

bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
  return BlockContentsMatchInline(block1, block2);
}

inline int BlockHash::SkipNonMatchingBlocks(int block_number,
                                            const char* block_ptr) const {
  int probes = 0;
  while ((block_number >= 0) &&
         !BlockContentsMatchInline(block_ptr,
                                   &source_data_[block_number * kBlockSize])) {
    if (++probes > kMaxProbes) {
      return -1;  // Avoid too much chaining
    }
    block_number = next_block_table_[block_number];
  }
  return block_number;
}

// Init() must have been called and returned true before using
// FirstMatchingBlock or NextMatchingBlock.  No check is performed
// for this condition; the code will crash if this condition is violated.
inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
                                               const char* block_ptr) const {
  return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
                               block_ptr);
}

int BlockHash::FirstMatchingBlock(uint32_t hash_value,
                                  const char* block_ptr) const {
  return FirstMatchingBlockInline(hash_value, block_ptr);
}

int BlockHash::NextMatchingBlock(int block_number,
                                 const char* block_ptr) const {
  if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
    LOG(DFATAL) << "NextMatchingBlock called for invalid block number "
                << block_number << LOG_ENDL;
    return -1;
  }
  return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
}
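The word-at-a-time comparison can be tried outside the library. The sketch below is illustrative only: it uses a plain loop where the code above uses template recursion, and BlocksEqual is a name invented here. memcpy into a local word keeps the loads alignment-safe while typically compiling down to a single load:

#include <cassert>
#include <cstring>

typedef unsigned long word_t;

// Compare two kSize-byte blocks one machine word at a time.
// kSize must be a multiple of sizeof(word_t), mirroring the
// COMPILE_ASSERT above.
template <size_t kSize>
bool BlocksEqual(const char* a, const char* b) {
  for (size_t i = 0; i < kSize; i += sizeof(word_t)) {
    word_t wa, wb;
    memcpy(&wa, a + i, sizeof(wa));
    memcpy(&wb, b + i, sizeof(wb));
    if (wa != wb) return false;
  }
  return true;
}

int main() {
  char x[32], y[32];
  memset(x, 'A', sizeof(x));
  memset(y, 'A', sizeof(y));
  // Cross-check the word-wise result against memcmp.
  assert(BlocksEqual<32>(x, y) && memcmp(x, y, 32) == 0);
  y[31] = 'B';  // differ only in the last byte
  assert(!BlocksEqual<32>(x, y) && memcmp(x, y, 32) != 0);
  return 0;
}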
// Keep a count of the number of matches found.  This will throttle the
// number of iterations in FindBestMatch.  For example, if the entire
// dictionary is made up of spaces (' ') and the search string is also
// made up of spaces, there will be one match for each block in the
// dictionary.
inline bool BlockHash::TooManyMatches(int* match_counter) {
  ++(*match_counter);
  return (*match_counter) > kMaxMatchesToCheck;
}

// Returns the number of bytes to the left of source_match_start
// that match the corresponding bytes to the left of target_match_start.
// Will not examine more than max_bytes bytes, which is to say that
// the return value will be in the range [0, max_bytes] inclusive.
int BlockHash::MatchingBytesToLeft(const char* source_match_start,
                                   const char* target_match_start,
                                   int max_bytes) {
  const char* source_ptr = source_match_start;
  const char* target_ptr = target_match_start;
  int bytes_found = 0;
  while (bytes_found < max_bytes) {
    --source_ptr;
    --target_ptr;
    if (*source_ptr != *target_ptr) {
      break;
    }
    ++bytes_found;
  }
  return bytes_found;
}

// Returns the number of bytes starting at source_match_end
// that match the corresponding bytes starting at target_match_end.
// Will not examine more than max_bytes bytes, which is to say that
// the return value will be in the range [0, max_bytes] inclusive.
int BlockHash::MatchingBytesToRight(const char* source_match_end,
                                    const char* target_match_end,
                                    int max_bytes) {
  const char* source_ptr = source_match_end;
  const char* target_ptr = target_match_end;
  int bytes_found = 0;
  while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
    ++bytes_found;
    ++source_ptr;
    ++target_ptr;
  }
  return bytes_found;
}
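A short trace makes the two helpers concrete. MatchLeft and MatchRight below are illustrative restatements written for this example, not the library's code:

#include <cassert>

// Extend a match backward/forward, capped at max_bytes, returning
// a count in [0, max_bytes] like the two methods above.
static int MatchLeft(const char* source, const char* target, int max_bytes) {
  int n = 0;
  while (n < max_bytes && *(source - n - 1) == *(target - n - 1)) ++n;
  return n;
}

static int MatchRight(const char* source, const char* target, int max_bytes) {
  int n = 0;
  while (n < max_bytes && source[n] == target[n]) ++n;
  return n;
}

int main() {
  const char* source = "my dog barks";    // 4-byte block "dog " at offset 3
  const char* target = "our dog barked";  // same block at offset 4
  // Left of the block, only the space matches ('y' vs 'r' stops the scan).
  assert(MatchLeft(source + 3, target + 4, 3) == 1);
  // Right of the block, "bark" matches ('s' vs 'e' stops the scan).
  assert(MatchRight(source + 7, target + 8, 5) == 4);
  return 0;
}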
// No NULL checks are performed on the pointer arguments.  The caller
// must guarantee that none of the arguments is NULL, or a crash will occur.
//
// The vast majority of calls to FindBestMatch enter the loop *zero* times,
// which is to say that most candidate blocks find no matches in the
// dictionary.  The important sections for optimization are therefore the
// code outside the loop and the code within the loop conditions, both of
// which should be kept to a minimum.
void BlockHash::FindBestMatch(uint32_t hash_value,
                              const char* target_candidate_start,
                              const char* target_start,
                              size_t target_size,
                              Match* best_match) const {
  int match_counter = 0;
  for (int block_number = FirstMatchingBlockInline(hash_value,
                                                   target_candidate_start);
       (block_number >= 0) && !TooManyMatches(&match_counter);
       block_number = NextMatchingBlock(block_number,
                                        target_candidate_start)) {
    int source_match_offset = block_number * kBlockSize;
    const int source_match_end = source_match_offset + kBlockSize;

    int target_match_offset =
        static_cast<int>(target_candidate_start - target_start);
    const int target_match_end = target_match_offset + kBlockSize;

    size_t match_size = kBlockSize;
    {
      // Extend match start towards beginning of unencoded data
      const int limit_bytes_to_left = std::min(source_match_offset,
                                               target_match_offset);
      const int matching_bytes_to_left =
          MatchingBytesToLeft(source_data_ + source_match_offset,
                              target_start + target_match_offset,
                              limit_bytes_to_left);
      source_match_offset -= matching_bytes_to_left;
      target_match_offset -= matching_bytes_to_left;
      match_size += matching_bytes_to_left;
    }
    {
      // Extend match end towards end of unencoded data
      const size_t source_bytes_to_right = source_size_ - source_match_end;
      const size_t target_bytes_to_right = target_size - target_match_end;
      const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
                                                   target_bytes_to_right);
      match_size +=
          MatchingBytesToRight(source_data_ + source_match_end,
                               target_start + target_match_end,
                               static_cast<int>(limit_bytes_to_right));
    }
    // Update the in/out parameter if the match just found is better
    // than any match already stored in *best_match.
    best_match->ReplaceIfBetterMatch(match_size,
                                     source_match_offset + starting_offset_,
                                     target_match_offset);
  }
}

}  // namespace open_vcdiff
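Putting the pieces together, the search amounts to: hash the candidate block, walk that bucket's chain of source blocks with identical contents, grow each block-sized match in both directions, and keep the longest result. A compressed standalone sketch of that control flow, using none of the library's types and names invented here (exact content comparison stands in for the hash-bucket chain), might look like:

#include <cstdio>
#include <cstring>
#include <string>

static const int kBlock = 4;

// Grow a kBlock-byte seed match at src[s_off] / tgt[t_off] in both
// directions and return its final length (toy stand-in for the
// extend-left/extend-right steps in FindBestMatch).
static int Extend(const std::string& src, const std::string& tgt,
                  int& s_off, int t_off) {
  int size = kBlock;
  while (s_off > 0 && t_off > 0 && src[s_off - 1] == tgt[t_off - 1]) {
    --s_off; --t_off; ++size;
  }
  while (s_off + size < (int)src.size() && t_off + size < (int)tgt.size() &&
         src[s_off + size] == tgt[t_off + size]) {
    ++size;
  }
  return size;
}

int main() {
  const std::string src = "abcdabcd more stuff";
  const std::string tgt = "see: abcd more";
  const int t_off = 5;  // candidate block "abcd" in the target
  // Stand-in for the hash-bucket chain: every aligned source block
  // whose bytes equal the candidate block.
  int best_size = 0, best_src = -1;
  for (int b = 0; (b + 1) * kBlock <= (int)src.size(); ++b) {
    if (memcmp(&src[b * kBlock], &tgt[t_off], kBlock) != 0) continue;
    int s_off = b * kBlock;
    const int size = Extend(src, tgt, s_off, t_off);
    if (size > best_size) { best_size = size; best_src = s_off; }
  }
  // The second block in the chain wins: "abcd more" (9 bytes) at
  // source offset 4.
  printf("best: %d bytes at source offset %d\n", best_size, best_src);
  return 0;
}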