1 files changed, 237 insertions, 0 deletions
diff --git a/sdch/open-vcdiff/src/rolling_hash.h b/sdch/open-vcdiff/src/rolling_hash.h
new file mode 100644
index 0000000..c757c83
--- /dev/null
+++ b/sdch/open-vcdiff/src/rolling_hash.h
@@ -0,0 +1,237 @@
+// Copyright 2007, 2008 Google Inc.
+// Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef OPEN_VCDIFF_ROLLING_HASH_H_
+#define OPEN_VCDIFF_ROLLING_HASH_H_
+
+#include <config.h>
+#include <stdint.h>  // uint32_t
+#include "logging.h"
+
+namespace open_vcdiff {
+
+// Rabin-Karp hasher module -- this is a faster version with different
+// constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
+// close enough for most applications.
+
+// Definitions common to all hash window sizes.
+class RollingHashUtil {
+ public:
+  // Multiplier for incremental hashing.  The compiler should be smart enough to
+  // convert (val * kMult) into ((val << 8) + val).
+  static const uint32_t kMult = 257;
+
+  // All hashes are returned modulo "kBase".  Current implementation requires
+  // kBase <= 2^32/kMult to avoid overflow.  Also, kBase must be a power of two
+  // so that we can compute modulus efficiently.
+  static const uint32_t kBase = (1 << 23);
+
+  // Returns operand % kBase, assuming that kBase is a power of two.
+  static inline uint32_t ModBase(uint32_t operand) {
+    return operand & (kBase - 1);
+  }
+
+  // Given an unsigned integer "operand", returns an unsigned integer "result"
+  // such that
+  //     result < kBase
+  // and
+  //     ModBase(operand + result) == 0
+  static inline uint32_t FindModBaseInverse(uint32_t operand) {
+    // The subtraction (0 - operand) produces an unsigned underflow for any
+    // operand except 0.  The underflow results in a (very large) unsigned
+    // number.  Binary subtraction is used instead of unary negation because
+    // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
+    // value is negated.
+    //
+    // The C++ mod operation (operand % kBase) may produce different results for
+    // different compilers if operand is negative.  That is not a problem in
+    // this case, since all numbers used are unsigned, and ModBase does its work
+    // using bitwise arithmetic rather than the % operator.
+    return ModBase(uint32_t(0) - operand);
+  }
+
+  // Here's the heart of the hash algorithm.  Start with a partial_hash value of
+  // 0, and run this HashStep once against each byte in the data window to be
+  // hashed.  The result will be the hash value for the entire data window.  The
+  // Hash() function, below, does exactly this, albeit with some refinements.
+  static inline uint32_t HashStep(uint32_t partial_hash,
+                                  unsigned char next_byte) {
+    return ModBase((partial_hash * kMult) + next_byte);
+  }
+
+  // Use this function to start computing a new hash value based on the first
+  // two bytes in the window.  It is equivalent to calling
+  //     HashStep(HashStep(0, ptr[0]), ptr[1])
+  // but takes advantage of the fact that the maximum value of
+  // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
+  // avoiding an unnecessary ModBase operation.
+  static inline uint32_t HashFirstTwoBytes(const char* ptr) {
+    return (static_cast<unsigned char>(ptr[0]) * kMult)
+        + static_cast<unsigned char>(ptr[1]);
+  }
+ private:
+  // Making these private avoids copy constructor and assignment operator.
+  // No objects of this type should be constructed.
+  RollingHashUtil();
+  RollingHashUtil(const RollingHashUtil&);  // NOLINT
+  void operator=(const RollingHashUtil&);
+};
+
+// window_size must be >= 2.
+template<int window_size>
+class RollingHash {
+ public:
+  // Perform global initialization that is required in order to instantiate a
+  // RollingHash.  This function *must* be called (preferably on startup) by any
+  // program that uses a RollingHash.  It is harmless to call this function more
+  // than once.  It is not thread-safe, but calling it from two different
+  // threads at the same time can only cause a memory leak, not incorrect
+  // behavior.  Make sure to call it before spawning any threads that could use
+  // RollingHash.  The function returns true if initialization succeeds, or
+  // false if initialization fails, in which case the caller should not proceed
+  // to construct any objects of type RollingHash.
+  static bool Init();
+
+  // Initialize hasher to maintain a window of the specified size.  You need an
+  // instance of this type to use UpdateHash(), but Hash() does not depend on
+  // remove_table_, so it is static.
+  RollingHash() {
+    if (!remove_table_) {
+      LOG(DFATAL) << "RollingHash object instantiated"
+                     " before calling RollingHash::Init()" << LOG_ENDL;
+    }
+  }
+
+  // Compute a hash of the window "ptr[0, window_size - 1]".
+  static uint32_t Hash(const char* ptr) {
+    uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
+    for (int i = 2; i < window_size; ++i) {
+      h = RollingHashUtil::HashStep(h, ptr[i]);
+    }
+    return h;
+  }
+
+  // Update a hash by removing the oldest byte and adding a new byte.
+  //
+  // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
+  // along with the value of buffer[0] (the "old_first_byte" argument)
+  // and the value of buffer[window_size] (the "new_last_byte" argument).
+  // It quickly computes the hash value of buffer[1] ... buffer[window_size]
+  // without having to run Hash() on the entire window.
+  //
+  // The larger the window, the more advantage comes from using UpdateHash()
+  // (which runs in time independent of window_size) instead of Hash().
+  // Each time window_size doubles, the time to execute Hash() also doubles,
+  // while the time to execute UpdateHash() remains constant.  Empirical tests
+  // have borne out this statement.
+  uint32_t UpdateHash(uint32_t old_hash,
+                      const char old_first_byte,
+                      const char new_last_byte) const {
+    uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
+    return RollingHashUtil::HashStep(partial_hash, new_last_byte);
+  }
+
+ protected:
+  // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
+  // value of the first byte buffer[0], this function returns a *partial* hash
+  // value for buffer[1] ... buffer[window_size -1].  See the comments in
+  // Init(), below, for a description of how the contents of remove_table_ are
+  // computed.
+  static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
+                                          unsigned char first_byte) {
+    return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
+  }
+
+ private:
+  // We keep a table that maps from any byte "b" to
+  //    (- b * pow(kMult, window_size - 1)) % kBase
+  static const uint32_t* remove_table_;
+};
+
+// For each window_size, fill a 256-entry table such that
+//        the hash value of buffer[0] ... buffer[window_size - 1]
+//      + remove_table_[buffer[0]]
+//     == the hash value of buffer[1] ... buffer[window_size - 1]
+// See the comments in Init(), below, for a description of how the contents of
+// remove_table_ are computed.
+template<int window_size>
+const uint32_t* RollingHash<window_size>::remove_table_ = NULL;
+
+// Init() checks to make sure that the static object remove_table_ has been
+// initialized; if not, it does the considerable work of populating it.  Once
+// it's ready, the table can be used for any number of RollingHash objects of
+// the same window_size.
+//
+template<int window_size>
+bool RollingHash<window_size>::Init() {
+  if (window_size < 2) {
+    LOG(ERROR) << "RollingHash window size " << window_size
+               << " is too small" << LOG_ENDL;
+    return false;
+  }
+  if (remove_table_ == NULL) {
+    // The new object is placed into a local pointer instead of directly into
+    // remove_table_, for two reasons:
+    //   1. remove_table_ is a pointer to const.  The table is populated using
+    //      the non-const local pointer and then assigned to the global const
+    //      pointer once it's ready.
+    //   2. No other thread will ever see remove_table_ pointing to a
+    //      partially-initialized table.  If two threads happen to call Init()
+    //      at the same time, two tables with the same contents may be created
+    //      (causing a memory leak), but the results will be consistent
+    //      no matter which of the two tables is used.
+    uint32_t* new_remove_table = new uint32_t[256];
+    // Compute multiplier.  Concisely, it is:
+    //     pow(kMult, (window_size - 1)) % kBase,
+    // but we compute the power in integer form.
+    uint32_t multiplier = 1;
+    for (int i = 0; i < window_size - 1; ++i) {
+      multiplier =
+          RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
+    }
+    // For each character removed_byte, compute
+    //     remove_table_[removed_byte] ==
+    //         (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
+    // where the power operator "pow" is taken in integer form.
+    //
+    // If you take a hash value fp representing the hash of
+    //     buffer[0] ... buffer[window_size - 1]
+    // and add the value of remove_table_[buffer[0]] to it, the result will be
+    // a partial hash value for
+    //     buffer[1] ... buffer[window_size - 1]
+    // that is to say, it no longer includes buffer[0].
+    //
+    // The following byte at buffer[window_size] can then be merged with this
+    // partial hash value to arrive quickly at the hash value for a window that
+    // has advanced by one byte, to
+    //     buffer[1] ... buffer[window_size]
+    // In fact, that is precisely what happens in UpdateHash, above.
+    uint32_t byte_times_multiplier = 0;
+    for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
+      new_remove_table[removed_byte] =
+          RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
+      // Iteratively adding the multiplier in this loop is equivalent to
+      // computing (removed_byte * multiplier), and is faster
+      byte_times_multiplier =
+          RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
+    }
+    remove_table_ = new_remove_table;
+  }
+  return true;
+}
+
+}  // namespace open_vcdiff
+
+#endif  // OPEN_VCDIFF_ROLLING_HASH_H_