1 files changed, 52 insertions, 62 deletions
diff --git a/src/crypto/sha/sha512.c b/src/crypto/sha/sha512.c
index 57c96ab..beb0f8c 100644
--- a/src/crypto/sha/sha512.c
+++ b/src/crypto/sha/sha512.c
@@ -65,35 +65,27 @@
 
 /* IMPLEMENTATION NOTES.
  *
- * As you might have noticed 32-bit hash algorithms:
- *
- * - permit SHA_LONG to be wider than 32-bit (case on CRAY);
- * - optimized versions implement two transform functions: one operating
- *   on [aligned] data in host byte order and one - on data in input
- *   stream byte order;
- * - share common byte-order neutral collector and padding function
- *   implementations, ../md32_common.h;
- *
- * Neither of the above applies to this SHA-512 implementations. Reasons
+ * The 32-bit hash algorithms share a common byte-order neutral collector and
+ * padding function implementations that operate on unaligned data,
+ * ../md32_common.h. This SHA-512 implementation does not. Reasons
  * [in reverse order] are:
  *
- * - it's the only 64-bit hash algorithm for the moment of this writing,
+ * - It's the only 64-bit hash algorithm for the moment of this writing,
  *   there is no need for common collector/padding implementation [yet];
- * - by supporting only one transform function [which operates on
- *   *aligned* data in input stream byte order, big-endian in this case]
- *   we minimize burden of maintenance in two ways: a) collector/padding
- *   function is simpler; b) only one transform function to stare at;
- * - SHA_LONG64 is required to be exactly 64-bit in order to be able to
- *   apply a number of optimizations to mitigate potential performance
- *   penalties caused by previous design decision; */
+ * - By supporting only a transform function that operates on *aligned* data
+ *   the collector/padding function is simpler and easier to optimize. */
 
 #if !defined(OPENSSL_NO_ASM) &&                         \
     (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
      defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
-#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
 #define SHA512_ASM
 #endif
 
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+    defined(__ARM_FEATURE_UNALIGNED)
+#define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
+#endif
+
 int SHA384_Init(SHA512_CTX *sha) {
   sha->h[0] = OPENSSL_U64(0xcbbb9d5dc1059ed8);
   sha->h[1] = OPENSSL_U64(0x629a292a367cd507);
@@ -163,7 +155,7 @@ uint8_t *SHA512(const uint8_t *data, size_t len, uint8_t *out) {
 #if !defined(SHA512_ASM)
 static
 #endif
-void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
+void sha512_block_data_order(uint64_t *state, const uint64_t *W, size_t num);
 
 
 int SHA384_Final(uint8_t *md, SHA512_CTX *sha) {
@@ -181,7 +173,7 @@ void SHA512_Transform(SHA512_CTX *c, const uint8_t *data) {
     data = c->u.p;
   }
 #endif
-  sha512_block_data_order(c, data, 1);
+  sha512_block_data_order(c->h, (uint64_t *)data, 1);
 }
 
 int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
@@ -213,7 +205,7 @@ int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
       memcpy(p + c->num, data, n), c->num = 0;
       len -= n;
       data += n;
-      sha512_block_data_order(c, p, 1);
+      sha512_block_data_order(c->h, (uint64_t *)p, 1);
     }
   }
 
@@ -222,14 +214,14 @@ int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
     if ((size_t)data % sizeof(c->u.d[0]) != 0) {
       while (len >= sizeof(c->u)) {
         memcpy(p, data, sizeof(c->u));
-        sha512_block_data_order(c, p, 1);
+        sha512_block_data_order(c->h, (uint64_t *)p, 1);
         len -= sizeof(c->u);
         data += sizeof(c->u);
       }
     } else
 #endif
     {
-      sha512_block_data_order(c, data, len / sizeof(c->u));
+      sha512_block_data_order(c->h, (uint64_t *)data, len / sizeof(c->u));
       data += len;
       len %= sizeof(c->u);
       data -= len;
@@ -253,7 +245,7 @@ int SHA512_Final(uint8_t *md, SHA512_CTX *sha) {
   if (n > (sizeof(sha->u) - 16)) {
     memset(p + n, 0, sizeof(sha->u) - n);
     n = 0;
-    sha512_block_data_order(sha, p, 1);
+    sha512_block_data_order(sha->h, (uint64_t *)p, 1);
   }
 
   memset(p + n, 0, sizeof(sha->u) - 16 - n);
@@ -274,7 +266,7 @@ int SHA512_Final(uint8_t *md, SHA512_CTX *sha) {
   p[sizeof(sha->u) - 15] = (uint8_t)(sha->Nh >> 48);
   p[sizeof(sha->u) - 16] = (uint8_t)(sha->Nh >> 56);
 
-  sha512_block_data_order(sha, p, 1);
+  sha512_block_data_order(sha->h, (uint64_t *)p, 1);
 
   if (md == NULL) {
     /* TODO(davidben): This NULL check is absent in other low-level hash 'final'
@@ -443,23 +435,22 @@ static uint64_t __fastcall __pull64be(const void *x) {
  * This code should give better results on 32-bit CPU with less than
  * ~24 registers, both size and performance wise...
  */
-static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
+static void sha512_block_data_order(uint64_t *state, const uint64_t *W,
                                     size_t num) {
-  const uint64_t *W = in;
   uint64_t A, E, T;
   uint64_t X[9 + 80], *F;
   int i;
 
   while (num--) {
     F = X + 80;
-    A = ctx->h[0];
-    F[1] = ctx->h[1];
-    F[2] = ctx->h[2];
-    F[3] = ctx->h[3];
-    E = ctx->h[4];
-    F[5] = ctx->h[5];
-    F[6] = ctx->h[6];
-    F[7] = ctx->h[7];
+    A = state[0];
+    F[1] = state[1];
+    F[2] = state[2];
+    F[3] = state[3];
+    E = state[4];
+    F[5] = state[5];
+    F[6] = state[6];
+    F[7] = state[7];
 
     for (i = 0; i < 16; i++, F--) {
       T = PULL64(W[i]);
@@ -484,14 +475,14 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
       A = T + Sigma0(A) + Maj(A, F[1], F[2]);
     }
 
-    ctx->h[0] += A;
-    ctx->h[1] += F[1];
-    ctx->h[2] += F[2];
-    ctx->h[3] += F[3];
-    ctx->h[4] += E;
-    ctx->h[5] += F[5];
-    ctx->h[6] += F[6];
-    ctx->h[7] += F[7];
+    state[0] += A;
+    state[1] += F[1];
+    state[2] += F[2];
+    state[3] += F[3];
+    state[4] += E;
+    state[5] += F[5];
+    state[6] += F[6];
+    state[7] += F[7];
 
     W += 16;
   }
@@ -517,23 +508,22 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
     ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
   } while (0)
 
-static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
+static void sha512_block_data_order(uint64_t *state, const uint64_t *W,
                                     size_t num) {
-  const uint64_t *W = in;
   uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
   uint64_t X[16];
   int i;
 
   while (num--) {
 
-    a = ctx->h[0];
-    b = ctx->h[1];
-    c = ctx->h[2];
-    d = ctx->h[3];
-    e = ctx->h[4];
-    f = ctx->h[5];
-    g = ctx->h[6];
-    h = ctx->h[7];
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    f = state[5];
+    g = state[6];
+    h = state[7];
 
     T1 = X[0] = PULL64(W[0]);
     ROUND_00_15(0, a, b, c, d, e, f, g, h);
@@ -587,14 +577,14 @@ static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
       ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
     }
 
-    ctx->h[0] += a;
-    ctx->h[1] += b;
-    ctx->h[2] += c;
-    ctx->h[3] += d;
-    ctx->h[4] += e;
-    ctx->h[5] += f;
-    ctx->h[6] += g;
-    ctx->h[7] += h;
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
 
     W += 16;
   }