Diffstat (limited to 'src/crypto/chacha')
-rw-r--r-- | src/crypto/chacha/CMakeLists.txt   |   20
-rw-r--r-- | src/crypto/chacha/chacha_generic.c |  143
-rw-r--r-- | src/crypto/chacha/chacha_vec.c     |  340
-rw-r--r-- | src/crypto/chacha/chacha_vec_arm.S | 1426
4 files changed, 1929 insertions, 0 deletions
diff --git a/src/crypto/chacha/CMakeLists.txt b/src/crypto/chacha/CMakeLists.txt new file mode 100644 index 0000000..6c3f87e --- /dev/null +++ b/src/crypto/chacha/CMakeLists.txt @@ -0,0 +1,20 @@ +include_directories(. .. ../../include) + +if (${ARCH} STREQUAL "arm") + set( + CHACHA_ARCH_SOURCES + + chacha_vec_arm.S + ) +endif() + +add_library( + chacha + + OBJECT + + chacha_generic.c + chacha_vec.c + + ${CHACHA_ARCH_SOURCES} +) diff --git a/src/crypto/chacha/chacha_generic.c b/src/crypto/chacha/chacha_generic.c new file mode 100644 index 0000000..31cf4f0 --- /dev/null +++ b/src/crypto/chacha/chacha_generic.c @@ -0,0 +1,143 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* Adapted from the public domain, estream code by D. Bernstein. */ + +#include <openssl/chacha.h> + +#include <string.h> + +#include <openssl/cpu.h> + + +#if defined(OPENSSL_WINDOWS) || (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || !defined(__SSE2__) + +/* sigma contains the ChaCha constants, which happen to be an ASCII string. */ +static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', + '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; + +#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) +#define XOR(v, w) ((v) ^ (w)) +#define PLUS(x, y) ((x) + (y)) +#define PLUSONE(v) (PLUS((v), 1)) + +#define U32TO8_LITTLE(p, v) \ + { \ + (p)[0] = (v >> 0) & 0xff; \ + (p)[1] = (v >> 8) & 0xff; \ + (p)[2] = (v >> 16) & 0xff; \ + (p)[3] = (v >> 24) & 0xff; \ + } + +#define U8TO32_LITTLE(p) \ + (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */ +#define QUARTERROUND(a,b,c,d) \ + x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ + x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ + x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ + x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) +/* Defined in chacha_vec.c */ +void CRYPTO_chacha_20_neon(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[8], + size_t counter); +#endif + +/* chacha_core performs 20 rounds of ChaCha on the input words in + * |input| and writes the 64 output bytes to |output|. 
*/ +static void chacha_core(uint8_t output[64], const uint32_t input[16]) { + uint32_t x[16]; + int i; + + memcpy(x, input, sizeof(uint32_t) * 16); + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(0, 4, 8, 12) + QUARTERROUND(1, 5, 9, 13) + QUARTERROUND(2, 6, 10, 14) + QUARTERROUND(3, 7, 11, 15) + QUARTERROUND(0, 5, 10, 15) + QUARTERROUND(1, 6, 11, 12) + QUARTERROUND(2, 7, 8, 13) + QUARTERROUND(3, 4, 9, 14) + } + + for (i = 0; i < 16; ++i) { + x[i] = PLUS(x[i], input[i]); + } + for (i = 0; i < 16; ++i) { + U32TO8_LITTLE(output + 4 * i, x[i]); + } +} + +void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[8], + size_t counter) { + uint32_t input[16]; + uint8_t buf[64]; + size_t todo, i; + +#if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter); + return; + } +#endif + + input[0] = U8TO32_LITTLE(sigma + 0); + input[1] = U8TO32_LITTLE(sigma + 4); + input[2] = U8TO32_LITTLE(sigma + 8); + input[3] = U8TO32_LITTLE(sigma + 12); + + input[4] = U8TO32_LITTLE(key + 0); + input[5] = U8TO32_LITTLE(key + 4); + input[6] = U8TO32_LITTLE(key + 8); + input[7] = U8TO32_LITTLE(key + 12); + + input[8] = U8TO32_LITTLE(key + 16); + input[9] = U8TO32_LITTLE(key + 20); + input[10] = U8TO32_LITTLE(key + 24); + input[11] = U8TO32_LITTLE(key + 28); + + input[12] = counter; + input[13] = ((uint64_t)counter) >> 32; + input[14] = U8TO32_LITTLE(nonce + 0); + input[15] = U8TO32_LITTLE(nonce + 4); + + while (in_len > 0) { + todo = sizeof(buf); + if (in_len < todo) { + todo = in_len; + } + + chacha_core(buf, input); + for (i = 0; i < todo; i++) { + out[i] = in[i] ^ buf[i]; + } + + out += todo; + in += todo; + in_len -= todo; + + input[12]++; + if (input[12] == 0) { + input[13]++; + } + } +} + +#endif /* OPENSSL_WINDOWS || !OPENSSL_X86_64 && !OPENSSL_X86 || !__SSE2__ */ diff --git a/src/crypto/chacha/chacha_vec.c b/src/crypto/chacha/chacha_vec.c new file mode 100644 index 0000000..88830bc --- /dev/null +++ b/src/crypto/chacha/chacha_vec.c @@ -0,0 +1,340 @@ +/* Copyright (c) 2014, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* ==================================================================== + * + * When updating this file, also update chacha_vec_arm.S + * + * ==================================================================== */ + + +/* This implementation is by Ted Krovetz and was submitted to SUPERCOP and + * marked as public domain. It was been altered to allow for non-aligned inputs + * and to allow the block counter to be passed in specifically. 
*/ + +#include <openssl/chacha.h> + +#if defined(ASM_GEN) || \ + !defined(OPENSSL_WINDOWS) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) && defined(__SSE2__) + +#define CHACHA_RNDS 20 /* 8 (high speed), 20 (conservative), 12 (middle) */ + +/* Architecture-neutral way to specify 16-byte vector of ints */ +typedef unsigned vec __attribute__((vector_size(16))); + +/* This implementation is designed for Neon, SSE and AltiVec machines. The + * following specify how to do certain vector operations efficiently on + * each architecture, using intrinsics. + * This implementation supports parallel processing of multiple blocks, + * including potentially using general-purpose registers. */ +#if __ARM_NEON__ +#include <arm_neon.h> +#define GPR_TOO 1 +#define VBPI 2 +#define ONE (vec) vsetq_lane_u32(1, vdupq_n_u32(0), 0) +#define LOAD_ALIGNED(m) (vec)(*((vec *)(m))) +#define LOAD(m) ({ \ + memcpy(alignment_buffer, m, 16); \ + LOAD_ALIGNED(alignment_buffer); \ + }) +#define STORE(m, r) ({ \ + (*((vec *)(alignment_buffer))) = (r); \ + memcpy(m, alignment_buffer, 16); \ + }) +#define ROTV1(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 1) +#define ROTV2(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 2) +#define ROTV3(x) (vec) vextq_u32((uint32x4_t)x, (uint32x4_t)x, 3) +#define ROTW16(x) (vec) vrev32q_u16((uint16x8_t)x) +#if __clang__ +#define ROTW7(x) (x << ((vec) {7, 7, 7, 7})) ^ (x >> ((vec) {25, 25, 25, 25})) +#define ROTW8(x) (x << ((vec) {8, 8, 8, 8})) ^ (x >> ((vec) {24, 24, 24, 24})) +#define ROTW12(x) \ + (x << ((vec) {12, 12, 12, 12})) ^ (x >> ((vec) {20, 20, 20, 20})) +#else +#define ROTW7(x) \ + (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 7), (uint32x4_t)x, 25) +#define ROTW8(x) \ + (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 8), (uint32x4_t)x, 24) +#define ROTW12(x) \ + (vec) vsriq_n_u32(vshlq_n_u32((uint32x4_t)x, 12), (uint32x4_t)x, 20) +#endif +#elif __SSE2__ +#include <emmintrin.h> +#define GPR_TOO 0 +#if __clang__ +#define VBPI 4 +#else +#define VBPI 3 +#endif +#define ONE (vec) _mm_set_epi32(0, 0, 0, 1) +#define LOAD(m) (vec) _mm_loadu_si128((__m128i *)(m)) +#define LOAD_ALIGNED(m) (vec) _mm_load_si128((__m128i *)(m)) +#define STORE(m, r) _mm_storeu_si128((__m128i *)(m), (__m128i)(r)) +#define ROTV1(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 3, 2, 1)) +#define ROTV2(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(1, 0, 3, 2)) +#define ROTV3(x) (vec) _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(2, 1, 0, 3)) +#define ROTW7(x) \ + (vec)(_mm_slli_epi32((__m128i)x, 7) ^ _mm_srli_epi32((__m128i)x, 25)) +#define ROTW12(x) \ + (vec)(_mm_slli_epi32((__m128i)x, 12) ^ _mm_srli_epi32((__m128i)x, 20)) +#if __SSSE3__ +#include <tmmintrin.h> +#define ROTW8(x) \ + (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, \ + 11, 6, 5, 4, 7, 2, 1, 0, 3)) +#define ROTW16(x) \ + (vec) _mm_shuffle_epi8((__m128i)x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \ + 10, 5, 4, 7, 6, 1, 0, 3, 2)) +#else +#define ROTW8(x) \ + (vec)(_mm_slli_epi32((__m128i)x, 8) ^ _mm_srli_epi32((__m128i)x, 24)) +#define ROTW16(x) \ + (vec)(_mm_slli_epi32((__m128i)x, 16) ^ _mm_srli_epi32((__m128i)x, 16)) +#endif +#else +#error-- Implementation supports only machines with neon or SSE2 +#endif + +#ifndef REVV_BE +#define REVV_BE(x) (x) +#endif + +#ifndef REVW_BE +#define REVW_BE(x) (x) +#endif + +#define BPI (VBPI + GPR_TOO) /* Blocks computed per loop iteration */ + +#define DQROUND_VECTORS(a,b,c,d) \ + a += b; d ^= a; d = ROTW16(d); \ + c += d; b ^= c; b = ROTW12(b); \ + a += b; d ^= a; d = 
ROTW8(d); \ + c += d; b ^= c; b = ROTW7(b); \ + b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); \ + a += b; d ^= a; d = ROTW16(d); \ + c += d; b ^= c; b = ROTW12(b); \ + a += b; d ^= a; d = ROTW8(d); \ + c += d; b ^= c; b = ROTW7(b); \ + b = ROTV3(b); c = ROTV2(c); d = ROTV1(d); + +#define QROUND_WORDS(a,b,c,d) \ + a = a+b; d ^= a; d = d<<16 | d>>16; \ + c = c+d; b ^= c; b = b<<12 | b>>20; \ + a = a+b; d ^= a; d = d<< 8 | d>>24; \ + c = c+d; b ^= c; b = b<< 7 | b>>25; + +#define WRITE_XOR(in, op, d, v0, v1, v2, v3) \ + STORE(op + d + 0, LOAD(in + d + 0) ^ REVV_BE(v0)); \ + STORE(op + d + 4, LOAD(in + d + 4) ^ REVV_BE(v1)); \ + STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ + STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3)); + +#if __ARM_NEON__ +/* For ARM, we can't depend on NEON support, so this function is compiled with + * a different name, along with the generic code, and can be enabled at + * run-time. */ +void CRYPTO_chacha_20_neon( +#else +void CRYPTO_chacha_20( +#endif + uint8_t *out, + const uint8_t *in, + size_t inlen, + const uint8_t key[32], + const uint8_t nonce[8], + size_t counter) + { + unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp; +#if defined(__ARM_NEON__) + unsigned *np; + uint8_t alignment_buffer[16] __attribute__((aligned(16))); +#endif + vec s0, s1, s2, s3; +#if !defined(__ARM_NEON__) && !defined(__SSE2__) + __attribute__ ((aligned (16))) unsigned key[8], nonce[4]; +#endif + __attribute__ ((aligned (16))) unsigned chacha_const[] = + {0x61707865,0x3320646E,0x79622D32,0x6B206574}; +#if defined(__ARM_NEON__) || defined(__SSE2__) + kp = (unsigned *)key; +#else + ((vec *)key)[0] = REVV_BE(((vec *)key)[0]); + ((vec *)key)[1] = REVV_BE(((vec *)key)[1]); + nonce[0] = REVW_BE(((unsigned *)nonce)[0]); + nonce[1] = REVW_BE(((unsigned *)nonce)[1]); + nonce[2] = REVW_BE(((unsigned *)nonce)[2]); + nonce[3] = REVW_BE(((unsigned *)nonce)[3]); + kp = (unsigned *)key; + np = (unsigned *)nonce; +#endif +#if defined(__ARM_NEON__) + np = (unsigned*) nonce; +#endif + s0 = LOAD_ALIGNED(chacha_const); + s1 = LOAD_ALIGNED(&((vec*)kp)[0]); + s2 = LOAD_ALIGNED(&((vec*)kp)[1]); + s3 = (vec){ + counter & 0xffffffff, +#if __ARM_NEON__ || defined(OPENSSL_X86) + 0, /* can't right-shift 32 bits on a 32-bit system. 
*/ +#else + counter >> 32, +#endif + ((uint32_t*)nonce)[0], + ((uint32_t*)nonce)[1] + }; + + for (iters = 0; iters < inlen/(BPI*64); iters++) + { +#if GPR_TOO + register unsigned x0, x1, x2, x3, x4, x5, x6, x7, x8, + x9, x10, x11, x12, x13, x14, x15; +#endif +#if VBPI > 2 + vec v8,v9,v10,v11; +#endif +#if VBPI > 3 + vec v12,v13,v14,v15; +#endif + + vec v0,v1,v2,v3,v4,v5,v6,v7; + v4 = v0 = s0; v5 = v1 = s1; v6 = v2 = s2; v3 = s3; + v7 = v3 + ONE; +#if VBPI > 2 + v8 = v4; v9 = v5; v10 = v6; + v11 = v7 + ONE; +#endif +#if VBPI > 3 + v12 = v8; v13 = v9; v14 = v10; + v15 = v11 + ONE; +#endif +#if GPR_TOO + x0 = chacha_const[0]; x1 = chacha_const[1]; + x2 = chacha_const[2]; x3 = chacha_const[3]; + x4 = kp[0]; x5 = kp[1]; x6 = kp[2]; x7 = kp[3]; + x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7]; + x12 = counter+BPI*iters+(BPI-1); x13 = 0; + x14 = np[0]; x15 = np[1]; +#endif + for (i = CHACHA_RNDS/2; i; i--) + { + DQROUND_VECTORS(v0,v1,v2,v3) + DQROUND_VECTORS(v4,v5,v6,v7) +#if VBPI > 2 + DQROUND_VECTORS(v8,v9,v10,v11) +#endif +#if VBPI > 3 + DQROUND_VECTORS(v12,v13,v14,v15) +#endif +#if GPR_TOO + QROUND_WORDS( x0, x4, x8,x12) + QROUND_WORDS( x1, x5, x9,x13) + QROUND_WORDS( x2, x6,x10,x14) + QROUND_WORDS( x3, x7,x11,x15) + QROUND_WORDS( x0, x5,x10,x15) + QROUND_WORDS( x1, x6,x11,x12) + QROUND_WORDS( x2, x7, x8,x13) + QROUND_WORDS( x3, x4, x9,x14) +#endif + } + + WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3) + s3 += ONE; + WRITE_XOR(ip, op, 16, v4+s0, v5+s1, v6+s2, v7+s3) + s3 += ONE; +#if VBPI > 2 + WRITE_XOR(ip, op, 32, v8+s0, v9+s1, v10+s2, v11+s3) + s3 += ONE; +#endif +#if VBPI > 3 + WRITE_XOR(ip, op, 48, v12+s0, v13+s1, v14+s2, v15+s3) + s3 += ONE; +#endif + ip += VBPI*16; + op += VBPI*16; +#if GPR_TOO + op[0] = REVW_BE(REVW_BE(ip[0]) ^ (x0 + chacha_const[0])); + op[1] = REVW_BE(REVW_BE(ip[1]) ^ (x1 + chacha_const[1])); + op[2] = REVW_BE(REVW_BE(ip[2]) ^ (x2 + chacha_const[2])); + op[3] = REVW_BE(REVW_BE(ip[3]) ^ (x3 + chacha_const[3])); + op[4] = REVW_BE(REVW_BE(ip[4]) ^ (x4 + kp[0])); + op[5] = REVW_BE(REVW_BE(ip[5]) ^ (x5 + kp[1])); + op[6] = REVW_BE(REVW_BE(ip[6]) ^ (x6 + kp[2])); + op[7] = REVW_BE(REVW_BE(ip[7]) ^ (x7 + kp[3])); + op[8] = REVW_BE(REVW_BE(ip[8]) ^ (x8 + kp[4])); + op[9] = REVW_BE(REVW_BE(ip[9]) ^ (x9 + kp[5])); + op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6])); + op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7])); + op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1))); + op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13)); + op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0])); + op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1])); + s3 += ONE; + ip += 16; + op += 16; +#endif + } + + for (iters = inlen%(BPI*64)/64; iters != 0; iters--) + { + vec v0 = s0, v1 = s1, v2 = s2, v3 = s3; + for (i = CHACHA_RNDS/2; i; i--) + { + DQROUND_VECTORS(v0,v1,v2,v3); + } + WRITE_XOR(ip, op, 0, v0+s0, v1+s1, v2+s2, v3+s3) + s3 += ONE; + ip += 16; + op += 16; + } + + inlen = inlen % 64; + if (inlen) + { + __attribute__ ((aligned (16))) vec buf[4]; + vec v0,v1,v2,v3; + v0 = s0; v1 = s1; v2 = s2; v3 = s3; + for (i = CHACHA_RNDS/2; i; i--) + { + DQROUND_VECTORS(v0,v1,v2,v3); + } + + if (inlen >= 16) + { + STORE(op + 0, LOAD(ip + 0) ^ REVV_BE(v0 + s0)); + if (inlen >= 32) + { + STORE(op + 4, LOAD(ip + 4) ^ REVV_BE(v1 + s1)); + if (inlen >= 48) + { + STORE(op + 8, LOAD(ip + 8) ^ + REVV_BE(v2 + s2)); + buf[3] = REVV_BE(v3 + s3); + } + else + buf[2] = REVV_BE(v2 + s2); + } + else + buf[1] = REVV_BE(v1 + s1); + } + else + buf[0] = REVV_BE(v0 + s0); + + for (i=inlen & ~15; i<inlen; i++) + 
((char *)op)[i] = ((char *)ip)[i] ^ ((char *)buf)[i]; + } + } + +#endif /* ASM_GEN || !OPENSSL_WINDOWS && (OPENSSL_X86_64 || OPENSSL_X86) && SSE2 */ diff --git a/src/crypto/chacha/chacha_vec_arm.S b/src/crypto/chacha/chacha_vec_arm.S new file mode 100644 index 0000000..15d4556 --- /dev/null +++ b/src/crypto/chacha/chacha_vec_arm.S @@ -0,0 +1,1426 @@ +# Copyright (c) 2014, Google Inc. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +# This file contains a pre-compiled version of chacha_vec.c for ARM. This is +# needed to support switching on NEON code at runtime. If the whole of OpenSSL +# were to be compiled with the needed flags to build chacha_vec.c, then it +# wouldn't be possible to run on non-NEON systems. +# +# This file was generated by chacha_vec_arm_generate.go using the following +# compiler command: +# +# /opt/gcc-linaro-4.9-2014.11-x86_64_arm-linux-gnueabihf/bin/arm-linux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -fpic -DASM_GEN -I ../../include -S chacha_vec.c -o - + +#if !defined(OPENSSL_NO_ASM) + + .syntax unified + .cpu cortex-a8 + .eabi_attribute 27, 3 + +# EABI attribute 28 sets whether VFP register arguments were used to build this +# file. If object files are inconsistent on this point, the linker will refuse +# to link them. Thus we report whatever the compiler expects since we don't use +# VFP arguments. 
+ +#if defined(__ARM_PCS_VFP) + .eabi_attribute 28, 1 +#else + .eabi_attribute 28, 0 +#endif + + .fpu neon + .eabi_attribute 20, 1 + .eabi_attribute 21, 1 + .eabi_attribute 23, 3 + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + .eabi_attribute 26, 2 + .eabi_attribute 30, 2 + .eabi_attribute 34, 1 + .eabi_attribute 18, 4 + .thumb + .file "chacha_vec.c" + .text + .align 2 + .global CRYPTO_chacha_20_neon + .hidden CRYPTO_chacha_20_neon + .thumb + .thumb_func + .type CRYPTO_chacha_20_neon, %function +CRYPTO_chacha_20_neon: + @ args = 8, pretend = 0, frame = 128 + @ frame_needed = 1, uses_anonymous_args = 0 + push {r4, r5, r6, r7, r8, r9, r10, fp, lr} + mov r4, r2 + vpush.64 {d8, d9, d10, d11, d12, d13, d14, d15} + movw r8, #43691 + movt r8, 43690 + mov ip, r3 + umull r8, r9, r4, r8 + sub sp, sp, #132 + add r7, sp, #0 + sub sp, sp, #112 + mov fp, r0 + mov r10, r1 + str r2, [r7, #8] + add r4, sp, #15 + ldr r2, .L92+16 + bic r4, r4, #15 + ldr r5, [r7, #232] + add lr, r4, #64 +.LPIC16: + add r2, pc + str r0, [r7, #60] + str r1, [r7, #12] + str r3, [r7, #44] + ldmia r2, {r0, r1, r2, r3} + ldr r6, [r5] + str r4, [r7, #72] + ldr r5, [r5, #4] + ldr r4, [r7, #236] + str r6, [r7, #120] + str r5, [r7, #124] + str r4, [r7, #112] + stmia lr, {r0, r1, r2, r3} + movs r3, #0 + ldr r0, [r7, #72] + str r3, [r7, #116] + lsrs r3, r9, #7 + vldr d22, [r7, #112] + vldr d23, [r7, #120] + vldr d24, [r0, #64] + vldr d25, [r0, #72] + vld1.64 {d26-d27}, [ip:64] + vldr d28, [ip, #16] + vldr d29, [ip, #24] + beq .L26 + ldr r1, [r0, #64] + lsls r2, r3, #8 + sub r3, r2, r3, lsl #6 + str r3, [r7, #4] + ldr r2, [r0, #72] + str r1, [r7, #40] + mov r1, r3 + ldr r3, [r0, #68] + vldr d0, .L92 + vldr d1, .L92+8 + str r2, [r7, #32] + adds r2, r4, #2 + str r3, [r7, #36] + ldr r3, [r0, #76] + str r2, [r7, #48] + mov r2, r0 + mov r0, fp + str r10, [r7, #64] + str r3, [r7, #28] + adds r3, r0, r1 + mov r1, r6 + str r3, [r7, #16] + add r3, r2, #80 + mov r2, r5 + str r3, [r7, #68] +.L4: + ldr r0, [r7, #44] + add r8, r7, #28 + str r2, [r7, #108] + vadd.i32 q3, q11, q0 + ldmia r8, {r8, r9, r10, fp} + vmov q8, q14 @ v4si + ldr r3, [r0] + vmov q1, q13 @ v4si + vmov q9, q12 @ v4si + vmov q2, q11 @ v4si + str r3, [r7, #52] + mov r3, r0 + ldr r5, [r3, #8] + vmov q15, q14 @ v4si + ldr lr, [r3, #20] + vmov q5, q13 @ v4si + ldr r6, [r3, #12] + vmov q10, q12 @ v4si + str r5, [r7, #92] + mov r5, r3 + ldr r4, [r5, #28] + movs r5, #10 + ldr ip, [r3, #16] + ldr r3, [r3, #24] + str r4, [r7, #104] + ldr r4, [r7, #48] + str r3, [r7, #100] + mov r3, r1 + str r6, [r7, #56] + str r4, [r7, #96] + str r8, [r7, #80] + mov r8, r10 + ldr r0, [r0, #4] + mov r10, r9 + ldr r1, [r7, #92] + ldr r2, [r7, #56] + ldr r9, [r7, #100] + ldr r4, [r7, #52] + str lr, [r7, #88] + mov lr, r3 + str r5, [r7, #76] + movs r5, #0 + str r5, [r7, #84] + b .L93 +.L94: + .align 3 +.L92: + .word 1 + .word 0 + .word 0 + .word 0 + .word .LANCHOR0-(.LPIC16+4) +.L93: +.L3: + vadd.i32 q9, q9, q1 + add r3, r8, r0 + vadd.i32 q10, q10, q5 + add r5, fp, r4 + veor q3, q3, q9 + mov r6, r3 + veor q2, q2, q10 + ldr r3, [r7, #80] + str r5, [r7, #100] + add r10, r10, r1 + vrev32.16 q3, q3 + eor lr, lr, r10 + vadd.i32 q8, q8, q3 + vrev32.16 q2, q2 + vadd.i32 q15, q15, q2 + mov fp, r3 + ldr r3, [r7, #96] + veor q4, q8, q1 + str r6, [r7, #96] + veor q6, q15, q5 + eors r3, r3, r5 + mov r5, r6 + ldr r6, [r7, #84] + vshl.i32 q1, q4, #12 + vshl.i32 q5, q6, #12 + add fp, fp, r2 + eors r6, r6, r5 + ror r3, r3, #16 + vsri.32 q1, q4, #20 + ror lr, lr, #16 + mov r5, r6 + ldr r6, [r7, #108] + vsri.32 q5, q6, #20 + 
str r3, [r7, #108] + eor r6, r6, fp + ror r5, r5, #16 + vadd.i32 q9, q9, q1 + add r9, r9, lr + ror r3, r6, #16 + ldr r6, [r7, #108] + vadd.i32 q10, q10, q5 + str r3, [r7, #92] + veor q4, q9, q3 + add ip, ip, r6 + ldr r6, [r7, #88] + veor q6, q10, q2 + eor r4, ip, r4 + eor r1, r9, r1 + vshl.i32 q3, q4, #8 + mov r8, r6 + ldr r6, [r7, #104] + vshl.i32 q2, q6, #8 + ror r4, r4, #20 + add r6, r6, r3 + vsri.32 q3, q4, #24 + str r6, [r7, #88] + eors r2, r2, r6 + ldr r6, [r7, #100] + vsri.32 q2, q6, #24 + add r8, r8, r5 + ror r2, r2, #20 + adds r6, r4, r6 + vadd.i32 q4, q8, q3 + eor r0, r8, r0 + vadd.i32 q15, q15, q2 + mov r3, r6 + ldr r6, [r7, #96] + veor q6, q4, q1 + ror r0, r0, #20 + str r3, [r7, #96] + veor q5, q15, q5 + adds r6, r0, r6 + str r6, [r7, #104] + mov r6, r3 + ldr r3, [r7, #108] + vshl.i32 q8, q6, #7 + add fp, fp, r2 + eors r3, r3, r6 + ldr r6, [r7, #104] + vshl.i32 q1, q5, #7 + ror r1, r1, #20 + eors r5, r5, r6 + vsri.32 q8, q6, #25 + ldr r6, [r7, #92] + ror r3, r3, #24 + ror r5, r5, #24 + vsri.32 q1, q5, #25 + str r5, [r7, #100] + eor r6, fp, r6 + ldr r5, [r7, #100] + add r10, r10, r1 + add ip, r3, ip + vext.32 q8, q8, q8, #1 + str ip, [r7, #108] + add ip, r5, r8 + ldr r5, [r7, #88] + eor lr, r10, lr + ror r6, r6, #24 + vext.32 q1, q1, q1, #1 + add r8, r6, r5 + vadd.i32 q9, q9, q8 + ldr r5, [r7, #108] + vext.32 q3, q3, q3, #3 + vadd.i32 q10, q10, q1 + ror lr, lr, #24 + eor r0, ip, r0 + vext.32 q2, q2, q2, #3 + add r9, r9, lr + eors r4, r4, r5 + veor q3, q9, q3 + ldr r5, [r7, #96] + eor r1, r9, r1 + ror r0, r0, #25 + veor q2, q10, q2 + adds r5, r0, r5 + vext.32 q4, q4, q4, #2 + str r5, [r7, #96] + ldr r5, [r7, #104] + ror r1, r1, #25 + vrev32.16 q3, q3 + eor r2, r8, r2 + vext.32 q15, q15, q15, #2 + adds r5, r1, r5 + vadd.i32 q4, q4, q3 + ror r4, r4, #25 + vrev32.16 q2, q2 + str r5, [r7, #84] + vadd.i32 q15, q15, q2 + eors r3, r3, r5 + ldr r5, [r7, #96] + add fp, fp, r4 + veor q8, q4, q8 + ror r2, r2, #25 + veor q1, q15, q1 + eor lr, fp, lr + eors r6, r6, r5 + ror r3, r3, #16 + ldr r5, [r7, #100] + add r10, r10, r2 + str r3, [r7, #104] + ror lr, lr, #16 + ldr r3, [r7, #104] + eor r5, r10, r5 + vshl.i32 q5, q8, #12 + add ip, lr, ip + vshl.i32 q6, q1, #12 + str ip, [r7, #88] + add ip, r3, r8 + str ip, [r7, #100] + ldr r3, [r7, #108] + ror r5, r5, #16 + vsri.32 q5, q8, #20 + ror r6, r6, #16 + add ip, r5, r3 + ldr r3, [r7, #88] + vsri.32 q6, q1, #20 + add r9, r9, r6 + eor r2, ip, r2 + eors r4, r4, r3 + ldr r3, [r7, #100] + eor r0, r9, r0 + vadd.i32 q9, q9, q5 + ror r4, r4, #20 + eors r1, r1, r3 + vadd.i32 q10, q10, q6 + ror r3, r2, #20 + str r3, [r7, #92] + ldr r3, [r7, #96] + veor q3, q9, q3 + ror r0, r0, #20 + add r8, r4, fp + veor q2, q10, q2 + add fp, r0, r3 + ldr r3, [r7, #84] + ror r1, r1, #20 + mov r2, r8 + vshl.i32 q8, q3, #8 + str r8, [r7, #80] + add r8, r1, r3 + ldr r3, [r7, #92] + vmov q1, q6 @ v4si + vshl.i32 q6, q2, #8 + eor r6, fp, r6 + add r10, r10, r3 + ldr r3, [r7, #104] + vsri.32 q8, q3, #24 + eor lr, r2, lr + eor r3, r8, r3 + ror r2, r6, #24 + vsri.32 q6, q2, #24 + eor r5, r10, r5 + str r2, [r7, #108] + ror r2, r3, #24 + ldr r3, [r7, #88] + vmov q3, q8 @ v4si + vadd.i32 q15, q15, q6 + ror lr, lr, #24 + vadd.i32 q8, q4, q8 + ror r6, r5, #24 + add r5, lr, r3 + ldr r3, [r7, #108] + veor q4, q8, q5 + add ip, ip, r6 + vmov q2, q6 @ v4si + add r9, r9, r3 + veor q6, q15, q1 + ldr r3, [r7, #100] + vshl.i32 q1, q4, #7 + str r2, [r7, #96] + add r3, r3, r2 + str r3, [r7, #104] + vshl.i32 q5, q6, #7 + eors r1, r1, r3 + ldr r3, [r7, #92] + vsri.32 q1, q4, #25 + eors r4, r4, r5 + 
eor r0, r9, r0 + eor r2, ip, r3 + vsri.32 q5, q6, #25 + ldr r3, [r7, #76] + ror r4, r4, #25 + str r6, [r7, #84] + ror r0, r0, #25 + subs r3, r3, #1 + str r5, [r7, #88] + ror r1, r1, #25 + ror r2, r2, #25 + vext.32 q15, q15, q15, #2 + str r3, [r7, #76] + vext.32 q2, q2, q2, #1 + vext.32 q8, q8, q8, #2 + vext.32 q3, q3, q3, #1 + vext.32 q5, q5, q5, #3 + vext.32 q1, q1, q1, #3 + bne .L3 + ldr r3, [r7, #68] + vadd.i32 q4, q12, q10 + str r9, [r7, #100] + mov r9, r10 + mov r10, r8 + ldr r8, [r7, #80] + str lr, [r7, #80] + mov lr, r5 + ldr r5, [r7, #40] + vadd.i32 q5, q13, q5 + ldr r6, [r7, #64] + vadd.i32 q15, q14, q15 + add fp, fp, r5 + ldr r5, [r7, #36] + str r4, [r7, #52] + vadd.i32 q7, q14, q8 + ldr r4, [r7, #96] + add r5, r10, r5 + str r3, [r7, #96] + vadd.i32 q2, q11, q2 + ldr r3, [r6, #12] @ unaligned + vadd.i32 q6, q12, q9 + str r0, [r7, #76] + vadd.i32 q1, q13, q1 + ldr r0, [r6] @ unaligned + vadd.i32 q11, q11, q0 + str r1, [r7, #92] + str r2, [r7, #56] + vadd.i32 q3, q11, q3 + ldr r1, [r6, #4] @ unaligned + vadd.i32 q11, q11, q0 + ldr r2, [r6, #8] @ unaligned + str r5, [r7, #88] + vadd.i32 q11, q11, q0 + ldr r5, [r7, #96] + ldr r10, [r7, #68] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r2, [r7, #72] + ldr r1, [r7, #32] + ldr r3, [r7, #48] + vldr d20, [r2, #80] + vldr d21, [r2, #88] + add r9, r9, r1 + veor q10, q10, q4 + ldr r1, [r7, #28] + add r0, r8, r1 + str r0, [r7, #24] + vstr d20, [r2, #80] + vstr d21, [r2, #88] + adds r0, r4, r3 + str r0, [r7, #20] + ldmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r4, [r7, #60] + str r0, [r4] @ unaligned + mov r4, r10 + ldr r0, [r7, #60] + str r1, [r0, #4] @ unaligned + mov r8, r0 + str r2, [r0, #8] @ unaligned + str r3, [r0, #12] @ unaligned + ldr r0, [r6, #16]! @ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + ldr r6, [r7, #64] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r3, [r7, #72] + vldr d20, [r3, #80] + vldr d21, [r3, #88] + veor q10, q10, q5 + vstr d20, [r3, #80] + vstr d21, [r3, #88] + ldmia r4!, {r0, r1, r2, r3} + mov r4, r8 + str r0, [r8, #16] @ unaligned + str r1, [r8, #20] @ unaligned + str r2, [r8, #24] @ unaligned + str r3, [r8, #28] @ unaligned + ldr r0, [r6, #32]! @ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + ldr r6, [r7, #64] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r0, [r7, #72] + vldr d16, [r0, #80] + vldr d17, [r0, #88] + veor q15, q8, q15 + vstr d30, [r0, #80] + vstr d31, [r0, #88] + ldmia r10!, {r0, r1, r2, r3} + mov r10, r5 + str r0, [r4, #32] @ unaligned + str r1, [r4, #36] @ unaligned + str r2, [r4, #40] @ unaligned + str r3, [r4, #44] @ unaligned + ldr r0, [r6, #48]! @ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + ldr r6, [r7, #64] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r2, [r7, #72] + vldr d18, [r2, #80] + vldr d19, [r2, #88] + veor q9, q9, q2 + vstr d18, [r2, #80] + vstr d19, [r2, #88] + ldmia r10!, {r0, r1, r2, r3} + mov r10, r5 + str r0, [r4, #48] @ unaligned + str r1, [r4, #52] @ unaligned + str r2, [r4, #56] @ unaligned + str r3, [r4, #60] @ unaligned + ldr r0, [r6, #64]! 
@ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + ldr r6, [r7, #64] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r2, [r7, #72] + vldr d18, [r2, #80] + vldr d19, [r2, #88] + veor q9, q9, q6 + vstr d18, [r2, #80] + vstr d19, [r2, #88] + ldmia r10!, {r0, r1, r2, r3} + mov r10, r5 + str r0, [r4, #64] @ unaligned + str r1, [r4, #68] @ unaligned + str r2, [r4, #72] @ unaligned + str r3, [r4, #76] @ unaligned + ldr r0, [r6, #80]! @ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + ldr r6, [r7, #64] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r2, [r7, #72] + vldr d18, [r2, #80] + vldr d19, [r2, #88] + veor q1, q9, q1 + vstr d2, [r2, #80] + vstr d3, [r2, #88] + ldmia r10!, {r0, r1, r2, r3} + mov r10, r5 + str r0, [r4, #80] @ unaligned + str r1, [r4, #84] @ unaligned + str r2, [r4, #88] @ unaligned + str r3, [r4, #92] @ unaligned + ldr r0, [r6, #96]! @ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + ldr r6, [r7, #64] + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r3, [r7, #72] + vldr d16, [r3, #80] + vldr d17, [r3, #88] + veor q8, q8, q7 + vstr d16, [r3, #80] + vstr d17, [r3, #88] + ldmia r10!, {r0, r1, r2, r3} + mov r10, r5 + str r0, [r4, #96] @ unaligned + str r1, [r4, #100] @ unaligned + str r2, [r4, #104] @ unaligned + str r3, [r4, #108] @ unaligned + ldr r0, [r6, #112]! @ unaligned + ldr r1, [r6, #4] @ unaligned + ldr r2, [r6, #8] @ unaligned + ldr r3, [r6, #12] @ unaligned + stmia r5!, {r0, r1, r2, r3} + mov r5, r10 + ldr r0, [r7, #72] + ldr r6, [r7, #44] + vldr d16, [r0, #80] + vldr d17, [r0, #88] + veor q8, q8, q3 + vstr d16, [r0, #80] + vstr d17, [r0, #88] + ldmia r5!, {r0, r1, r2, r3} + mov r5, r4 + mov r8, r5 + str r1, [r4, #116] @ unaligned + ldr r1, [r7, #64] + str r0, [r4, #112] @ unaligned + mov r0, r5 + str r2, [r4, #120] @ unaligned + str r3, [r4, #124] @ unaligned + ldr r3, [r1, #128] + ldr r2, [r7, #88] + eor r3, fp, r3 + str r3, [r4, #128] + ldr r3, [r1, #132] + mov r4, r1 + mov r1, r5 + eors r2, r2, r3 + str r2, [r8, #132] + ldr r3, [r4, #136] + ldr r2, [r7, #24] + eor r3, r9, r3 + str r3, [r5, #136] + ldr r3, [r4, #140] + eors r3, r3, r2 + str r3, [r5, #140] + mov r5, r4 + ldr r3, [r6] + ldr r2, [r4, #144] + ldr r4, [r7, #52] + add r4, r4, r3 + eors r2, r2, r4 + mov r4, r1 + str r2, [r1, #144] + ldr r1, [r7, #76] + ldr r2, [r6, #4] + ldr r3, [r5, #148] + mov r8, r1 + add r8, r8, r2 + mov r2, r8 + eors r3, r3, r2 + str r3, [r0, #148] + mov r0, r4 + ldr r2, [r6, #8] + ldr r1, [r7, #92] + ldr r3, [r5, #152] + mov r8, r1 + add r8, r8, r2 + ldr r1, [r7, #56] + mov r2, r8 + eors r3, r3, r2 + str r3, [r4, #152] + mov r8, r6 + ldr r2, [r6, #12] + mov r4, r5 + ldr r3, [r5, #156] + add r1, r1, r2 + eors r3, r3, r1 + str r3, [r0, #156] + ldr r2, [r6, #16] + mov r1, r0 + ldr r3, [r5, #160] + add ip, ip, r2 + eor r3, ip, r3 + str r3, [r0, #160] + ldr r2, [r6, #20] + mov ip, r0 + ldr r3, [r5, #164] + add lr, lr, r2 + ldr r2, [r7, #100] + eor r3, lr, r3 + str r3, [r1, #164] + ldr r6, [r6, #24] + ldr r3, [r4, #168] + add r2, r2, r6 + eors r3, r3, r2 + ldr r2, [r7, #104] + str r3, [r0, #168] + ldr r5, [r8, #28] + ldr r3, [r4, #172] + add r2, r2, r5 + mov r5, r4 + eors r3, r3, r2 + mov r2, r0 + str r3, [r0, #172] + ldr r3, [r7, #48] + ldr r4, [r4, #176] + ldr r0, [r7, #20] + adds r1, r3, #3 + ldr r3, [r7, #84] + eors r4, r4, r0 + str r4, [r2, #176] + ldr r0, [r5, #180] + mov r4, r2 + str r1, [r7, #48] + eors r3, 
r3, r0 + mov r0, r3 + ldr r3, [r7, #232] + str r0, [r2, #180] + ldr r1, [r3] + ldr r3, [r5, #184] + ldr r2, [r7, #80] + add r2, r2, r1 + mov r1, r5 + eors r3, r3, r2 + str r3, [ip, #184] + ldr r3, [r7, #232] + adds r1, r1, #192 + str r1, [r7, #64] + ldr r1, [r7, #108] + ldr r2, [r3, #4] + ldr r3, [r5, #188] + add r1, r1, r2 + mov r2, r1 + eors r2, r2, r3 + str r2, [ip, #188] + mov r3, r4 + ldr r2, [r7, #16] + adds r3, r3, #192 + str r3, [r7, #60] + cmp r2, r3 + beq .L85 + ldr r3, [r7, #232] + ldmia r3, {r1, r2} + b .L4 +.L85: + ldr r3, [r7, #12] + ldr r2, [r7, #4] + add r3, r3, r2 + str r3, [r7, #12] +.L2: + ldr r1, [r7, #8] + movw r2, #43691 + movt r2, 43690 + umull r2, r3, r1, r2 + lsr fp, r3, #7 + lsl r3, fp, #8 + sub fp, r3, fp, lsl #6 + rsb fp, fp, r1 + lsrs fp, fp, #6 + beq .L6 + ldr r6, [r7, #72] + ldr r5, [r7, #12] + ldr r4, [r7, #16] + mov r3, r6 + adds r3, r3, #80 + vldr d30, .L95 + vldr d31, .L95+8 + mov lr, r3 + str fp, [r7, #104] + str fp, [r7, #108] +.L8: + vmov q2, q11 @ v4si + movs r3, #10 + vmov q8, q14 @ v4si + vmov q9, q13 @ v4si + vmov q10, q12 @ v4si +.L7: + vadd.i32 q10, q10, q9 + subs r3, r3, #1 + veor q3, q2, q10 + vrev32.16 q3, q3 + vadd.i32 q8, q8, q3 + veor q9, q8, q9 + vshl.i32 q2, q9, #12 + vsri.32 q2, q9, #20 + vadd.i32 q10, q10, q2 + veor q3, q10, q3 + vshl.i32 q9, q3, #8 + vsri.32 q9, q3, #24 + vadd.i32 q8, q8, q9 + vext.32 q9, q9, q9, #3 + veor q2, q8, q2 + vext.32 q8, q8, q8, #2 + vshl.i32 q3, q2, #7 + vsri.32 q3, q2, #25 + vext.32 q3, q3, q3, #1 + vadd.i32 q10, q10, q3 + veor q9, q10, q9 + vrev32.16 q9, q9 + vadd.i32 q8, q8, q9 + veor q3, q8, q3 + vshl.i32 q2, q3, #12 + vsri.32 q2, q3, #20 + vadd.i32 q10, q10, q2 + vmov q3, q2 @ v4si + veor q9, q10, q9 + vshl.i32 q2, q9, #8 + vsri.32 q2, q9, #24 + vadd.i32 q8, q8, q2 + vext.32 q2, q2, q2, #1 + veor q3, q8, q3 + vext.32 q8, q8, q8, #2 + vshl.i32 q9, q3, #7 + vsri.32 q9, q3, #25 + vext.32 q9, q9, q9, #3 + bne .L7 + ldr r0, [r5] @ unaligned + vadd.i32 q1, q12, q10 + ldr r1, [r5, #4] @ unaligned + mov ip, lr + ldr r2, [r5, #8] @ unaligned + mov r9, lr + ldr r3, [r5, #12] @ unaligned + mov r10, r5 + vadd.i32 q9, q13, q9 + mov r8, lr + vadd.i32 q8, q14, q8 + stmia ip!, {r0, r1, r2, r3} + mov ip, lr + vldr d20, [r6, #80] + vldr d21, [r6, #88] + vadd.i32 q3, q11, q2 + veor q10, q10, q1 + vadd.i32 q11, q11, q15 + vstr d20, [r6, #80] + vstr d21, [r6, #88] + ldmia r9!, {r0, r1, r2, r3} + mov r9, r5 + str r0, [r4] @ unaligned + str r1, [r4, #4] @ unaligned + str r2, [r4, #8] @ unaligned + str r3, [r4, #12] @ unaligned + ldr r0, [r10, #16]! @ unaligned + ldr r1, [r10, #4] @ unaligned + ldr r2, [r10, #8] @ unaligned + ldr r3, [r10, #12] @ unaligned + add r10, r4, #48 + adds r4, r4, #64 + stmia r8!, {r0, r1, r2, r3} + mov r8, lr + vldr d20, [r6, #80] + vldr d21, [r6, #88] + veor q10, q10, q9 + vstr d20, [r6, #80] + vstr d21, [r6, #88] + ldmia ip!, {r0, r1, r2, r3} + mov ip, lr + str r0, [r4, #-48] @ unaligned + str r1, [r4, #-44] @ unaligned + str r2, [r4, #-40] @ unaligned + str r3, [r4, #-36] @ unaligned + ldr r0, [r9, #32]! @ unaligned + ldr r1, [r9, #4] @ unaligned + ldr r2, [r9, #8] @ unaligned + ldr r3, [r9, #12] @ unaligned + mov r9, r5 + adds r5, r5, #64 + stmia r8!, {r0, r1, r2, r3} + mov r8, lr + vldr d18, [r6, #80] + vldr d19, [r6, #88] + veor q9, q9, q8 + vstr d18, [r6, #80] + vstr d19, [r6, #88] + ldmia ip!, {r0, r1, r2, r3} + mov ip, lr + str r0, [r4, #-32] @ unaligned + str r1, [r4, #-28] @ unaligned + str r2, [r4, #-24] @ unaligned + str r3, [r4, #-20] @ unaligned + ldr r0, [r9, #48]! 
@ unaligned + ldr r1, [r9, #4] @ unaligned + ldr r2, [r9, #8] @ unaligned + ldr r3, [r9, #12] @ unaligned + stmia r8!, {r0, r1, r2, r3} + vldr d16, [r6, #80] + vldr d17, [r6, #88] + veor q8, q8, q3 + vstr d16, [r6, #80] + vstr d17, [r6, #88] + ldmia ip!, {r0, r1, r2, r3} + str r0, [r4, #-16] @ unaligned + str r1, [r4, #-12] @ unaligned + str r3, [r10, #12] @ unaligned + ldr r3, [r7, #108] + str r2, [r10, #8] @ unaligned + cmp r3, #1 + beq .L88 + movs r3, #1 + str r3, [r7, #108] + b .L8 +.L96: + .align 3 +.L95: + .word 1 + .word 0 + .word 0 + .word 0 +.L88: + ldr fp, [r7, #104] + ldr r3, [r7, #12] + lsl fp, fp, #6 + add r3, r3, fp + str r3, [r7, #12] + ldr r3, [r7, #16] + add r3, r3, fp + str r3, [r7, #16] +.L6: + ldr r3, [r7, #8] + ands r9, r3, #63 + beq .L1 + vmov q3, q11 @ v4si + movs r3, #10 + vmov q8, q14 @ v4si + mov r5, r9 + vmov q15, q13 @ v4si + vmov q10, q12 @ v4si +.L10: + vadd.i32 q10, q10, q15 + subs r3, r3, #1 + veor q9, q3, q10 + vrev32.16 q9, q9 + vadd.i32 q8, q8, q9 + veor q15, q8, q15 + vshl.i32 q3, q15, #12 + vsri.32 q3, q15, #20 + vadd.i32 q10, q10, q3 + veor q15, q10, q9 + vshl.i32 q9, q15, #8 + vsri.32 q9, q15, #24 + vadd.i32 q8, q8, q9 + vext.32 q9, q9, q9, #3 + veor q3, q8, q3 + vext.32 q8, q8, q8, #2 + vshl.i32 q15, q3, #7 + vsri.32 q15, q3, #25 + vext.32 q15, q15, q15, #1 + vadd.i32 q10, q10, q15 + veor q9, q10, q9 + vrev32.16 q9, q9 + vadd.i32 q8, q8, q9 + veor q15, q8, q15 + vshl.i32 q3, q15, #12 + vsri.32 q3, q15, #20 + vadd.i32 q10, q10, q3 + vmov q15, q3 @ v4si + veor q9, q10, q9 + vshl.i32 q3, q9, #8 + vsri.32 q3, q9, #24 + vadd.i32 q8, q8, q3 + vext.32 q3, q3, q3, #1 + veor q9, q8, q15 + vext.32 q8, q8, q8, #2 + vshl.i32 q15, q9, #7 + vsri.32 q15, q9, #25 + vext.32 q15, q15, q15, #3 + bne .L10 + cmp r5, #15 + mov r9, r5 + bhi .L89 + vadd.i32 q12, q12, q10 + ldr r3, [r7, #72] + vst1.64 {d24-d25}, [r3:128] +.L14: + ldr r3, [r7, #8] + and r2, r3, #48 + cmp r9, r2 + bls .L1 + ldr r6, [r7, #16] + add r3, r2, #16 + ldr r1, [r7, #12] + rsb ip, r2, r9 + adds r0, r1, r2 + mov r4, r6 + add r1, r1, r3 + add r4, r4, r2 + add r3, r3, r6 + cmp r0, r3 + it cc + cmpcc r4, r1 + ite cs + movcs r3, #1 + movcc r3, #0 + cmp ip, #18 + ite ls + movls r3, #0 + andhi r3, r3, #1 + cmp r3, #0 + beq .L16 + and r1, r0, #7 + mov r3, r2 + negs r1, r1 + and r1, r1, #15 + cmp r1, ip + it cs + movcs r1, ip + cmp r1, #0 + beq .L17 + ldr r5, [r7, #72] + cmp r1, #1 + ldrb r0, [r0] @ zero_extendqisi2 + add r3, r2, #1 + ldrb lr, [r5, r2] @ zero_extendqisi2 + mov r6, r5 + eor r0, lr, r0 + strb r0, [r4] + beq .L17 + ldr r0, [r7, #12] + cmp r1, #2 + ldrb r4, [r5, r3] @ zero_extendqisi2 + ldr r5, [r7, #16] + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #2 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #3 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #3 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #4 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #4 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #5 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #5 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #6 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #6 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #7 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ 
zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #7 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #8 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #8 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #9 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #9 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #10 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #10 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #11 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #11 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #12 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #12 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #13 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #13 + beq .L17 + ldr r0, [r7, #12] + cmp r1, #15 + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #14 + bne .L17 + ldr r0, [r7, #12] + ldrb r4, [r6, r3] @ zero_extendqisi2 + ldrb r0, [r0, r3] @ zero_extendqisi2 + eors r0, r0, r4 + strb r0, [r5, r3] + add r3, r2, #15 +.L17: + rsb r4, r1, ip + add r0, ip, #-1 + sub r6, r4, #16 + subs r0, r0, r1 + cmp r0, #14 + lsr r6, r6, #4 + add r6, r6, #1 + lsl lr, r6, #4 + bls .L19 + add r2, r2, r1 + ldr r1, [r7, #12] + ldr r5, [r7, #16] + cmp r6, #1 + add r0, r1, r2 + ldr r1, [r7, #72] + add r1, r1, r2 + vld1.64 {d18-d19}, [r0:64] + add r2, r2, r5 + vld1.8 {q8}, [r1] + veor q8, q8, q9 + vst1.8 {q8}, [r2] + beq .L20 + add r8, r1, #16 + add ip, r2, #16 + vldr d18, [r0, #16] + vldr d19, [r0, #24] + cmp r6, #2 + vld1.8 {q8}, [r8] + veor q8, q8, q9 + vst1.8 {q8}, [ip] + beq .L20 + add r8, r1, #32 + add ip, r2, #32 + vldr d18, [r0, #32] + vldr d19, [r0, #40] + cmp r6, #3 + vld1.8 {q8}, [r8] + veor q8, q8, q9 + vst1.8 {q8}, [ip] + beq .L20 + adds r1, r1, #48 + adds r2, r2, #48 + vldr d18, [r0, #48] + vldr d19, [r0, #56] + vld1.8 {q8}, [r1] + veor q8, q8, q9 + vst1.8 {q8}, [r2] +.L20: + cmp lr, r4 + add r3, r3, lr + beq .L1 +.L19: + ldr r4, [r7, #72] + adds r2, r3, #1 + ldr r1, [r7, #12] + cmp r2, r9 + ldr r5, [r7, #16] + ldrb r0, [r4, r3] @ zero_extendqisi2 + ldrb r1, [r1, r3] @ zero_extendqisi2 + eor r1, r1, r0 + strb r1, [r5, r3] + bcs .L1 + ldr r0, [r7, #12] + adds r1, r3, #2 + mov r6, r4 + cmp r9, r1 + ldrb r4, [r4, r2] @ zero_extendqisi2 + ldrb r0, [r0, r2] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r2] + bls .L1 + ldr r0, [r7, #12] + adds r2, r3, #3 + ldrb r4, [r6, r1] @ zero_extendqisi2 + cmp r9, r2 + ldrb r0, [r0, r1] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r1] + bls .L1 + ldr r0, [r7, #12] + adds r1, r3, #4 + ldrb r4, [r6, r2] @ zero_extendqisi2 + cmp r9, r1 + ldrb r0, [r0, r2] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r2] + bls .L1 + ldr r0, [r7, #12] + adds r2, r3, #5 + ldrb r4, [r6, r1] @ zero_extendqisi2 + cmp r9, r2 + ldrb r0, [r0, r1] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r1] + bls .L1 + ldr r0, [r7, #12] + adds r1, r3, #6 + ldrb r4, [r6, r2] @ zero_extendqisi2 + cmp r9, r1 + ldrb r0, [r0, r2] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r2] + bls .L1 + ldr r0, [r7, #12] + adds r2, r3, #7 + ldrb r4, [r6, 
r1] @ zero_extendqisi2 + cmp r9, r2 + ldrb r0, [r0, r1] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r1] + bls .L1 + ldr r0, [r7, #12] + add r1, r3, #8 + ldrb r4, [r6, r2] @ zero_extendqisi2 + cmp r9, r1 + ldrb r0, [r0, r2] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r2] + bls .L1 + ldr r0, [r7, #12] + add r2, r3, #9 + ldrb r4, [r6, r1] @ zero_extendqisi2 + cmp r9, r2 + ldrb r0, [r0, r1] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r1] + bls .L1 + ldr r0, [r7, #12] + add r1, r3, #10 + ldrb r4, [r6, r2] @ zero_extendqisi2 + cmp r9, r1 + ldrb r0, [r0, r2] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r2] + bls .L1 + ldr r0, [r7, #12] + add r2, r3, #11 + ldrb r4, [r6, r1] @ zero_extendqisi2 + cmp r9, r2 + ldrb r0, [r0, r1] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r1] + bls .L1 + ldr r0, [r7, #12] + add r1, r3, #12 + ldrb r4, [r6, r2] @ zero_extendqisi2 + cmp r9, r1 + ldrb r0, [r0, r2] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r2] + bls .L1 + ldr r0, [r7, #12] + add r2, r3, #13 + ldrb r4, [r6, r1] @ zero_extendqisi2 + cmp r9, r2 + ldrb r0, [r0, r1] @ zero_extendqisi2 + eor r0, r0, r4 + strb r0, [r5, r1] + bls .L1 + ldr r1, [r7, #12] + adds r3, r3, #14 + ldrb r0, [r6, r2] @ zero_extendqisi2 + cmp r9, r3 + ldrb r1, [r1, r2] @ zero_extendqisi2 + eor r1, r1, r0 + strb r1, [r5, r2] + bls .L1 + ldr r2, [r7, #72] + ldrb r1, [r2, r3] @ zero_extendqisi2 + ldr r2, [r7, #12] + ldrb r2, [r2, r3] @ zero_extendqisi2 + eors r2, r2, r1 + ldr r1, [r7, #16] + strb r2, [r1, r3] +.L1: + adds r7, r7, #132 + mov sp, r7 + @ sp needed + vldm sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, fp, pc} +.L89: + ldr r4, [r7, #12] + vadd.i32 q12, q12, q10 + ldr r5, [r7, #72] + cmp r9, #31 + ldr r0, [r4] @ unaligned + add r6, r5, #80 + ldr r1, [r4, #4] @ unaligned + ldr r2, [r4, #8] @ unaligned + mov r5, r6 + ldr r3, [r4, #12] @ unaligned + mov r4, r6 + str r6, [r7, #68] + stmia r6!, {r0, r1, r2, r3} + ldr r2, [r7, #72] + ldr r6, [r7, #16] + vldr d18, [r2, #80] + vldr d19, [r2, #88] + veor q9, q9, q12 + vstr d18, [r2, #80] + vstr d19, [r2, #88] + ldmia r4!, {r0, r1, r2, r3} + str r1, [r6, #4] @ unaligned + mov r1, r6 + str r0, [r6] @ unaligned + str r2, [r6, #8] @ unaligned + str r3, [r6, #12] @ unaligned + bhi .L90 + vadd.i32 q13, q13, q15 + ldr r3, [r7, #72] + vstr d26, [r3, #16] + vstr d27, [r3, #24] + b .L14 +.L16: + subs r3, r2, #1 + ldr r2, [r7, #12] + add r2, r2, r9 + mov r5, r2 + ldr r2, [r7, #72] + add r2, r2, r3 + mov r3, r2 +.L24: + ldrb r1, [r0], #1 @ zero_extendqisi2 + ldrb r2, [r3, #1]! @ zero_extendqisi2 + cmp r0, r5 + eor r2, r2, r1 + strb r2, [r4], #1 + bne .L24 + adds r7, r7, #132 + mov sp, r7 + @ sp needed + vldm sp!, {d8-d15} + pop {r4, r5, r6, r7, r8, r9, r10, fp, pc} +.L26: + str fp, [r7, #16] + b .L2 +.L90: + ldr r3, [r7, #12] + add lr, r1, #16 + mov r4, r5 + mov r6, r5 + mov r5, r1 + vadd.i32 q13, q13, q15 + ldr r0, [r3, #16]! 
@ unaligned + cmp r9, #47 + ldr r1, [r3, #4] @ unaligned + ldr r2, [r3, #8] @ unaligned + ldr r3, [r3, #12] @ unaligned + stmia r6!, {r0, r1, r2, r3} + ldr r2, [r7, #72] + vldr d18, [r2, #80] + vldr d19, [r2, #88] + veor q13, q9, q13 + vstr d26, [r2, #80] + vstr d27, [r2, #88] + ldmia r4!, {r0, r1, r2, r3} + str r0, [r5, #16] @ unaligned + str r1, [lr, #4] @ unaligned + str r2, [lr, #8] @ unaligned + str r3, [lr, #12] @ unaligned + bhi .L91 + vadd.i32 q8, q14, q8 + ldr r3, [r7, #72] + vstr d16, [r3, #32] + vstr d17, [r3, #40] + b .L14 +.L91: + ldr r3, [r7, #12] + add lr, r5, #32 + ldr r4, [r7, #68] + vadd.i32 q8, q14, q8 + ldr r5, [r7, #72] + vadd.i32 q11, q11, q3 + ldr r0, [r3, #32]! @ unaligned + mov r6, r4 + vstr d22, [r5, #48] + vstr d23, [r5, #56] + ldr r1, [r3, #4] @ unaligned + ldr r2, [r3, #8] @ unaligned + ldr r3, [r3, #12] @ unaligned + stmia r4!, {r0, r1, r2, r3} + vldr d18, [r5, #80] + vldr d19, [r5, #88] + veor q9, q9, q8 + ldr r4, [r7, #16] + vstr d18, [r5, #80] + vstr d19, [r5, #88] + ldmia r6!, {r0, r1, r2, r3} + str r0, [r4, #32] @ unaligned + str r1, [lr, #4] @ unaligned + str r2, [lr, #8] @ unaligned + str r3, [lr, #12] @ unaligned + b .L14 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon + .section .rodata + .align 2 +.LANCHOR0 = . + 0 +.LC0: + .word 1634760805 + .word 857760878 + .word 2036477234 + .word 1797285236 + .ident "GCC: (Linaro GCC 2014.11) 4.9.3 20141031 (prerelease)" + .section .note.GNU-stack,"",%progbits + +#endif /* !OPENSSL_NO_ASM */ |
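
For reference, below is a minimal usage sketch of the CRYPTO_chacha_20 entry point introduced by this change (on NEON-capable ARM it dispatches to CRYPTO_chacha_20_neon at run time). The snippet is illustrative only and not part of the commit; the all-zero key, all-zero nonce, and sample plaintext are placeholder values.

    #include <stdint.h>
    #include <string.h>

    #include <openssl/chacha.h>

    int main(void) {
      /* Placeholder 256-bit key and 64-bit nonce; real callers must use
       * unique, randomly generated values. */
      static const uint8_t key[32] = {0};
      static const uint8_t nonce[8] = {0};

      uint8_t plaintext[64] = "example plaintext";
      uint8_t ciphertext[64];
      uint8_t recovered[64];

      /* counter selects the starting 64-byte keystream block; 0 starts a
       * fresh stream. Encryption and decryption are the same XOR operation. */
      CRYPTO_chacha_20(ciphertext, plaintext, sizeof(plaintext), key, nonce, 0);

      /* Applying the same keystream again recovers the plaintext. */
      CRYPTO_chacha_20(recovered, ciphertext, sizeof(ciphertext), key, nonce, 0);

      return memcmp(recovered, plaintext, sizeof(plaintext)) != 0;
    }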