summaryrefslogtreecommitdiffstats
path: root/win-x86_64/crypto/sha/sha512-x86_64.asm
diff options
context:
space:
mode:
authorAdam Langley <agl@chromium.org>2016-01-13 15:00:54 -0800
committerAdam Langley <agl@chromium.org>2016-01-14 13:09:44 -0800
commit4139edb02e59e7ad48e0a8f4c02e45923bc8a344 (patch)
tree80b47f41b8e3971267452f49e48560c9c36434e2 /win-x86_64/crypto/sha/sha512-x86_64.asm
parent55181dbbcdc86b9abed8bd900f1041344211663c (diff)
downloadexternal_boringssl-4139edb02e59e7ad48e0a8f4c02e45923bc8a344.zip
external_boringssl-4139edb02e59e7ad48e0a8f4c02e45923bc8a344.tar.gz
external_boringssl-4139edb02e59e7ad48e0a8f4c02e45923bc8a344.tar.bz2
external/boringssl: sync to 7b8b9c17
This includes the following changes from BoringSSL : 7b8b9c1 Include 'asm' in the name of X25519 asm sources. 3202750 Update the fuzz tests for the server. 6544426 Fix a ** 0 mod 1 = 0 for real this time. fe5f7c7 Only reserve EVP_MAX_MD_SIZE for the Finished, not twice of it. 0d56f88 Switch s to ssl everywhere. 974c7ba Route DHE through the SSL_ECDH abstraction as well. 4cc36ad Make it possible to tell what curve was used on the server. 4298d77 Implement draft-ietf-tls-curve25519-01 in C. c18ef75 Allocate a NID for X25519. 3a2a480 Remove long-dead comment. cba2b62 Implement draft-ietf-tls-curve25519-01 in Go. ab14563 Bundle a copy of golang.org/x/crypto/curve25519 for testing. a029ebc Switch the bundled poly1305 to relative imports. 64d9250 Completely remove P-224 from the TLS stack. 8c2b3bf Test all supported curves (including those off by default). fc82512 Convert ssl3_send_cert_verify to CBB. 5fb18c6 Make MSVC happy. 2a0b391 Rewrite ssl3_send_server_key_exchange to use CBB. d16bf34 Add a -lldb flag to runner.go. af21bcf Remove other unnecessary BN_CTX allocations. ae0eaaa Convert ssl3_send_client_key_exchange to CBB. 3ac4b3a Remove NO_ASM define that I accidently included in the previous commit. e6c5402 Don't build X25519 asm code when NO_ASM is set. 77a173e Add x86-64 assembly for X25519. c75c0ae Add #defines for ED25519 key and signature lengths. 48cce66 Tidy up ssl3_get_server_key_exchange slightly. c1cc858 Check for EC_KEY_set_public_key error. 4cc671c Add CBB_reserve and CBB_did_write. e13263d Resolve a few old TODOs. 841934f Remove stack macros for nonexistent types. 70ab223 Remove ASN1_R_MALLOC_FAILURE. b965c63 Reject calls to X509_verify_cert that have not been reinitialised 3f5b43d Simplify RSA key exchange padding check. 3ef6085 Refuse to parse RSA pubkeys with invalid exponents. afe57cb Add a tool to generate Ed25519 keys. 77c3c0b Enable Ed25519 when building with OPENSSL_SMALL. 9f897b2 Remove the stitched RC4-MD5 code and use the generic one. 1741a9d Save some mallocs in computing the MAC for e_tls.c. df57163 Add RC4-SHA1 and DES-EDE3-CBC-SHA1 to bssl speed. 13414b3 Implement draft-ietf-tls-chacha20-poly1305-04. 3748990 Implement draft-ietf-tls-chacha20-poly1305-04 in Go. 2089fdd Implement RFC 7539 in Go. 86e412d Add client cert support to bssl client. 23a681b Fix build. e320392 Rename the Go ChaCha20-Poly1305 implementation. 8ffab72 Point EVP_aead_chacha20_poly1305 at the standardized version. fef6fb5 Fix ChaCha20-Poly1305 tests. 60a08ac Remove unreachable code to duplicate DH keys. 4ec0cce Slightly tweak some array allocations. 2936170 Fix memory leak in DSA redo case. a01deee Make CBB_len relative to its argument. 77385bb Mark platform-specific HOST_[c2l|l2c] as (void). 6969971 Remove a dead prototype. 1b36716 Remove crypto/header_removed.h. 017231a Remove asm __asm__ define. 793c21e Make HOST_l2c return void. 0aff3ff Store the partial block as uint8_t, not uint32_t. 5a19d7d Use the straight-forward ROTATE macro. 78fefbf Reformat md32_common.h, part 2. fea1137 Reformat md32_common.h, part 1. 871fff0 *_Update of length zero is legal. d9f0671 Remove |need_record_splitting| from |SSL3_STATE|. cd48038 Remove unused fields from SSL3_STATE. 7fc0100 Slightly simplify SSL3_RECORD. ece5ba2 Reset ssl error codes. a41280d Pull ChangeCipherSpec into the handshake state machine. 8fd5c23 Simplify fragmented HelloRequest state. ef5dfd2 Add tests for malformed HelloRequests. 8411b24 Add tests for bad ChangeCipherSpecs. 502a843 Switch unrolled loop in BN_usub with memcpy. c3ae38b Remove DH EVP_PKEY hooks. 7100ee9 Chromium's update.sh is dead, long live update.py f28dd64 Fix flaky BadRSAClientKeyExchange-1 test. 4234885 Remove unused functions. 45dab25 Skip free callbacks on empty CRYPTO_EX_DATAs. 8a58933 Remove the CRYPTO_EX_new callback. 0abd6f2 Get struct timeval from sys/time.h. 1246670 Use UINT64_C in sha512.c table. 5ddffbb Make SSL_(CTX_)?set_tmp_ecdh call SSL_(CTX_)?set1_curves. 53e5c2c Remove SSL_(CTX_)?set_ecdh_callback. 756ad17 Initialize |one_index| in OAEP padding check. 1634a33 Convert rsa/padding.c to constant-time helpers. b36a395 Add slightly better RSA key exchange tests. 0bd71eb Remove weird ret negation logic. e9cddb8 Remove SSL_OP_LEGACY_SERVER_CONNECT. 3e052de Tighten SSL_OP_LEGACY_SERVER_CONNECT to align with RFC 5746. 03f0005 Remove SSL_OP_MICROSOFT_BIG_SSLV3_BUFFER. ef5e515 Remove SSL_OP_TLS_D5_BUG. c100ef4 Limit depth of ASN1 parse printing. 2205093 Add a comment in SetTestState from bssl_shim. 6ae67df Don't leak Android hacks to other build platforms. a0ef7b0 Enforce that |EC_KEY| private key is in [0, group->order). 533a273 Add |EC_METHOD| method for verifying public key order. a3d9de0 Add |EC_GROUP_get0_order| to replace |EC_GROUP_get_order|. 8847856 Include <sys/time.h> in packeted_bio.h for 'timeval' dca63cf Don't abort in |init_once| if |fcntl| returns ENOSYS afd565f Add defines for SRTP profiles using GCM ciphers from RFC 7714. 902870e Gate SHA_CTX compatibility on !WINDOWS. 34aa55c Support the SHA_CTX hack without ANDROID. 6d9e5a7 Re-apply 75b833cc819a9d189adb0fdd56327bee600ff9e9 28243c0 Add PSS parameter check. e701f16 bn/asm/x86_64-mont5.pl: fix carry propagating bug (CVE-2015-3193). cb85298 Fix leak with ASN.1 combine. c4f25ce Work around yaSSL bug. c5eb467 Remove dead code in p256-x86_64. 758d127 Add get0 getters for EVP_PKEY. fde89b4 avoid clashes with libc's 'open' in e_chacha20poly1305.c 60a45aa Remove reference to removed |RSA_FLAG_NO_CONSTTIME| flag. 81edc9b Do away with BN_LLONG in favor of BN_ULLONG. e8fe07f Fix AES XTS mode key size. 93a5b44 Make CRYPTO_library_init use a CRYPTO_once_t. bf76218 Remove the |ri| field of |BN_MONT_CTX|. 596ab10 s/BN_BITS/BN_BITS2/ in |BN_mod_inverse_ex|; remove |BN_BITS| & |BN_MASK|. 7af36e1 Share common definitions of |TOBN| and |BIGNUM_STATIC|. ff2df33 Reformat the cipher suite table. 9f2e277 Remove strength_bits. d6e9eec Remove algo_strength. dcb6ef0 Remove algorithm_ssl. d28f59c Switch the keylog BIO to a callback. fba735c Register the *25519 tests as dependencies of all_tests. f3376ac Remove |EC_POINTs_mul| & simplify p256-x86_64. 301efc8 Fix error handling in |p256-x86_64|. e2136d9 Remove |EC_GROUP_precompute_mult| and |EC_KEY_precompute_mult|. 9b26297 Make |EC_GROUP_precompute_mult|/|EC_KEY_precompute_mult| no-ops. 5058d79 Remove p224-64 and p256-64 dead code for non-default generators. b1b6229 Add NEON implementation of curve25519. 9e65d48 Allow |CRYPTO_is_NEON_capable| to be known at compile time, if possible. 3ac32b1 Fix curve25519 code for MSVC. 4fb0dc4 Add X25519 and Ed25519 support. c324f17 Make sure pthread_once() succeeds. 9361243 Don't include <alloca.h>, it's no longer needed. b00061c Add SSL_CIPHER_is_AES[128|256]CBC. 3a59611 size_t SSL*_use_*_ASN1. b324159 Fix ssl3_send_server_key_exchange error path. f584a5a Reset epoch state in one place. 2077cf9 Use UINT64_C instead of OPENSSL_U64. af07365 Check for overflow when parsing a CBS with d2i_*. 780cd92 modes/asm/ghash-armv4.pl: extend Apple fix to all clang cases. f9c77de Drop CBB allocation failure test. a33915d Have |CBB_init| zero the |CBB| before any possible failures. c5c85de Make RAND_seed read a byte of random data. d9e2702 Don't encode or decode ∞. e7806fd Remove point-on-curve check from |ec_GFp_simple_oct2point|. 20c3731 Become partially -Wmissing-variable-declarations-clean. 7308aaa Remove `EC_GFp_simple_method` (dead code). f872951 Fix null pointer dereference when using "simple" EC. 8bde5d2 Remove the unused |Ni| member of |BN_MONT_CTX|. ce7ae6f Enable AVX code for SHA-*. 9f1f04f Remove nistz256 dead code for non-default generators. d7421eb Remove condition which always evaluates to true (size_t >= 0). d386394 Test for underflow before subtraction. ef14b2d Remove stl_compat.h. cd24a39 Limit DHE groups to 4096-bit. 99fdfb9 Move curve check out of tls12_check_peer_sigalg. Change-Id: Id2d7110569d250b1bae8f8ce7d4421a92f581a31
Diffstat (limited to 'win-x86_64/crypto/sha/sha512-x86_64.asm')
-rw-r--r--win-x86_64/crypto/sha/sha512-x86_64.asm2301
1 files changed, 2301 insertions, 0 deletions
diff --git a/win-x86_64/crypto/sha/sha512-x86_64.asm b/win-x86_64/crypto/sha/sha512-x86_64.asm
index b76cc0e..71449cd 100644
--- a/win-x86_64/crypto/sha/sha512-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha512-x86_64.asm
@@ -19,6 +19,17 @@ $L$SEH_begin_sha512_block_data_order:
mov rdx,r8
+ lea r11,[OPENSSL_ia32cap_P]
+ mov r9d,DWORD[r11]
+ mov r10d,DWORD[4+r11]
+ mov r11d,DWORD[8+r11]
+ test r10d,2048
+ jnz NEAR $L$xop_shortcut
+ and r9d,1073741824
+ and r10d,268435968
+ or r10d,r9d
+ cmp r10d,1342177792
+ je NEAR $L$avx_shortcut
push rbx
push rbp
push r12
@@ -1801,6 +1812,2282 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
DB 111,114,103,62,0
+
+ALIGN 64
+sha512_block_data_order_xop:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_xop:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+$L$xop_shortcut:
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r11,rsp
+ shl rdx,4
+ sub rsp,256
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[((128+24))+rsp],r11
+ movaps XMMWORD[(128+32)+rsp],xmm6
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_xop:
+
+ vzeroupper
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_xop
+ALIGN 16
+$L$loop_xop:
+ vmovdqa xmm11,XMMWORD[((K512+1280))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ lea rbp,[((K512+128))]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
+ vmovdqa XMMWORD[rsp],xmm8
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8
+ jmp NEAR $L$xop_00_47
+
+ALIGN 16
+$L$xop_00_47:
+ add rbp,256
+ vpalignr xmm8,xmm1,xmm0,8
+ ror r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm5,xmm4,8
+ mov r12,r9
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r8
+ xor r12,r10
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rax
+ vpaddq xmm0,xmm0,xmm11
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+DB 143,72,120,195,209,7
+ xor r12,r10
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,223,3
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ ror r14,28
+ vpsrlq xmm10,xmm7,6
+ add rdx,r11
+ add r11,rdi
+ vpaddq xmm0,xmm0,xmm8
+ mov r13,rdx
+ add r14,r11
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r11,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ vpaddq xmm0,xmm0,xmm11
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10
+ vpalignr xmm8,xmm2,xmm1,8
+ ror r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rcx
+ xor r12,r8
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r10
+ vpaddq xmm1,xmm1,xmm11
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+DB 143,72,120,195,209,7
+ xor r12,r8
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,216,3
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ ror r14,28
+ vpsrlq xmm10,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpaddq xmm1,xmm1,xmm8
+ mov r13,rbx
+ add r14,r9
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r9,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ vpaddq xmm1,xmm1,xmm11
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10
+ vpalignr xmm8,xmm3,xmm2,8
+ ror r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rax
+ xor r12,rcx
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r8
+ vpaddq xmm2,xmm2,xmm11
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+DB 143,72,120,195,209,7
+ xor r12,rcx
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,217,3
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ ror r14,28
+ vpsrlq xmm10,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpaddq xmm2,xmm2,xmm8
+ mov r13,r11
+ add r14,rdx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rdx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ vpaddq xmm2,xmm2,xmm11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpalignr xmm8,xmm4,xmm3,8
+ ror r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r10
+ xor r12,rax
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rcx
+ vpaddq xmm3,xmm3,xmm11
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+DB 143,72,120,195,209,7
+ xor r12,rax
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,218,3
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ ror r14,28
+ vpsrlq xmm10,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpaddq xmm3,xmm3,xmm8
+ mov r13,r9
+ add r14,rbx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rbx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ vpaddq xmm3,xmm3,xmm11
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10
+ vpalignr xmm8,xmm5,xmm4,8
+ ror r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r8
+ xor r12,r10
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rax
+ vpaddq xmm4,xmm4,xmm11
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+DB 143,72,120,195,209,7
+ xor r12,r10
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,219,3
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ ror r14,28
+ vpsrlq xmm10,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpaddq xmm4,xmm4,xmm8
+ mov r13,rdx
+ add r14,r11
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r11,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ vpaddq xmm4,xmm4,xmm11
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10
+ vpalignr xmm8,xmm6,xmm5,8
+ ror r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rcx
+ xor r12,r8
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r10
+ vpaddq xmm5,xmm5,xmm11
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+DB 143,72,120,195,209,7
+ xor r12,r8
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,220,3
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ ror r14,28
+ vpsrlq xmm10,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpaddq xmm5,xmm5,xmm8
+ mov r13,rbx
+ add r14,r9
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r9,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ vpaddq xmm5,xmm5,xmm11
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10
+ vpalignr xmm8,xmm7,xmm6,8
+ ror r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rax
+ xor r12,rcx
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r8
+ vpaddq xmm6,xmm6,xmm11
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+DB 143,72,120,195,209,7
+ xor r12,rcx
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,221,3
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ ror r14,28
+ vpsrlq xmm10,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpaddq xmm6,xmm6,xmm8
+ mov r13,r11
+ add r14,rdx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rdx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ vpaddq xmm6,xmm6,xmm11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ vpalignr xmm8,xmm0,xmm7,8
+ ror r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r10
+ xor r12,rax
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rcx
+ vpaddq xmm7,xmm7,xmm11
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+DB 143,72,120,195,209,7
+ xor r12,rax
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,222,3
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ ror r14,28
+ vpsrlq xmm10,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpaddq xmm7,xmm7,xmm8
+ mov r13,r9
+ add r14,rbx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rbx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ vpaddq xmm7,xmm7,xmm11
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10
+ cmp BYTE[135+rbp],0
+ jne NEAR $L$xop_00_47
+ ror r13,23
+ mov rax,r14
+ mov r12,r9
+ ror r14,5
+ xor r13,r8
+ xor r12,r10
+ ror r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ ror r14,6
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ ror r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ ror r13,23
+ mov r11,r14
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ ror r13,23
+ mov r10,r14
+ mov r12,rdx
+ ror r14,5
+ xor r13,rcx
+ xor r12,r8
+ ror r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ ror r14,6
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ ror r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ ror r13,23
+ mov r9,r14
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ ror r13,23
+ mov r8,r14
+ mov r12,rbx
+ ror r14,5
+ xor r13,rax
+ xor r12,rcx
+ ror r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ ror r14,6
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ ror r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ ror r13,23
+ mov rdx,r14
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ ror r13,23
+ mov rcx,r14
+ mov r12,r11
+ ror r14,5
+ xor r13,r10
+ xor r12,rax
+ ror r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ ror r14,6
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ ror r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ ror r13,23
+ mov rbx,r14
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ ror r13,23
+ mov rax,r14
+ mov r12,r9
+ ror r14,5
+ xor r13,r8
+ xor r12,r10
+ ror r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ ror r14,6
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ ror r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ ror r13,23
+ mov r11,r14
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ ror r13,23
+ mov r10,r14
+ mov r12,rdx
+ ror r14,5
+ xor r13,rcx
+ xor r12,r8
+ ror r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ ror r14,6
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ ror r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ ror r13,23
+ mov r9,r14
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ ror r13,23
+ mov r8,r14
+ mov r12,rbx
+ ror r14,5
+ xor r13,rax
+ xor r12,rcx
+ ror r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ ror r14,6
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ ror r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ ror r13,23
+ mov rdx,r14
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ ror r13,23
+ mov rcx,r14
+ mov r12,r11
+ ror r14,5
+ xor r13,r10
+ xor r12,rax
+ ror r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ ror r14,6
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ ror r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ ror r13,23
+ mov rbx,r14
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ mov rdi,QWORD[((128+0))+rsp]
+ mov rax,r14
+
+ add rax,QWORD[rdi]
+ lea rsi,[128+rsi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_xop
+
+ mov rsi,QWORD[((128+24))+rsp]
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp]
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r13,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ mov rbp,QWORD[32+rsi]
+ mov rbx,QWORD[40+rsi]
+ lea rsp,[48+rsi]
+$L$epilogue_xop:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha512_block_data_order_xop:
+
+ALIGN 64
+sha512_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+$L$avx_shortcut:
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r11,rsp
+ shl rdx,4
+ sub rsp,256
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[((128+24))+rsp],r11
+ movaps XMMWORD[(128+32)+rsp],xmm6
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx:
+
+ vzeroupper
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm11,XMMWORD[((K512+1280))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ lea rbp,[((K512+128))]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
+ vmovdqa XMMWORD[rsp],xmm8
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47:
+ add rbp,256
+ vpalignr xmm8,xmm1,xmm0,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm5,xmm4,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm0,xmm0,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm7,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm7,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm0,xmm0,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm7,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm0,xmm0,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10
+ vpalignr xmm8,xmm2,xmm1,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm1,xmm1,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm0,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm1,xmm1,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm0,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm1,xmm1,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10
+ vpalignr xmm8,xmm3,xmm2,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm2,xmm2,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm1,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm2,xmm2,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm1,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm2,xmm2,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpalignr xmm8,xmm4,xmm3,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm3,xmm3,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm2,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm3,xmm3,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm2,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm3,xmm3,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10
+ vpalignr xmm8,xmm5,xmm4,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm4,xmm4,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm3,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm4,xmm4,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm3,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm4,xmm4,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10
+ vpalignr xmm8,xmm6,xmm5,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm5,xmm5,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm4,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm5,xmm5,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm4,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm5,xmm5,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10
+ vpalignr xmm8,xmm7,xmm6,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm6,xmm6,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm5,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm6,xmm6,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm5,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm6,xmm6,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ vpalignr xmm8,xmm0,xmm7,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm7,xmm7,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm6,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm7,xmm7,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm6,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm7,xmm7,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10
+ cmp BYTE[135+rbp],0
+ jne NEAR $L$avx_00_47
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ mov rdi,QWORD[((128+0))+rsp]
+ mov rax,r14
+
+ add rax,QWORD[rdi]
+ lea rsi,[128+rsi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_avx
+
+ mov rsi,QWORD[((128+24))+rsp]
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp]
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r13,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ mov rbp,QWORD[32+rsi]
+ mov rbx,QWORD[40+rsi]
+ lea rsp,[48+rsi]
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha512_block_data_order_avx:
EXTERN __imp_RtlVirtualUnwind
ALIGN 16
@@ -1903,9 +4190,23 @@ ALIGN 4
DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase
DD $L$SEH_end_sha512_block_data_order wrt ..imagebase
DD $L$SEH_info_sha512_block_data_order wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha512_block_data_order:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_xop:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase