summaryrefslogtreecommitdiffstats
path: root/win-x86_64/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'win-x86_64/crypto')
-rw-r--r--win-x86_64/crypto/bn/x86_64-mont5.asm9
-rw-r--r--win-x86_64/crypto/ec/p256-x86_64-asm.asm285
-rw-r--r--win-x86_64/crypto/rc4/rc4-md5-x86_64.asm1372
-rw-r--r--win-x86_64/crypto/sha/sha1-x86_64.asm1152
-rw-r--r--win-x86_64/crypto/sha/sha256-x86_64.asm1088
-rw-r--r--win-x86_64/crypto/sha/sha512-x86_64.asm2301
6 files changed, 4550 insertions, 1657 deletions
diff --git a/win-x86_64/crypto/bn/x86_64-mont5.asm b/win-x86_64/crypto/bn/x86_64-mont5.asm
index 284318a..560b384 100644
--- a/win-x86_64/crypto/bn/x86_64-mont5.asm
+++ b/win-x86_64/crypto/bn/x86_64-mont5.asm
@@ -1616,6 +1616,15 @@ $L$8x_tail:
ALIGN 32
$L$8x_tail_done:
add r8,QWORD[rdx]
+ adc r9,0
+ adc r10,0
+ adc r11,0
+ adc r12,0
+ adc r13,0
+ adc r14,0
+ adc r15,0
+
+
xor rax,rax
neg rsi
diff --git a/win-x86_64/crypto/ec/p256-x86_64-asm.asm b/win-x86_64/crypto/ec/p256-x86_64-asm.asm
index c9789f5..45ba686 100644
--- a/win-x86_64/crypto/ec/p256-x86_64-asm.asm
+++ b/win-x86_64/crypto/ec/p256-x86_64-asm.asm
@@ -11,10 +11,6 @@ ALIGN 64
$L$poly:
DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
-
-$L$RR:
- DQ 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
-
$L$One:
DD 1,1,1,1,1,1,1,1
$L$Two:
@@ -24,7 +20,6 @@ $L$Three:
$L$ONE_mont:
DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
-global ecp_nistz256_mul_by_2
ALIGN 64
ecp_nistz256_mul_by_2:
@@ -78,266 +73,6 @@ $L$SEH_end_ecp_nistz256_mul_by_2:
-global ecp_nistz256_div_by_2
-
-ALIGN 32
-ecp_nistz256_div_by_2:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_div_by_2:
- mov rdi,rcx
- mov rsi,rdx
-
-
- push r12
- push r13
-
- mov r8,QWORD[rsi]
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov rax,r8
- mov r11,QWORD[24+rsi]
- lea rsi,[$L$poly]
-
- mov rdx,r9
- xor r13,r13
- add r8,QWORD[rsi]
- mov rcx,r10
- adc r9,QWORD[8+rsi]
- adc r10,QWORD[16+rsi]
- mov r12,r11
- adc r11,QWORD[24+rsi]
- adc r13,0
- xor rsi,rsi
- test rax,1
-
- cmovz r8,rax
- cmovz r9,rdx
- cmovz r10,rcx
- cmovz r11,r12
- cmovz r13,rsi
-
- mov rax,r9
- shr r8,1
- shl rax,63
- mov rdx,r10
- shr r9,1
- or r8,rax
- shl rdx,63
- mov rcx,r11
- shr r10,1
- or r9,rdx
- shl rcx,63
- shr r11,1
- shl r13,63
- or r10,rcx
- or r11,r13
-
- mov QWORD[rdi],r8
- mov QWORD[8+rdi],r9
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
-
- pop r13
- pop r12
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_div_by_2:
-
-
-
-global ecp_nistz256_mul_by_3
-
-ALIGN 32
-ecp_nistz256_mul_by_3:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_mul_by_3:
- mov rdi,rcx
- mov rsi,rdx
-
-
- push r12
- push r13
-
- mov r8,QWORD[rsi]
- xor r13,r13
- mov r9,QWORD[8+rsi]
- add r8,r8
- mov r10,QWORD[16+rsi]
- adc r9,r9
- mov r11,QWORD[24+rsi]
- mov rax,r8
- adc r10,r10
- adc r11,r11
- mov rdx,r9
- adc r13,0
-
- sub r8,-1
- mov rcx,r10
- sbb r9,QWORD[(($L$poly+8))]
- sbb r10,0
- mov r12,r11
- sbb r11,QWORD[(($L$poly+24))]
- test r13,r13
-
- cmovz r8,rax
- cmovz r9,rdx
- cmovz r10,rcx
- cmovz r11,r12
-
- xor r13,r13
- add r8,QWORD[rsi]
- adc r9,QWORD[8+rsi]
- mov rax,r8
- adc r10,QWORD[16+rsi]
- adc r11,QWORD[24+rsi]
- mov rdx,r9
- adc r13,0
-
- sub r8,-1
- mov rcx,r10
- sbb r9,QWORD[(($L$poly+8))]
- sbb r10,0
- mov r12,r11
- sbb r11,QWORD[(($L$poly+24))]
- test r13,r13
-
- cmovz r8,rax
- cmovz r9,rdx
- mov QWORD[rdi],r8
- cmovz r10,rcx
- mov QWORD[8+rdi],r9
- cmovz r11,r12
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
-
- pop r13
- pop r12
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_mul_by_3:
-
-
-
-global ecp_nistz256_add
-
-ALIGN 32
-ecp_nistz256_add:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_add:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
- push r12
- push r13
-
- mov r8,QWORD[rsi]
- xor r13,r13
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov r11,QWORD[24+rsi]
- lea rsi,[$L$poly]
-
- add r8,QWORD[rdx]
- adc r9,QWORD[8+rdx]
- mov rax,r8
- adc r10,QWORD[16+rdx]
- adc r11,QWORD[24+rdx]
- mov rdx,r9
- adc r13,0
-
- sub r8,QWORD[rsi]
- mov rcx,r10
- sbb r9,QWORD[8+rsi]
- sbb r10,QWORD[16+rsi]
- mov r12,r11
- sbb r11,QWORD[24+rsi]
- test r13,r13
-
- cmovz r8,rax
- cmovz r9,rdx
- mov QWORD[rdi],r8
- cmovz r10,rcx
- mov QWORD[8+rdi],r9
- cmovz r11,r12
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
-
- pop r13
- pop r12
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_add:
-
-
-
-global ecp_nistz256_sub
-
-ALIGN 32
-ecp_nistz256_sub:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_sub:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
- push r12
- push r13
-
- mov r8,QWORD[rsi]
- xor r13,r13
- mov r9,QWORD[8+rsi]
- mov r10,QWORD[16+rsi]
- mov r11,QWORD[24+rsi]
- lea rsi,[$L$poly]
-
- sub r8,QWORD[rdx]
- sbb r9,QWORD[8+rdx]
- mov rax,r8
- sbb r10,QWORD[16+rdx]
- sbb r11,QWORD[24+rdx]
- mov rdx,r9
- sbb r13,0
-
- add r8,QWORD[rsi]
- mov rcx,r10
- adc r9,QWORD[8+rsi]
- adc r10,QWORD[16+rsi]
- mov r12,r11
- adc r11,QWORD[24+rsi]
- test r13,r13
-
- cmovz r8,rax
- cmovz r9,rdx
- mov QWORD[rdi],r8
- cmovz r10,rcx
- mov QWORD[8+rdi],r9
- cmovz r11,r12
- mov QWORD[16+rdi],r10
- mov QWORD[24+rdi],r11
-
- pop r13
- pop r12
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-$L$SEH_end_ecp_nistz256_sub:
-
-
-
global ecp_nistz256_neg
ALIGN 32
@@ -395,26 +130,6 @@ $L$SEH_end_ecp_nistz256_neg:
-global ecp_nistz256_to_mont
-
-ALIGN 32
-ecp_nistz256_to_mont:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_ecp_nistz256_to_mont:
- mov rdi,rcx
- mov rsi,rdx
-
-
- lea rdx,[$L$RR]
- jmp NEAR $L$mul_mont
-$L$SEH_end_ecp_nistz256_to_mont:
-
-
-
-
-
global ecp_nistz256_mul_mont
diff --git a/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm b/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm
deleted file mode 100644
index f1ea965..0000000
--- a/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-default rel
-%define XMMWORD
-%define YMMWORD
-%define ZMMWORD
-section .text code align=64
-
-ALIGN 16
-
-global rc4_md5_enc
-
-rc4_md5_enc:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_rc4_md5_enc:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
- mov rcx,r9
- mov r8,QWORD[40+rsp]
- mov r9,QWORD[48+rsp]
-
-
- cmp r9,0
- je NEAR $L$abort
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- sub rsp,40
-$L$body:
- mov r11,rcx
- mov r12,r9
- mov r13,rsi
- mov r14,rdx
- mov r15,r8
- xor rbp,rbp
- xor rcx,rcx
-
- lea rdi,[8+rdi]
- mov bpl,BYTE[((-8))+rdi]
- mov cl,BYTE[((-4))+rdi]
-
- inc bpl
- sub r14,r13
- mov eax,DWORD[rbp*4+rdi]
- add cl,al
- lea rsi,[rbp*4+rdi]
- shl r12,6
- add r12,r15
- mov QWORD[16+rsp],r12
-
- mov QWORD[24+rsp],r11
- mov r8d,DWORD[r11]
- mov r9d,DWORD[4+r11]
- mov r10d,DWORD[8+r11]
- mov r11d,DWORD[12+r11]
- jmp NEAR $L$oop
-
-ALIGN 16
-$L$oop:
- mov DWORD[rsp],r8d
- mov DWORD[4+rsp],r9d
- mov DWORD[8+rsp],r10d
- mov r12d,r11d
- mov DWORD[12+rsp],r11d
- pxor xmm0,xmm0
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r8d,DWORD[r15]
- add al,dl
- mov ebx,DWORD[4+rsi]
- add r8d,3614090360
- xor r12d,r11d
- movzx eax,al
- mov DWORD[rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,7
- mov r12d,r10d
- movd xmm0,DWORD[rax*4+rdi]
-
- add r8d,r9d
- pxor xmm1,xmm1
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r11d,DWORD[4+r15]
- add bl,dl
- mov eax,DWORD[8+rsi]
- add r11d,3905402710
- xor r12d,r10d
- movzx ebx,bl
- mov DWORD[4+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,12
- mov r12d,r9d
- movd xmm1,DWORD[rbx*4+rdi]
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r10d,DWORD[8+r15]
- add al,dl
- mov ebx,DWORD[12+rsi]
- add r10d,606105819
- xor r12d,r9d
- movzx eax,al
- mov DWORD[8+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,17
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],1
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r9d,DWORD[12+r15]
- add bl,dl
- mov eax,DWORD[16+rsi]
- add r9d,3250441966
- xor r12d,r8d
- movzx ebx,bl
- mov DWORD[12+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,22
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],1
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r8d,DWORD[16+r15]
- add al,dl
- mov ebx,DWORD[20+rsi]
- add r8d,4118548399
- xor r12d,r11d
- movzx eax,al
- mov DWORD[16+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,7
- mov r12d,r10d
- pinsrw xmm0,WORD[rax*4+rdi],2
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r11d,DWORD[20+r15]
- add bl,dl
- mov eax,DWORD[24+rsi]
- add r11d,1200080426
- xor r12d,r10d
- movzx ebx,bl
- mov DWORD[20+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,12
- mov r12d,r9d
- pinsrw xmm1,WORD[rbx*4+rdi],2
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r10d,DWORD[24+r15]
- add al,dl
- mov ebx,DWORD[28+rsi]
- add r10d,2821735955
- xor r12d,r9d
- movzx eax,al
- mov DWORD[24+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,17
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],3
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r9d,DWORD[28+r15]
- add bl,dl
- mov eax,DWORD[32+rsi]
- add r9d,4249261313
- xor r12d,r8d
- movzx ebx,bl
- mov DWORD[28+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,22
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],3
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r8d,DWORD[32+r15]
- add al,dl
- mov ebx,DWORD[36+rsi]
- add r8d,1770035416
- xor r12d,r11d
- movzx eax,al
- mov DWORD[32+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,7
- mov r12d,r10d
- pinsrw xmm0,WORD[rax*4+rdi],4
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r11d,DWORD[36+r15]
- add bl,dl
- mov eax,DWORD[40+rsi]
- add r11d,2336552879
- xor r12d,r10d
- movzx ebx,bl
- mov DWORD[36+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,12
- mov r12d,r9d
- pinsrw xmm1,WORD[rbx*4+rdi],4
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r10d,DWORD[40+r15]
- add al,dl
- mov ebx,DWORD[44+rsi]
- add r10d,4294925233
- xor r12d,r9d
- movzx eax,al
- mov DWORD[40+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,17
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],5
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r9d,DWORD[44+r15]
- add bl,dl
- mov eax,DWORD[48+rsi]
- add r9d,2304563134
- xor r12d,r8d
- movzx ebx,bl
- mov DWORD[44+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,22
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],5
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r8d,DWORD[48+r15]
- add al,dl
- mov ebx,DWORD[52+rsi]
- add r8d,1804603682
- xor r12d,r11d
- movzx eax,al
- mov DWORD[48+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,7
- mov r12d,r10d
- pinsrw xmm0,WORD[rax*4+rdi],6
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r11d,DWORD[52+r15]
- add bl,dl
- mov eax,DWORD[56+rsi]
- add r11d,4254626195
- xor r12d,r10d
- movzx ebx,bl
- mov DWORD[52+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,12
- mov r12d,r9d
- pinsrw xmm1,WORD[rbx*4+rdi],6
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r10d,DWORD[56+r15]
- add al,dl
- mov ebx,DWORD[60+rsi]
- add r10d,2792965006
- xor r12d,r9d
- movzx eax,al
- mov DWORD[56+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,17
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],7
-
- add r10d,r11d
- movdqu xmm2,XMMWORD[r13]
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r9d,DWORD[60+r15]
- add bl,dl
- mov eax,DWORD[64+rsi]
- add r9d,1236535329
- xor r12d,r8d
- movzx ebx,bl
- mov DWORD[60+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,22
- mov r12d,r10d
- pinsrw xmm1,WORD[rbx*4+rdi],7
-
- add r9d,r10d
- psllq xmm1,8
- pxor xmm2,xmm0
- pxor xmm2,xmm1
- pxor xmm0,xmm0
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r8d,DWORD[4+r15]
- add al,dl
- mov ebx,DWORD[68+rsi]
- add r8d,4129170786
- xor r12d,r10d
- movzx eax,al
- mov DWORD[64+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,5
- mov r12d,r9d
- movd xmm0,DWORD[rax*4+rdi]
-
- add r8d,r9d
- pxor xmm1,xmm1
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r11d,DWORD[24+r15]
- add bl,dl
- mov eax,DWORD[72+rsi]
- add r11d,3225465664
- xor r12d,r9d
- movzx ebx,bl
- mov DWORD[68+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,9
- mov r12d,r8d
- movd xmm1,DWORD[rbx*4+rdi]
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r10d,DWORD[44+r15]
- add al,dl
- mov ebx,DWORD[76+rsi]
- add r10d,643717713
- xor r12d,r8d
- movzx eax,al
- mov DWORD[72+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,14
- mov r12d,r11d
- pinsrw xmm0,WORD[rax*4+rdi],1
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r9d,DWORD[r15]
- add bl,dl
- mov eax,DWORD[80+rsi]
- add r9d,3921069994
- xor r12d,r11d
- movzx ebx,bl
- mov DWORD[76+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,20
- mov r12d,r10d
- pinsrw xmm1,WORD[rbx*4+rdi],1
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r8d,DWORD[20+r15]
- add al,dl
- mov ebx,DWORD[84+rsi]
- add r8d,3593408605
- xor r12d,r10d
- movzx eax,al
- mov DWORD[80+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,5
- mov r12d,r9d
- pinsrw xmm0,WORD[rax*4+rdi],2
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r11d,DWORD[40+r15]
- add bl,dl
- mov eax,DWORD[88+rsi]
- add r11d,38016083
- xor r12d,r9d
- movzx ebx,bl
- mov DWORD[84+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,9
- mov r12d,r8d
- pinsrw xmm1,WORD[rbx*4+rdi],2
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r10d,DWORD[60+r15]
- add al,dl
- mov ebx,DWORD[92+rsi]
- add r10d,3634488961
- xor r12d,r8d
- movzx eax,al
- mov DWORD[88+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,14
- mov r12d,r11d
- pinsrw xmm0,WORD[rax*4+rdi],3
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r9d,DWORD[16+r15]
- add bl,dl
- mov eax,DWORD[96+rsi]
- add r9d,3889429448
- xor r12d,r11d
- movzx ebx,bl
- mov DWORD[92+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,20
- mov r12d,r10d
- pinsrw xmm1,WORD[rbx*4+rdi],3
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r8d,DWORD[36+r15]
- add al,dl
- mov ebx,DWORD[100+rsi]
- add r8d,568446438
- xor r12d,r10d
- movzx eax,al
- mov DWORD[96+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,5
- mov r12d,r9d
- pinsrw xmm0,WORD[rax*4+rdi],4
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r11d,DWORD[56+r15]
- add bl,dl
- mov eax,DWORD[104+rsi]
- add r11d,3275163606
- xor r12d,r9d
- movzx ebx,bl
- mov DWORD[100+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,9
- mov r12d,r8d
- pinsrw xmm1,WORD[rbx*4+rdi],4
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r10d,DWORD[12+r15]
- add al,dl
- mov ebx,DWORD[108+rsi]
- add r10d,4107603335
- xor r12d,r8d
- movzx eax,al
- mov DWORD[104+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,14
- mov r12d,r11d
- pinsrw xmm0,WORD[rax*4+rdi],5
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r9d,DWORD[32+r15]
- add bl,dl
- mov eax,DWORD[112+rsi]
- add r9d,1163531501
- xor r12d,r11d
- movzx ebx,bl
- mov DWORD[108+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,20
- mov r12d,r10d
- pinsrw xmm1,WORD[rbx*4+rdi],5
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r11d
- add r8d,DWORD[52+r15]
- add al,dl
- mov ebx,DWORD[116+rsi]
- add r8d,2850285829
- xor r12d,r10d
- movzx eax,al
- mov DWORD[112+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,5
- mov r12d,r9d
- pinsrw xmm0,WORD[rax*4+rdi],6
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r10d
- add r11d,DWORD[8+r15]
- add bl,dl
- mov eax,DWORD[120+rsi]
- add r11d,4243563512
- xor r12d,r9d
- movzx ebx,bl
- mov DWORD[116+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,9
- mov r12d,r8d
- pinsrw xmm1,WORD[rbx*4+rdi],6
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- and r12d,r9d
- add r10d,DWORD[28+r15]
- add al,dl
- mov ebx,DWORD[124+rsi]
- add r10d,1735328473
- xor r12d,r8d
- movzx eax,al
- mov DWORD[120+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,14
- mov r12d,r11d
- pinsrw xmm0,WORD[rax*4+rdi],7
-
- add r10d,r11d
- movdqu xmm3,XMMWORD[16+r13]
- add bpl,32
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- and r12d,r8d
- add r9d,DWORD[48+r15]
- add bl,dl
- mov eax,DWORD[rbp*4+rdi]
- add r9d,2368359562
- xor r12d,r11d
- movzx ebx,bl
- mov DWORD[124+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,20
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],7
-
- add r9d,r10d
- mov rsi,rcx
- xor rcx,rcx
- mov cl,sil
- lea rsi,[rbp*4+rdi]
- psllq xmm1,8
- pxor xmm3,xmm0
- pxor xmm3,xmm1
- pxor xmm0,xmm0
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r9d
- add r8d,DWORD[20+r15]
- add al,dl
- mov ebx,DWORD[4+rsi]
- add r8d,4294588738
- movzx eax,al
- add r8d,r12d
- mov DWORD[rsi],edx
- add cl,bl
- rol r8d,4
- mov r12d,r10d
- movd xmm0,DWORD[rax*4+rdi]
-
- add r8d,r9d
- pxor xmm1,xmm1
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r8d
- add r11d,DWORD[32+r15]
- add bl,dl
- mov eax,DWORD[8+rsi]
- add r11d,2272392833
- movzx ebx,bl
- add r11d,r12d
- mov DWORD[4+rsi],edx
- add cl,al
- rol r11d,11
- mov r12d,r9d
- movd xmm1,DWORD[rbx*4+rdi]
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r11d
- add r10d,DWORD[44+r15]
- add al,dl
- mov ebx,DWORD[12+rsi]
- add r10d,1839030562
- movzx eax,al
- add r10d,r12d
- mov DWORD[8+rsi],edx
- add cl,bl
- rol r10d,16
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],1
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r10d
- add r9d,DWORD[56+r15]
- add bl,dl
- mov eax,DWORD[16+rsi]
- add r9d,4259657740
- movzx ebx,bl
- add r9d,r12d
- mov DWORD[12+rsi],edx
- add cl,al
- rol r9d,23
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],1
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r9d
- add r8d,DWORD[4+r15]
- add al,dl
- mov ebx,DWORD[20+rsi]
- add r8d,2763975236
- movzx eax,al
- add r8d,r12d
- mov DWORD[16+rsi],edx
- add cl,bl
- rol r8d,4
- mov r12d,r10d
- pinsrw xmm0,WORD[rax*4+rdi],2
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r8d
- add r11d,DWORD[16+r15]
- add bl,dl
- mov eax,DWORD[24+rsi]
- add r11d,1272893353
- movzx ebx,bl
- add r11d,r12d
- mov DWORD[20+rsi],edx
- add cl,al
- rol r11d,11
- mov r12d,r9d
- pinsrw xmm1,WORD[rbx*4+rdi],2
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r11d
- add r10d,DWORD[28+r15]
- add al,dl
- mov ebx,DWORD[28+rsi]
- add r10d,4139469664
- movzx eax,al
- add r10d,r12d
- mov DWORD[24+rsi],edx
- add cl,bl
- rol r10d,16
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],3
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r10d
- add r9d,DWORD[40+r15]
- add bl,dl
- mov eax,DWORD[32+rsi]
- add r9d,3200236656
- movzx ebx,bl
- add r9d,r12d
- mov DWORD[28+rsi],edx
- add cl,al
- rol r9d,23
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],3
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r9d
- add r8d,DWORD[52+r15]
- add al,dl
- mov ebx,DWORD[36+rsi]
- add r8d,681279174
- movzx eax,al
- add r8d,r12d
- mov DWORD[32+rsi],edx
- add cl,bl
- rol r8d,4
- mov r12d,r10d
- pinsrw xmm0,WORD[rax*4+rdi],4
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r8d
- add r11d,DWORD[r15]
- add bl,dl
- mov eax,DWORD[40+rsi]
- add r11d,3936430074
- movzx ebx,bl
- add r11d,r12d
- mov DWORD[36+rsi],edx
- add cl,al
- rol r11d,11
- mov r12d,r9d
- pinsrw xmm1,WORD[rbx*4+rdi],4
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r11d
- add r10d,DWORD[12+r15]
- add al,dl
- mov ebx,DWORD[44+rsi]
- add r10d,3572445317
- movzx eax,al
- add r10d,r12d
- mov DWORD[40+rsi],edx
- add cl,bl
- rol r10d,16
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],5
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r10d
- add r9d,DWORD[24+r15]
- add bl,dl
- mov eax,DWORD[48+rsi]
- add r9d,76029189
- movzx ebx,bl
- add r9d,r12d
- mov DWORD[44+rsi],edx
- add cl,al
- rol r9d,23
- mov r12d,r11d
- pinsrw xmm1,WORD[rbx*4+rdi],5
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r9d
- add r8d,DWORD[36+r15]
- add al,dl
- mov ebx,DWORD[52+rsi]
- add r8d,3654602809
- movzx eax,al
- add r8d,r12d
- mov DWORD[48+rsi],edx
- add cl,bl
- rol r8d,4
- mov r12d,r10d
- pinsrw xmm0,WORD[rax*4+rdi],6
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r8d
- add r11d,DWORD[48+r15]
- add bl,dl
- mov eax,DWORD[56+rsi]
- add r11d,3873151461
- movzx ebx,bl
- add r11d,r12d
- mov DWORD[52+rsi],edx
- add cl,al
- rol r11d,11
- mov r12d,r9d
- pinsrw xmm1,WORD[rbx*4+rdi],6
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],eax
- xor r12d,r11d
- add r10d,DWORD[60+r15]
- add al,dl
- mov ebx,DWORD[60+rsi]
- add r10d,530742520
- movzx eax,al
- add r10d,r12d
- mov DWORD[56+rsi],edx
- add cl,bl
- rol r10d,16
- mov r12d,r8d
- pinsrw xmm0,WORD[rax*4+rdi],7
-
- add r10d,r11d
- movdqu xmm4,XMMWORD[32+r13]
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],ebx
- xor r12d,r10d
- add r9d,DWORD[8+r15]
- add bl,dl
- mov eax,DWORD[64+rsi]
- add r9d,3299628645
- movzx ebx,bl
- add r9d,r12d
- mov DWORD[60+rsi],edx
- add cl,al
- rol r9d,23
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],7
-
- add r9d,r10d
- psllq xmm1,8
- pxor xmm4,xmm0
- pxor xmm4,xmm1
- pxor xmm0,xmm0
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r9d
- add r8d,DWORD[r15]
- add al,dl
- mov ebx,DWORD[68+rsi]
- add r8d,4096336452
- movzx eax,al
- xor r12d,r10d
- mov DWORD[64+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,6
- mov r12d,-1
- movd xmm0,DWORD[rax*4+rdi]
-
- add r8d,r9d
- pxor xmm1,xmm1
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r8d
- add r11d,DWORD[28+r15]
- add bl,dl
- mov eax,DWORD[72+rsi]
- add r11d,1126891415
- movzx ebx,bl
- xor r12d,r9d
- mov DWORD[68+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,10
- mov r12d,-1
- movd xmm1,DWORD[rbx*4+rdi]
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r11d
- add r10d,DWORD[56+r15]
- add al,dl
- mov ebx,DWORD[76+rsi]
- add r10d,2878612391
- movzx eax,al
- xor r12d,r8d
- mov DWORD[72+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,15
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],1
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r10d
- add r9d,DWORD[20+r15]
- add bl,dl
- mov eax,DWORD[80+rsi]
- add r9d,4237533241
- movzx ebx,bl
- xor r12d,r11d
- mov DWORD[76+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,21
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],1
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r9d
- add r8d,DWORD[48+r15]
- add al,dl
- mov ebx,DWORD[84+rsi]
- add r8d,1700485571
- movzx eax,al
- xor r12d,r10d
- mov DWORD[80+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,6
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],2
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r8d
- add r11d,DWORD[12+r15]
- add bl,dl
- mov eax,DWORD[88+rsi]
- add r11d,2399980690
- movzx ebx,bl
- xor r12d,r9d
- mov DWORD[84+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,10
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],2
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r11d
- add r10d,DWORD[40+r15]
- add al,dl
- mov ebx,DWORD[92+rsi]
- add r10d,4293915773
- movzx eax,al
- xor r12d,r8d
- mov DWORD[88+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,15
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],3
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r10d
- add r9d,DWORD[4+r15]
- add bl,dl
- mov eax,DWORD[96+rsi]
- add r9d,2240044497
- movzx ebx,bl
- xor r12d,r11d
- mov DWORD[92+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,21
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],3
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r9d
- add r8d,DWORD[32+r15]
- add al,dl
- mov ebx,DWORD[100+rsi]
- add r8d,1873313359
- movzx eax,al
- xor r12d,r10d
- mov DWORD[96+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,6
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],4
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r8d
- add r11d,DWORD[60+r15]
- add bl,dl
- mov eax,DWORD[104+rsi]
- add r11d,4264355552
- movzx ebx,bl
- xor r12d,r9d
- mov DWORD[100+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,10
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],4
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r11d
- add r10d,DWORD[24+r15]
- add al,dl
- mov ebx,DWORD[108+rsi]
- add r10d,2734768916
- movzx eax,al
- xor r12d,r8d
- mov DWORD[104+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,15
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],5
-
- add r10d,r11d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r10d
- add r9d,DWORD[52+r15]
- add bl,dl
- mov eax,DWORD[112+rsi]
- add r9d,1309151649
- movzx ebx,bl
- xor r12d,r11d
- mov DWORD[108+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,21
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],5
-
- add r9d,r10d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r11d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r9d
- add r8d,DWORD[16+r15]
- add al,dl
- mov ebx,DWORD[116+rsi]
- add r8d,4149444226
- movzx eax,al
- xor r12d,r10d
- mov DWORD[112+rsi],edx
- add r8d,r12d
- add cl,bl
- rol r8d,6
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],6
-
- add r8d,r9d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r10d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r8d
- add r11d,DWORD[44+r15]
- add bl,dl
- mov eax,DWORD[120+rsi]
- add r11d,3174756917
- movzx ebx,bl
- xor r12d,r9d
- mov DWORD[116+rsi],edx
- add r11d,r12d
- add cl,al
- rol r11d,10
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],6
-
- add r11d,r8d
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r9d
- mov DWORD[rcx*4+rdi],eax
- or r12d,r11d
- add r10d,DWORD[8+r15]
- add al,dl
- mov ebx,DWORD[124+rsi]
- add r10d,718787259
- movzx eax,al
- xor r12d,r8d
- mov DWORD[120+rsi],edx
- add r10d,r12d
- add cl,bl
- rol r10d,15
- mov r12d,-1
- pinsrw xmm0,WORD[rax*4+rdi],7
-
- add r10d,r11d
- movdqu xmm5,XMMWORD[48+r13]
- add bpl,32
- mov edx,DWORD[rcx*4+rdi]
- xor r12d,r8d
- mov DWORD[rcx*4+rdi],ebx
- or r12d,r10d
- add r9d,DWORD[36+r15]
- add bl,dl
- mov eax,DWORD[rbp*4+rdi]
- add r9d,3951481745
- movzx ebx,bl
- xor r12d,r11d
- mov DWORD[124+rsi],edx
- add r9d,r12d
- add cl,al
- rol r9d,21
- mov r12d,-1
- pinsrw xmm1,WORD[rbx*4+rdi],7
-
- add r9d,r10d
- mov rsi,rbp
- xor rbp,rbp
- mov bpl,sil
- mov rsi,rcx
- xor rcx,rcx
- mov cl,sil
- lea rsi,[rbp*4+rdi]
- psllq xmm1,8
- pxor xmm5,xmm0
- pxor xmm5,xmm1
- add r8d,DWORD[rsp]
- add r9d,DWORD[4+rsp]
- add r10d,DWORD[8+rsp]
- add r11d,DWORD[12+rsp]
-
- movdqu XMMWORD[r13*1+r14],xmm2
- movdqu XMMWORD[16+r13*1+r14],xmm3
- movdqu XMMWORD[32+r13*1+r14],xmm4
- movdqu XMMWORD[48+r13*1+r14],xmm5
- lea r15,[64+r15]
- lea r13,[64+r13]
- cmp r15,QWORD[16+rsp]
- jb NEAR $L$oop
-
- mov r12,QWORD[24+rsp]
- sub cl,al
- mov DWORD[r12],r8d
- mov DWORD[4+r12],r9d
- mov DWORD[8+r12],r10d
- mov DWORD[12+r12],r11d
- sub bpl,1
- mov DWORD[((-8))+rdi],ebp
- mov DWORD[((-4))+rdi],ecx
-
- mov r15,QWORD[40+rsp]
- mov r14,QWORD[48+rsp]
- mov r13,QWORD[56+rsp]
- mov r12,QWORD[64+rsp]
- mov rbp,QWORD[72+rsp]
- mov rbx,QWORD[80+rsp]
- lea rsp,[88+rsp]
-$L$epilogue:
-$L$abort:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-$L$SEH_end_rc4_md5_enc:
-EXTERN __imp_RtlVirtualUnwind
-
-ALIGN 16
-se_handler:
- push rsi
- push rdi
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
- pushfq
- sub rsp,64
-
- mov rax,QWORD[120+r8]
- mov rbx,QWORD[248+r8]
-
- lea r10,[$L$body]
- cmp rbx,r10
- jb NEAR $L$in_prologue
-
- mov rax,QWORD[152+r8]
-
- lea r10,[$L$epilogue]
- cmp rbx,r10
- jae NEAR $L$in_prologue
-
- mov r15,QWORD[40+rax]
- mov r14,QWORD[48+rax]
- mov r13,QWORD[56+rax]
- mov r12,QWORD[64+rax]
- mov rbp,QWORD[72+rax]
- mov rbx,QWORD[80+rax]
- lea rax,[88+rax]
-
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
-$L$in_prologue:
- mov rdi,QWORD[8+rax]
- mov rsi,QWORD[16+rax]
- mov QWORD[152+r8],rax
- mov QWORD[168+r8],rsi
- mov QWORD[176+r8],rdi
-
- mov rdi,QWORD[40+r9]
- mov rsi,r8
- mov ecx,154
- DD 0xa548f3fc
-
- mov rsi,r9
- xor rcx,rcx
- mov rdx,QWORD[8+rsi]
- mov r8,QWORD[rsi]
- mov r9,QWORD[16+rsi]
- mov r10,QWORD[40+rsi]
- lea r11,[56+rsi]
- lea r12,[24+rsi]
- mov QWORD[32+rsp],r10
- mov QWORD[40+rsp],r11
- mov QWORD[48+rsp],r12
- mov QWORD[56+rsp],rcx
- call QWORD[__imp_RtlVirtualUnwind]
-
- mov eax,1
- add rsp,64
- popfq
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
- pop rdi
- pop rsi
- DB 0F3h,0C3h ;repret
-
-
-section .pdata rdata align=4
-ALIGN 4
- DD $L$SEH_begin_rc4_md5_enc wrt ..imagebase
- DD $L$SEH_end_rc4_md5_enc wrt ..imagebase
- DD $L$SEH_info_rc4_md5_enc wrt ..imagebase
-
-section .xdata rdata align=8
-ALIGN 8
-$L$SEH_info_rc4_md5_enc:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
diff --git a/win-x86_64/crypto/sha/sha1-x86_64.asm b/win-x86_64/crypto/sha/sha1-x86_64.asm
index 0f5361a..168f78d 100644
--- a/win-x86_64/crypto/sha/sha1-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha1-x86_64.asm
@@ -24,6 +24,11 @@ $L$SEH_begin_sha1_block_data_order:
mov r10d,DWORD[((OPENSSL_ia32cap_P+8))]
test r8d,512
jz NEAR $L$ialu
+ and r8d,268435456
+ and r9d,1073741824
+ or r8d,r9d
+ cmp r8d,1342177280
+ je NEAR _avx_shortcut
jmp NEAR _ssse3_shortcut
ALIGN 16
@@ -2445,6 +2450,1146 @@ $L$epilogue_ssse3:
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_sha1_block_data_order_ssse3:
+
+ALIGN 16
+sha1_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha1_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+_avx_shortcut:
+ mov rax,rsp
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ lea rsp,[((-160))+rsp]
+ vzeroupper
+ vmovaps XMMWORD[(-40-96)+rax],xmm6
+ vmovaps XMMWORD[(-40-80)+rax],xmm7
+ vmovaps XMMWORD[(-40-64)+rax],xmm8
+ vmovaps XMMWORD[(-40-48)+rax],xmm9
+ vmovaps XMMWORD[(-40-32)+rax],xmm10
+ vmovaps XMMWORD[(-40-16)+rax],xmm11
+$L$prologue_avx:
+ mov r14,rax
+ and rsp,-64
+ mov r8,rdi
+ mov r9,rsi
+ mov r10,rdx
+
+ shl r10,6
+ add r10,r9
+ lea r11,[((K_XX_XX+64))]
+
+ mov eax,DWORD[r8]
+ mov ebx,DWORD[4+r8]
+ mov ecx,DWORD[8+r8]
+ mov edx,DWORD[12+r8]
+ mov esi,ebx
+ mov ebp,DWORD[16+r8]
+ mov edi,ecx
+ xor edi,edx
+ and esi,edi
+
+ vmovdqa xmm6,XMMWORD[64+r11]
+ vmovdqa xmm11,XMMWORD[((-64))+r11]
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ vpshufb xmm0,xmm0,xmm6
+ add r9,64
+ vpshufb xmm1,xmm1,xmm6
+ vpshufb xmm2,xmm2,xmm6
+ vpshufb xmm3,xmm3,xmm6
+ vpaddd xmm4,xmm0,xmm11
+ vpaddd xmm5,xmm1,xmm11
+ vpaddd xmm6,xmm2,xmm11
+ vmovdqa XMMWORD[rsp],xmm4
+ vmovdqa XMMWORD[16+rsp],xmm5
+ vmovdqa XMMWORD[32+rsp],xmm6
+ jmp NEAR $L$oop_avx
+ALIGN 16
+$L$oop_avx:
+ shrd ebx,ebx,2
+ xor esi,edx
+ vpalignr xmm4,xmm1,xmm0,8
+ mov edi,eax
+ add ebp,DWORD[rsp]
+ vpaddd xmm9,xmm11,xmm3
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrldq xmm8,xmm3,4
+ add ebp,esi
+ and edi,ebx
+ vpxor xmm4,xmm4,xmm0
+ xor ebx,ecx
+ add ebp,eax
+ vpxor xmm8,xmm8,xmm2
+ shrd eax,eax,7
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[4+rsp]
+ vpxor xmm4,xmm4,xmm8
+ xor eax,ebx
+ shld ebp,ebp,5
+ vmovdqa XMMWORD[48+rsp],xmm9
+ add edx,edi
+ and esi,eax
+ vpsrld xmm8,xmm4,31
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor esi,ebx
+ vpslldq xmm10,xmm4,12
+ vpaddd xmm4,xmm4,xmm4
+ mov edi,edx
+ add ecx,DWORD[8+rsp]
+ xor ebp,eax
+ shld edx,edx,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm4,xmm4,xmm8
+ add ecx,esi
+ and edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ vpslld xmm10,xmm10,2
+ vpxor xmm4,xmm4,xmm9
+ shrd edx,edx,7
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[12+rsp]
+ vpxor xmm4,xmm4,xmm10
+ xor edx,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ and esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,ebp
+ vpalignr xmm5,xmm2,xmm1,8
+ mov edi,ebx
+ add eax,DWORD[16+rsp]
+ vpaddd xmm9,xmm11,xmm4
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrldq xmm8,xmm4,4
+ add eax,esi
+ and edi,ecx
+ vpxor xmm5,xmm5,xmm1
+ xor ecx,edx
+ add eax,ebx
+ vpxor xmm8,xmm8,xmm3
+ shrd ebx,ebx,7
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[20+rsp]
+ vpxor xmm5,xmm5,xmm8
+ xor ebx,ecx
+ shld eax,eax,5
+ vmovdqa XMMWORD[rsp],xmm9
+ add ebp,edi
+ and esi,ebx
+ vpsrld xmm8,xmm5,31
+ xor ebx,ecx
+ add ebp,eax
+ shrd eax,eax,7
+ xor esi,ecx
+ vpslldq xmm10,xmm5,12
+ vpaddd xmm5,xmm5,xmm5
+ mov edi,ebp
+ add edx,DWORD[24+rsp]
+ xor eax,ebx
+ shld ebp,ebp,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm5,xmm5,xmm8
+ add edx,esi
+ and edi,eax
+ xor eax,ebx
+ add edx,ebp
+ vpslld xmm10,xmm10,2
+ vpxor xmm5,xmm5,xmm9
+ shrd ebp,ebp,7
+ xor edi,ebx
+ mov esi,edx
+ add ecx,DWORD[28+rsp]
+ vpxor xmm5,xmm5,xmm10
+ xor ebp,eax
+ shld edx,edx,5
+ vmovdqa xmm11,XMMWORD[((-32))+r11]
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ vpalignr xmm6,xmm3,xmm2,8
+ mov edi,ecx
+ add ebx,DWORD[32+rsp]
+ vpaddd xmm9,xmm11,xmm5
+ xor edx,ebp
+ shld ecx,ecx,5
+ vpsrldq xmm8,xmm5,4
+ add ebx,esi
+ and edi,edx
+ vpxor xmm6,xmm6,xmm2
+ xor edx,ebp
+ add ebx,ecx
+ vpxor xmm8,xmm8,xmm4
+ shrd ecx,ecx,7
+ xor edi,ebp
+ mov esi,ebx
+ add eax,DWORD[36+rsp]
+ vpxor xmm6,xmm6,xmm8
+ xor ecx,edx
+ shld ebx,ebx,5
+ vmovdqa XMMWORD[16+rsp],xmm9
+ add eax,edi
+ and esi,ecx
+ vpsrld xmm8,xmm6,31
+ xor ecx,edx
+ add eax,ebx
+ shrd ebx,ebx,7
+ xor esi,edx
+ vpslldq xmm10,xmm6,12
+ vpaddd xmm6,xmm6,xmm6
+ mov edi,eax
+ add ebp,DWORD[40+rsp]
+ xor ebx,ecx
+ shld eax,eax,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm6,xmm6,xmm8
+ add ebp,esi
+ and edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpslld xmm10,xmm10,2
+ vpxor xmm6,xmm6,xmm9
+ shrd eax,eax,7
+ xor edi,ecx
+ mov esi,ebp
+ add edx,DWORD[44+rsp]
+ vpxor xmm6,xmm6,xmm10
+ xor eax,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ and esi,eax
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor esi,ebx
+ vpalignr xmm7,xmm4,xmm3,8
+ mov edi,edx
+ add ecx,DWORD[48+rsp]
+ vpaddd xmm9,xmm11,xmm6
+ xor ebp,eax
+ shld edx,edx,5
+ vpsrldq xmm8,xmm6,4
+ add ecx,esi
+ and edi,ebp
+ vpxor xmm7,xmm7,xmm3
+ xor ebp,eax
+ add ecx,edx
+ vpxor xmm8,xmm8,xmm5
+ shrd edx,edx,7
+ xor edi,eax
+ mov esi,ecx
+ add ebx,DWORD[52+rsp]
+ vpxor xmm7,xmm7,xmm8
+ xor edx,ebp
+ shld ecx,ecx,5
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add ebx,edi
+ and esi,edx
+ vpsrld xmm8,xmm7,31
+ xor edx,ebp
+ add ebx,ecx
+ shrd ecx,ecx,7
+ xor esi,ebp
+ vpslldq xmm10,xmm7,12
+ vpaddd xmm7,xmm7,xmm7
+ mov edi,ebx
+ add eax,DWORD[56+rsp]
+ xor ecx,edx
+ shld ebx,ebx,5
+ vpsrld xmm9,xmm10,30
+ vpor xmm7,xmm7,xmm8
+ add eax,esi
+ and edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpslld xmm10,xmm10,2
+ vpxor xmm7,xmm7,xmm9
+ shrd ebx,ebx,7
+ xor edi,edx
+ mov esi,eax
+ add ebp,DWORD[60+rsp]
+ vpxor xmm7,xmm7,xmm10
+ xor ebx,ecx
+ shld eax,eax,5
+ add ebp,edi
+ and esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpalignr xmm8,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ shrd eax,eax,7
+ xor esi,ecx
+ mov edi,ebp
+ add edx,DWORD[rsp]
+ vpxor xmm0,xmm0,xmm1
+ xor eax,ebx
+ shld ebp,ebp,5
+ vpaddd xmm9,xmm11,xmm7
+ add edx,esi
+ and edi,eax
+ vpxor xmm0,xmm0,xmm8
+ xor eax,ebx
+ add edx,ebp
+ shrd ebp,ebp,7
+ xor edi,ebx
+ vpsrld xmm8,xmm0,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ mov esi,edx
+ add ecx,DWORD[4+rsp]
+ xor ebp,eax
+ shld edx,edx,5
+ vpslld xmm0,xmm0,2
+ add ecx,edi
+ and esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ shrd edx,edx,7
+ xor esi,eax
+ mov edi,ecx
+ add ebx,DWORD[8+rsp]
+ vpor xmm0,xmm0,xmm8
+ xor edx,ebp
+ shld ecx,ecx,5
+ add ebx,esi
+ and edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[12+rsp]
+ xor edi,ebp
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpalignr xmm8,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ebp,DWORD[16+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ vpxor xmm1,xmm1,xmm2
+ add ebp,esi
+ xor edi,ecx
+ vpaddd xmm9,xmm11,xmm0
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpxor xmm1,xmm1,xmm8
+ add edx,DWORD[20+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ vpsrld xmm8,xmm1,30
+ vmovdqa XMMWORD[rsp],xmm9
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpslld xmm1,xmm1,2
+ add ecx,DWORD[24+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpor xmm1,xmm1,xmm8
+ add ebx,DWORD[28+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpalignr xmm8,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add eax,DWORD[32+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ vpxor xmm2,xmm2,xmm3
+ add eax,esi
+ xor edi,edx
+ vpaddd xmm9,xmm11,xmm1
+ vmovdqa xmm11,XMMWORD[r11]
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpxor xmm2,xmm2,xmm8
+ add ebp,DWORD[36+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ vpsrld xmm8,xmm2,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpslld xmm2,xmm2,2
+ add edx,DWORD[40+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpor xmm2,xmm2,xmm8
+ add ecx,DWORD[44+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpalignr xmm8,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebx,DWORD[48+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ vpxor xmm3,xmm3,xmm4
+ add ebx,esi
+ xor edi,ebp
+ vpaddd xmm9,xmm11,xmm2
+ shrd edx,edx,7
+ add ebx,ecx
+ vpxor xmm3,xmm3,xmm8
+ add eax,DWORD[52+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ vpsrld xmm8,xmm3,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpslld xmm3,xmm3,2
+ add ebp,DWORD[56+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpor xmm3,xmm3,xmm8
+ add edx,DWORD[60+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpalignr xmm8,xmm3,xmm2,8
+ vpxor xmm4,xmm4,xmm0
+ add ecx,DWORD[rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ vpxor xmm4,xmm4,xmm5
+ add ecx,esi
+ xor edi,eax
+ vpaddd xmm9,xmm11,xmm3
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpxor xmm4,xmm4,xmm8
+ add ebx,DWORD[4+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ vpsrld xmm8,xmm4,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpslld xmm4,xmm4,2
+ add eax,DWORD[8+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ vpor xmm4,xmm4,xmm8
+ add ebp,DWORD[12+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpalignr xmm8,xmm4,xmm3,8
+ vpxor xmm5,xmm5,xmm1
+ add edx,DWORD[16+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ vpxor xmm5,xmm5,xmm6
+ add edx,esi
+ xor edi,ebx
+ vpaddd xmm9,xmm11,xmm4
+ shrd eax,eax,7
+ add edx,ebp
+ vpxor xmm5,xmm5,xmm8
+ add ecx,DWORD[20+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ vpsrld xmm8,xmm5,30
+ vmovdqa XMMWORD[rsp],xmm9
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpslld xmm5,xmm5,2
+ add ebx,DWORD[24+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vpor xmm5,xmm5,xmm8
+ add eax,DWORD[28+rsp]
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ vpalignr xmm8,xmm5,xmm4,8
+ vpxor xmm6,xmm6,xmm2
+ add ebp,DWORD[32+rsp]
+ and esi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ vpxor xmm6,xmm6,xmm7
+ mov edi,eax
+ xor esi,ecx
+ vpaddd xmm9,xmm11,xmm5
+ shld eax,eax,5
+ add ebp,esi
+ vpxor xmm6,xmm6,xmm8
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[36+rsp]
+ vpsrld xmm8,xmm6,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ and edi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,ebp
+ vpslld xmm6,xmm6,2
+ xor edi,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[40+rsp]
+ and esi,eax
+ vpor xmm6,xmm6,xmm8
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov edi,edx
+ xor esi,eax
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[44+rsp]
+ and edi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ xor edi,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ vpalignr xmm8,xmm6,xmm5,8
+ vpxor xmm7,xmm7,xmm3
+ add eax,DWORD[48+rsp]
+ and esi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ vpxor xmm7,xmm7,xmm0
+ mov edi,ebx
+ xor esi,edx
+ vpaddd xmm9,xmm11,xmm6
+ vmovdqa xmm11,XMMWORD[32+r11]
+ shld ebx,ebx,5
+ add eax,esi
+ vpxor xmm7,xmm7,xmm8
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[52+rsp]
+ vpsrld xmm8,xmm7,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ and edi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ vpslld xmm7,xmm7,2
+ xor edi,ecx
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[56+rsp]
+ and esi,ebx
+ vpor xmm7,xmm7,xmm8
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov edi,ebp
+ xor esi,ebx
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[60+rsp]
+ and edi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov esi,edx
+ xor edi,eax
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ vpalignr xmm8,xmm7,xmm6,8
+ vpxor xmm0,xmm0,xmm4
+ add ebx,DWORD[rsp]
+ and esi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ vpxor xmm0,xmm0,xmm1
+ mov edi,ecx
+ xor esi,ebp
+ vpaddd xmm9,xmm11,xmm7
+ shld ecx,ecx,5
+ add ebx,esi
+ vpxor xmm0,xmm0,xmm8
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[4+rsp]
+ vpsrld xmm8,xmm0,30
+ vmovdqa XMMWORD[48+rsp],xmm9
+ and edi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov esi,ebx
+ vpslld xmm0,xmm0,2
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[8+rsp]
+ and esi,ecx
+ vpor xmm0,xmm0,xmm8
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov edi,eax
+ xor esi,ecx
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ add edx,DWORD[12+rsp]
+ and edi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ mov esi,ebp
+ xor edi,ebx
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,eax
+ xor eax,ebx
+ add edx,ebp
+ vpalignr xmm8,xmm0,xmm7,8
+ vpxor xmm1,xmm1,xmm5
+ add ecx,DWORD[16+rsp]
+ and esi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ vpxor xmm1,xmm1,xmm2
+ mov edi,edx
+ xor esi,eax
+ vpaddd xmm9,xmm11,xmm0
+ shld edx,edx,5
+ add ecx,esi
+ vpxor xmm1,xmm1,xmm8
+ xor edi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[20+rsp]
+ vpsrld xmm8,xmm1,30
+ vmovdqa XMMWORD[rsp],xmm9
+ and edi,ebp
+ xor ebp,eax
+ shrd edx,edx,7
+ mov esi,ecx
+ vpslld xmm1,xmm1,2
+ xor edi,ebp
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[24+rsp]
+ and esi,edx
+ vpor xmm1,xmm1,xmm8
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov edi,ebx
+ xor esi,edx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,ecx
+ xor ecx,edx
+ add eax,ebx
+ add ebp,DWORD[28+rsp]
+ and edi,ecx
+ xor ecx,edx
+ shrd ebx,ebx,7
+ mov esi,eax
+ xor edi,ecx
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ebx
+ xor ebx,ecx
+ add ebp,eax
+ vpalignr xmm8,xmm1,xmm0,8
+ vpxor xmm2,xmm2,xmm6
+ add edx,DWORD[32+rsp]
+ and esi,ebx
+ xor ebx,ecx
+ shrd eax,eax,7
+ vpxor xmm2,xmm2,xmm3
+ mov edi,ebp
+ xor esi,ebx
+ vpaddd xmm9,xmm11,xmm1
+ shld ebp,ebp,5
+ add edx,esi
+ vpxor xmm2,xmm2,xmm8
+ xor edi,eax
+ xor eax,ebx
+ add edx,ebp
+ add ecx,DWORD[36+rsp]
+ vpsrld xmm8,xmm2,30
+ vmovdqa XMMWORD[16+rsp],xmm9
+ and edi,eax
+ xor eax,ebx
+ shrd ebp,ebp,7
+ mov esi,edx
+ vpslld xmm2,xmm2,2
+ xor edi,eax
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,ebp
+ xor ebp,eax
+ add ecx,edx
+ add ebx,DWORD[40+rsp]
+ and esi,ebp
+ vpor xmm2,xmm2,xmm8
+ xor ebp,eax
+ shrd edx,edx,7
+ mov edi,ecx
+ xor esi,ebp
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,edx
+ xor edx,ebp
+ add ebx,ecx
+ add eax,DWORD[44+rsp]
+ and edi,edx
+ xor edx,ebp
+ shrd ecx,ecx,7
+ mov esi,ebx
+ xor edi,edx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ add eax,ebx
+ vpalignr xmm8,xmm2,xmm1,8
+ vpxor xmm3,xmm3,xmm7
+ add ebp,DWORD[48+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ vpxor xmm3,xmm3,xmm4
+ add ebp,esi
+ xor edi,ecx
+ vpaddd xmm9,xmm11,xmm2
+ shrd ebx,ebx,7
+ add ebp,eax
+ vpxor xmm3,xmm3,xmm8
+ add edx,DWORD[52+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ vpsrld xmm8,xmm3,30
+ vmovdqa XMMWORD[32+rsp],xmm9
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vpslld xmm3,xmm3,2
+ add ecx,DWORD[56+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vpor xmm3,xmm3,xmm8
+ add ebx,DWORD[60+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[rsp]
+ vpaddd xmm9,xmm11,xmm3
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ vmovdqa XMMWORD[48+rsp],xmm9
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[4+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[8+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[12+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ cmp r9,r10
+ je NEAR $L$done_avx
+ vmovdqa xmm6,XMMWORD[64+r11]
+ vmovdqa xmm11,XMMWORD[((-64))+r11]
+ vmovdqu xmm0,XMMWORD[r9]
+ vmovdqu xmm1,XMMWORD[16+r9]
+ vmovdqu xmm2,XMMWORD[32+r9]
+ vmovdqu xmm3,XMMWORD[48+r9]
+ vpshufb xmm0,xmm0,xmm6
+ add r9,64
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ vpshufb xmm1,xmm1,xmm6
+ mov edi,ecx
+ shld ecx,ecx,5
+ vpaddd xmm4,xmm0,xmm11
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ vmovdqa XMMWORD[rsp],xmm4
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ vpshufb xmm2,xmm2,xmm6
+ mov edi,edx
+ shld edx,edx,5
+ vpaddd xmm5,xmm1,xmm11
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ vmovdqa XMMWORD[16+rsp],xmm5
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ vpshufb xmm3,xmm3,xmm6
+ mov edi,ebp
+ shld ebp,ebp,5
+ vpaddd xmm6,xmm2,xmm11
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ vmovdqa XMMWORD[32+rsp],xmm6
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ shrd ecx,ecx,7
+ add eax,ebx
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ add edx,DWORD[12+r8]
+ mov DWORD[r8],eax
+ add ebp,DWORD[16+r8]
+ mov DWORD[4+r8],esi
+ mov ebx,esi
+ mov DWORD[8+r8],ecx
+ mov edi,ecx
+ mov DWORD[12+r8],edx
+ xor edi,edx
+ mov DWORD[16+r8],ebp
+ and esi,edi
+ jmp NEAR $L$oop_avx
+
+ALIGN 16
+$L$done_avx:
+ add ebx,DWORD[16+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[20+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ xor esi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[24+rsp]
+ xor esi,ecx
+ mov edi,eax
+ shld eax,eax,5
+ add ebp,esi
+ xor edi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[28+rsp]
+ xor edi,ebx
+ mov esi,ebp
+ shld ebp,ebp,5
+ add edx,edi
+ xor esi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[32+rsp]
+ xor esi,eax
+ mov edi,edx
+ shld edx,edx,5
+ add ecx,esi
+ xor edi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[36+rsp]
+ xor edi,ebp
+ mov esi,ecx
+ shld ecx,ecx,5
+ add ebx,edi
+ xor esi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[40+rsp]
+ xor esi,edx
+ mov edi,ebx
+ shld ebx,ebx,5
+ add eax,esi
+ xor edi,edx
+ shrd ecx,ecx,7
+ add eax,ebx
+ add ebp,DWORD[44+rsp]
+ xor edi,ecx
+ mov esi,eax
+ shld eax,eax,5
+ add ebp,edi
+ xor esi,ecx
+ shrd ebx,ebx,7
+ add ebp,eax
+ add edx,DWORD[48+rsp]
+ xor esi,ebx
+ mov edi,ebp
+ shld ebp,ebp,5
+ add edx,esi
+ xor edi,ebx
+ shrd eax,eax,7
+ add edx,ebp
+ add ecx,DWORD[52+rsp]
+ xor edi,eax
+ mov esi,edx
+ shld edx,edx,5
+ add ecx,edi
+ xor esi,eax
+ shrd ebp,ebp,7
+ add ecx,edx
+ add ebx,DWORD[56+rsp]
+ xor esi,ebp
+ mov edi,ecx
+ shld ecx,ecx,5
+ add ebx,esi
+ xor edi,ebp
+ shrd edx,edx,7
+ add ebx,ecx
+ add eax,DWORD[60+rsp]
+ xor edi,edx
+ mov esi,ebx
+ shld ebx,ebx,5
+ add eax,edi
+ shrd ecx,ecx,7
+ add eax,ebx
+ vzeroupper
+
+ add eax,DWORD[r8]
+ add esi,DWORD[4+r8]
+ add ecx,DWORD[8+r8]
+ mov DWORD[r8],eax
+ add edx,DWORD[12+r8]
+ mov DWORD[4+r8],esi
+ add ebp,DWORD[16+r8]
+ mov DWORD[8+r8],ecx
+ mov DWORD[12+r8],edx
+ mov DWORD[16+r8],ebp
+ movaps xmm6,XMMWORD[((-40-96))+r14]
+ movaps xmm7,XMMWORD[((-40-80))+r14]
+ movaps xmm8,XMMWORD[((-40-64))+r14]
+ movaps xmm9,XMMWORD[((-40-48))+r14]
+ movaps xmm10,XMMWORD[((-40-32))+r14]
+ movaps xmm11,XMMWORD[((-40-16))+r14]
+ lea rsi,[r14]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha1_block_data_order_avx:
ALIGN 64
K_XX_XX:
DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -2605,6 +3750,9 @@ ALIGN 4
DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha1_block_data_order:
@@ -2614,3 +3762,7 @@ $L$SEH_info_sha1_block_data_order_ssse3:
DB 9,0,0,0
DD ssse3_handler wrt ..imagebase
DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_avx:
+DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
diff --git a/win-x86_64/crypto/sha/sha256-x86_64.asm b/win-x86_64/crypto/sha/sha256-x86_64.asm
index e6193c5..efaf9b5 100644
--- a/win-x86_64/crypto/sha/sha256-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha256-x86_64.asm
@@ -23,6 +23,11 @@ $L$SEH_begin_sha256_block_data_order:
mov r9d,DWORD[r11]
mov r10d,DWORD[4+r11]
mov r11d,DWORD[8+r11]
+ and r9d,1073741824
+ and r10d,268435968
+ or r10d,r9d
+ cmp r10d,1342177792
+ je NEAR $L$avx_shortcut
test r10d,512
jnz NEAR $L$ssse3_shortcut
push rbx
@@ -2877,6 +2882,1082 @@ $L$epilogue_ssse3:
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_sha256_block_data_order_ssse3:
+
+ALIGN 64
+sha256_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+$L$avx_shortcut:
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r11,rsp
+ shl rdx,4
+ sub rsp,160
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[((64+24))+rsp],r11
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx:
+
+ vzeroupper
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ vmovdqa xmm8,XMMWORD[((K256+512+32))]
+ vmovdqa xmm9,XMMWORD[((K256+512+64))]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm7,XMMWORD[((K256+512))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm0,xmm0,xmm7
+ lea rbp,[K256]
+ vpshufb xmm1,xmm1,xmm7
+ vpshufb xmm2,xmm2,xmm7
+ vpaddd xmm4,xmm0,XMMWORD[rbp]
+ vpshufb xmm3,xmm3,xmm7
+ vpaddd xmm5,xmm1,XMMWORD[32+rbp]
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp]
+ vpaddd xmm7,xmm3,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[rsp],xmm4
+ mov r14d,eax
+ vmovdqa XMMWORD[16+rsp],xmm5
+ mov edi,ebx
+ vmovdqa XMMWORD[32+rsp],xmm6
+ xor edi,ecx
+ vmovdqa XMMWORD[48+rsp],xmm7
+ mov r13d,r8d
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47:
+ sub rbp,-128
+ vpalignr xmm4,xmm1,xmm0,4
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ vpalignr xmm7,xmm3,xmm2,4
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ vpaddd xmm0,xmm0,xmm7
+ xor r13d,r8d
+ add r11d,DWORD[rsp]
+ mov r15d,eax
+ vpsrld xmm7,xmm4,3
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ vpslld xmm5,xmm4,14
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ vpshufd xmm7,xmm3,250
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ vpsrld xmm6,xmm7,10
+ add r10d,DWORD[4+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ vpaddd xmm0,xmm0,xmm4
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ vpsrlq xmm7,xmm7,2
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ vpaddd xmm0,xmm0,xmm6
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp]
+ vpshufd xmm7,xmm0,80
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ vpsrlq xmm7,xmm7,2
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ vpaddd xmm0,xmm0,xmm6
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ vpaddd xmm6,xmm0,XMMWORD[rbp]
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ vmovdqa XMMWORD[rsp],xmm6
+ vpalignr xmm4,xmm2,xmm1,4
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ vpalignr xmm7,xmm0,xmm3,4
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ vpaddd xmm1,xmm1,xmm7
+ xor r13d,eax
+ add edx,DWORD[16+rsp]
+ mov r15d,r8d
+ vpsrld xmm7,xmm4,3
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ vpslld xmm5,xmm4,14
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ vpshufd xmm7,xmm0,250
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ vpsrld xmm6,xmm7,10
+ add ecx,DWORD[20+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ vpaddd xmm1,xmm1,xmm4
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ vpsrlq xmm7,xmm7,2
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ vpaddd xmm1,xmm1,xmm6
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp]
+ vpshufd xmm7,xmm1,80
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ vpsrlq xmm7,xmm7,2
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ vpaddd xmm1,xmm1,xmm6
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ vpaddd xmm6,xmm1,XMMWORD[32+rbp]
+ xor r13d,r9d
+ add eax,DWORD[28+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ vmovdqa XMMWORD[16+rsp],xmm6
+ vpalignr xmm4,xmm3,xmm2,4
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ vpalignr xmm7,xmm1,xmm0,4
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ vpaddd xmm2,xmm2,xmm7
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp]
+ mov r15d,eax
+ vpsrld xmm7,xmm4,3
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ vpslld xmm5,xmm4,14
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ vpshufd xmm7,xmm1,250
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ vpsrld xmm6,xmm7,10
+ add r10d,DWORD[36+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ vpaddd xmm2,xmm2,xmm4
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ vpsrlq xmm7,xmm7,2
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ vpaddd xmm2,xmm2,xmm6
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp]
+ vpshufd xmm7,xmm2,80
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ vpsrlq xmm7,xmm7,2
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ vpaddd xmm2,xmm2,xmm6
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp]
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ vmovdqa XMMWORD[32+rsp],xmm6
+ vpalignr xmm4,xmm0,xmm3,4
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ vpalignr xmm7,xmm2,xmm1,4
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ vpsrld xmm6,xmm4,7
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ vpaddd xmm3,xmm3,xmm7
+ xor r13d,eax
+ add edx,DWORD[48+rsp]
+ mov r15d,r8d
+ vpsrld xmm7,xmm4,3
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ vpslld xmm5,xmm4,14
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ vpxor xmm4,xmm7,xmm6
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ vpshufd xmm7,xmm2,250
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ vpsrld xmm6,xmm6,11
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ vpxor xmm4,xmm4,xmm5
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ vpslld xmm5,xmm5,11
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ vpxor xmm4,xmm4,xmm6
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ vpsrld xmm6,xmm7,10
+ add ecx,DWORD[52+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ vpxor xmm4,xmm4,xmm5
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ vpsrlq xmm7,xmm7,17
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ vpaddd xmm3,xmm3,xmm4
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ vpxor xmm6,xmm6,xmm7
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ vpsrlq xmm7,xmm7,2
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ vpxor xmm6,xmm6,xmm7
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ vpshufb xmm6,xmm6,xmm8
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ vpaddd xmm3,xmm3,xmm6
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp]
+ vpshufd xmm7,xmm3,80
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ vpsrld xmm6,xmm7,10
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ vpsrlq xmm7,xmm7,17
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ vpxor xmm6,xmm6,xmm7
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ vpsrlq xmm7,xmm7,2
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ vpxor xmm6,xmm6,xmm7
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ vpshufb xmm6,xmm6,xmm9
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ vpaddd xmm3,xmm3,xmm6
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ vpaddd xmm6,xmm3,XMMWORD[96+rbp]
+ xor r13d,r9d
+ add eax,DWORD[60+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ vmovdqa XMMWORD[48+rsp],xmm6
+ cmp BYTE[131+rbp],0
+ jne NEAR $L$avx_00_47
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[4+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[8+rsp]
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[12+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[16+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[20+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[24+rsp]
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[28+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ shrd r13d,r13d,14
+ mov eax,r14d
+ mov r12d,r9d
+ shrd r14d,r14d,9
+ xor r13d,r8d
+ xor r12d,r10d
+ shrd r13d,r13d,5
+ xor r14d,eax
+ and r12d,r8d
+ xor r13d,r8d
+ add r11d,DWORD[32+rsp]
+ mov r15d,eax
+ xor r12d,r10d
+ shrd r14d,r14d,11
+ xor r15d,ebx
+ add r11d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,eax
+ add r11d,r13d
+ xor edi,ebx
+ shrd r14d,r14d,2
+ add edx,r11d
+ add r11d,edi
+ mov r13d,edx
+ add r14d,r11d
+ shrd r13d,r13d,14
+ mov r11d,r14d
+ mov r12d,r8d
+ shrd r14d,r14d,9
+ xor r13d,edx
+ xor r12d,r9d
+ shrd r13d,r13d,5
+ xor r14d,r11d
+ and r12d,edx
+ xor r13d,edx
+ add r10d,DWORD[36+rsp]
+ mov edi,r11d
+ xor r12d,r9d
+ shrd r14d,r14d,11
+ xor edi,eax
+ add r10d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r11d
+ add r10d,r13d
+ xor r15d,eax
+ shrd r14d,r14d,2
+ add ecx,r10d
+ add r10d,r15d
+ mov r13d,ecx
+ add r14d,r10d
+ shrd r13d,r13d,14
+ mov r10d,r14d
+ mov r12d,edx
+ shrd r14d,r14d,9
+ xor r13d,ecx
+ xor r12d,r8d
+ shrd r13d,r13d,5
+ xor r14d,r10d
+ and r12d,ecx
+ xor r13d,ecx
+ add r9d,DWORD[40+rsp]
+ mov r15d,r10d
+ xor r12d,r8d
+ shrd r14d,r14d,11
+ xor r15d,r11d
+ add r9d,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r10d
+ add r9d,r13d
+ xor edi,r11d
+ shrd r14d,r14d,2
+ add ebx,r9d
+ add r9d,edi
+ mov r13d,ebx
+ add r14d,r9d
+ shrd r13d,r13d,14
+ mov r9d,r14d
+ mov r12d,ecx
+ shrd r14d,r14d,9
+ xor r13d,ebx
+ xor r12d,edx
+ shrd r13d,r13d,5
+ xor r14d,r9d
+ and r12d,ebx
+ xor r13d,ebx
+ add r8d,DWORD[44+rsp]
+ mov edi,r9d
+ xor r12d,edx
+ shrd r14d,r14d,11
+ xor edi,r10d
+ add r8d,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,r9d
+ add r8d,r13d
+ xor r15d,r10d
+ shrd r14d,r14d,2
+ add eax,r8d
+ add r8d,r15d
+ mov r13d,eax
+ add r14d,r8d
+ shrd r13d,r13d,14
+ mov r8d,r14d
+ mov r12d,ebx
+ shrd r14d,r14d,9
+ xor r13d,eax
+ xor r12d,ecx
+ shrd r13d,r13d,5
+ xor r14d,r8d
+ and r12d,eax
+ xor r13d,eax
+ add edx,DWORD[48+rsp]
+ mov r15d,r8d
+ xor r12d,ecx
+ shrd r14d,r14d,11
+ xor r15d,r9d
+ add edx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,r8d
+ add edx,r13d
+ xor edi,r9d
+ shrd r14d,r14d,2
+ add r11d,edx
+ add edx,edi
+ mov r13d,r11d
+ add r14d,edx
+ shrd r13d,r13d,14
+ mov edx,r14d
+ mov r12d,eax
+ shrd r14d,r14d,9
+ xor r13d,r11d
+ xor r12d,ebx
+ shrd r13d,r13d,5
+ xor r14d,edx
+ and r12d,r11d
+ xor r13d,r11d
+ add ecx,DWORD[52+rsp]
+ mov edi,edx
+ xor r12d,ebx
+ shrd r14d,r14d,11
+ xor edi,r8d
+ add ecx,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,edx
+ add ecx,r13d
+ xor r15d,r8d
+ shrd r14d,r14d,2
+ add r10d,ecx
+ add ecx,r15d
+ mov r13d,r10d
+ add r14d,ecx
+ shrd r13d,r13d,14
+ mov ecx,r14d
+ mov r12d,r11d
+ shrd r14d,r14d,9
+ xor r13d,r10d
+ xor r12d,eax
+ shrd r13d,r13d,5
+ xor r14d,ecx
+ and r12d,r10d
+ xor r13d,r10d
+ add ebx,DWORD[56+rsp]
+ mov r15d,ecx
+ xor r12d,eax
+ shrd r14d,r14d,11
+ xor r15d,edx
+ add ebx,r12d
+ shrd r13d,r13d,6
+ and edi,r15d
+ xor r14d,ecx
+ add ebx,r13d
+ xor edi,edx
+ shrd r14d,r14d,2
+ add r9d,ebx
+ add ebx,edi
+ mov r13d,r9d
+ add r14d,ebx
+ shrd r13d,r13d,14
+ mov ebx,r14d
+ mov r12d,r10d
+ shrd r14d,r14d,9
+ xor r13d,r9d
+ xor r12d,r11d
+ shrd r13d,r13d,5
+ xor r14d,ebx
+ and r12d,r9d
+ xor r13d,r9d
+ add eax,DWORD[60+rsp]
+ mov edi,ebx
+ xor r12d,r11d
+ shrd r14d,r14d,11
+ xor edi,ecx
+ add eax,r12d
+ shrd r13d,r13d,6
+ and r15d,edi
+ xor r14d,ebx
+ add eax,r13d
+ xor r15d,ecx
+ shrd r14d,r14d,2
+ add r8d,eax
+ add eax,r15d
+ mov r13d,r8d
+ add r14d,eax
+ mov rdi,QWORD[((64+0))+rsp]
+ mov eax,r14d
+
+ add eax,DWORD[rdi]
+ lea rsi,[64+rsi]
+ add ebx,DWORD[4+rdi]
+ add ecx,DWORD[8+rdi]
+ add edx,DWORD[12+rdi]
+ add r8d,DWORD[16+rdi]
+ add r9d,DWORD[20+rdi]
+ add r10d,DWORD[24+rdi]
+ add r11d,DWORD[28+rdi]
+
+ cmp rsi,QWORD[((64+16))+rsp]
+
+ mov DWORD[rdi],eax
+ mov DWORD[4+rdi],ebx
+ mov DWORD[8+rdi],ecx
+ mov DWORD[12+rdi],edx
+ mov DWORD[16+rdi],r8d
+ mov DWORD[20+rdi],r9d
+ mov DWORD[24+rdi],r10d
+ mov DWORD[28+rdi],r11d
+ jb NEAR $L$loop_avx
+
+ mov rsi,QWORD[((64+24))+rsp]
+ vzeroupper
+ movaps xmm6,XMMWORD[((64+32))+rsp]
+ movaps xmm7,XMMWORD[((64+48))+rsp]
+ movaps xmm8,XMMWORD[((64+64))+rsp]
+ movaps xmm9,XMMWORD[((64+80))+rsp]
+ mov r15,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r13,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ mov rbp,QWORD[32+rsi]
+ mov rbx,QWORD[40+rsi]
+ lea rsp,[48+rsi]
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha256_block_data_order_avx:
EXTERN __imp_RtlVirtualUnwind
ALIGN 16
@@ -2982,6 +4063,9 @@ ALIGN 4
DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha256_block_data_order:
@@ -2992,3 +4076,7 @@ $L$SEH_info_sha256_block_data_order_ssse3:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_avx:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
diff --git a/win-x86_64/crypto/sha/sha512-x86_64.asm b/win-x86_64/crypto/sha/sha512-x86_64.asm
index b76cc0e..71449cd 100644
--- a/win-x86_64/crypto/sha/sha512-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha512-x86_64.asm
@@ -19,6 +19,17 @@ $L$SEH_begin_sha512_block_data_order:
mov rdx,r8
+ lea r11,[OPENSSL_ia32cap_P]
+ mov r9d,DWORD[r11]
+ mov r10d,DWORD[4+r11]
+ mov r11d,DWORD[8+r11]
+ test r10d,2048
+ jnz NEAR $L$xop_shortcut
+ and r9d,1073741824
+ and r10d,268435968
+ or r10d,r9d
+ cmp r10d,1342177792
+ je NEAR $L$avx_shortcut
push rbx
push rbp
push r12
@@ -1801,6 +1812,2282 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54
DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
DB 111,114,103,62,0
+
+ALIGN 64
+sha512_block_data_order_xop:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_xop:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+$L$xop_shortcut:
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r11,rsp
+ shl rdx,4
+ sub rsp,256
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[((128+24))+rsp],r11
+ movaps XMMWORD[(128+32)+rsp],xmm6
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_xop:
+
+ vzeroupper
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_xop
+ALIGN 16
+$L$loop_xop:
+ vmovdqa xmm11,XMMWORD[((K512+1280))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ lea rbp,[((K512+128))]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
+ vmovdqa XMMWORD[rsp],xmm8
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8
+ jmp NEAR $L$xop_00_47
+
+ALIGN 16
+$L$xop_00_47:
+ add rbp,256
+ vpalignr xmm8,xmm1,xmm0,8
+ ror r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm5,xmm4,8
+ mov r12,r9
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r8
+ xor r12,r10
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rax
+ vpaddq xmm0,xmm0,xmm11
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+DB 143,72,120,195,209,7
+ xor r12,r10
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,223,3
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ ror r14,28
+ vpsrlq xmm10,xmm7,6
+ add rdx,r11
+ add r11,rdi
+ vpaddq xmm0,xmm0,xmm8
+ mov r13,rdx
+ add r14,r11
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r11,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ vpaddq xmm0,xmm0,xmm11
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10
+ vpalignr xmm8,xmm2,xmm1,8
+ ror r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rcx
+ xor r12,r8
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r10
+ vpaddq xmm1,xmm1,xmm11
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+DB 143,72,120,195,209,7
+ xor r12,r8
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,216,3
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ ror r14,28
+ vpsrlq xmm10,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpaddq xmm1,xmm1,xmm8
+ mov r13,rbx
+ add r14,r9
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r9,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ vpaddq xmm1,xmm1,xmm11
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10
+ vpalignr xmm8,xmm3,xmm2,8
+ ror r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rax
+ xor r12,rcx
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r8
+ vpaddq xmm2,xmm2,xmm11
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+DB 143,72,120,195,209,7
+ xor r12,rcx
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,217,3
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ ror r14,28
+ vpsrlq xmm10,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpaddq xmm2,xmm2,xmm8
+ mov r13,r11
+ add r14,rdx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rdx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ vpaddq xmm2,xmm2,xmm11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpalignr xmm8,xmm4,xmm3,8
+ ror r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r10
+ xor r12,rax
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rcx
+ vpaddq xmm3,xmm3,xmm11
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+DB 143,72,120,195,209,7
+ xor r12,rax
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,218,3
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ ror r14,28
+ vpsrlq xmm10,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpaddq xmm3,xmm3,xmm8
+ mov r13,r9
+ add r14,rbx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rbx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ vpaddq xmm3,xmm3,xmm11
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10
+ vpalignr xmm8,xmm5,xmm4,8
+ ror r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r8
+ xor r12,r10
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rax
+ vpaddq xmm4,xmm4,xmm11
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+DB 143,72,120,195,209,7
+ xor r12,r10
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,219,3
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ ror r14,28
+ vpsrlq xmm10,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpaddq xmm4,xmm4,xmm8
+ mov r13,rdx
+ add r14,r11
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r11,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ vpaddq xmm4,xmm4,xmm11
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10
+ vpalignr xmm8,xmm6,xmm5,8
+ ror r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rcx
+ xor r12,r8
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r10
+ vpaddq xmm5,xmm5,xmm11
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+DB 143,72,120,195,209,7
+ xor r12,r8
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,220,3
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ ror r14,28
+ vpsrlq xmm10,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpaddq xmm5,xmm5,xmm8
+ mov r13,rbx
+ add r14,r9
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov r9,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ vpaddq xmm5,xmm5,xmm11
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10
+ vpalignr xmm8,xmm7,xmm6,8
+ ror r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,rax
+ xor r12,rcx
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,r8
+ vpaddq xmm6,xmm6,xmm11
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+DB 143,72,120,195,209,7
+ xor r12,rcx
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,221,3
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ ror r14,28
+ vpsrlq xmm10,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpaddq xmm6,xmm6,xmm8
+ mov r13,r11
+ add r14,rdx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rdx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ vpaddq xmm6,xmm6,xmm11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ vpalignr xmm8,xmm0,xmm7,8
+ ror r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ ror r14,5
+DB 143,72,120,195,200,56
+ xor r13,r10
+ xor r12,rax
+ vpsrlq xmm8,xmm8,7
+ ror r13,4
+ xor r14,rcx
+ vpaddq xmm7,xmm7,xmm11
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+DB 143,72,120,195,209,7
+ xor r12,rax
+ ror r14,6
+ vpxor xmm8,xmm8,xmm9
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+DB 143,104,120,195,222,3
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ ror r14,28
+ vpsrlq xmm10,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpaddq xmm7,xmm7,xmm8
+ mov r13,r9
+ add r14,rbx
+DB 143,72,120,195,203,42
+ ror r13,23
+ mov rbx,r14
+ vpxor xmm11,xmm11,xmm10
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm9
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ vpaddq xmm7,xmm7,xmm11
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10
+ cmp BYTE[135+rbp],0
+ jne NEAR $L$xop_00_47
+ ror r13,23
+ mov rax,r14
+ mov r12,r9
+ ror r14,5
+ xor r13,r8
+ xor r12,r10
+ ror r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ ror r14,6
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ ror r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ ror r13,23
+ mov r11,r14
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ ror r13,23
+ mov r10,r14
+ mov r12,rdx
+ ror r14,5
+ xor r13,rcx
+ xor r12,r8
+ ror r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ ror r14,6
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ ror r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ ror r13,23
+ mov r9,r14
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ ror r13,23
+ mov r8,r14
+ mov r12,rbx
+ ror r14,5
+ xor r13,rax
+ xor r12,rcx
+ ror r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ ror r14,6
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ ror r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ ror r13,23
+ mov rdx,r14
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ ror r13,23
+ mov rcx,r14
+ mov r12,r11
+ ror r14,5
+ xor r13,r10
+ xor r12,rax
+ ror r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ ror r14,6
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ ror r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ ror r13,23
+ mov rbx,r14
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ ror r13,23
+ mov rax,r14
+ mov r12,r9
+ ror r14,5
+ xor r13,r8
+ xor r12,r10
+ ror r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ ror r14,6
+ xor r15,rbx
+ add r11,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ ror r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ ror r13,23
+ mov r11,r14
+ mov r12,r8
+ ror r14,5
+ xor r13,rdx
+ xor r12,r9
+ ror r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ ror r14,6
+ xor rdi,rax
+ add r10,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ ror r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ ror r13,23
+ mov r10,r14
+ mov r12,rdx
+ ror r14,5
+ xor r13,rcx
+ xor r12,r8
+ ror r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ ror r14,6
+ xor r15,r11
+ add r9,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ ror r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ ror r13,23
+ mov r9,r14
+ mov r12,rcx
+ ror r14,5
+ xor r13,rbx
+ xor r12,rdx
+ ror r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ ror r14,6
+ xor rdi,r10
+ add r8,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ ror r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ ror r13,23
+ mov r8,r14
+ mov r12,rbx
+ ror r14,5
+ xor r13,rax
+ xor r12,rcx
+ ror r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ ror r14,6
+ xor r15,r9
+ add rdx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ ror r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ ror r13,23
+ mov rdx,r14
+ mov r12,rax
+ ror r14,5
+ xor r13,r11
+ xor r12,rbx
+ ror r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ ror r14,6
+ xor rdi,r8
+ add rcx,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ ror r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ ror r13,23
+ mov rcx,r14
+ mov r12,r11
+ ror r14,5
+ xor r13,r10
+ xor r12,rax
+ ror r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ ror r14,6
+ xor r15,rdx
+ add rbx,r12
+ ror r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ ror r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ ror r13,23
+ mov rbx,r14
+ mov r12,r10
+ ror r14,5
+ xor r13,r9
+ xor r12,r11
+ ror r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ ror r14,6
+ xor rdi,rcx
+ add rax,r12
+ ror r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ ror r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ mov rdi,QWORD[((128+0))+rsp]
+ mov rax,r14
+
+ add rax,QWORD[rdi]
+ lea rsi,[128+rsi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_xop
+
+ mov rsi,QWORD[((128+24))+rsp]
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp]
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r13,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ mov rbp,QWORD[32+rsi]
+ mov rbx,QWORD[40+rsi]
+ lea rsp,[48+rsi]
+$L$epilogue_xop:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha512_block_data_order_xop:
+
+ALIGN 64
+sha512_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha512_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+$L$avx_shortcut:
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r11,rsp
+ shl rdx,4
+ sub rsp,256
+ lea rdx,[rdx*8+rsi]
+ and rsp,-64
+ mov QWORD[((128+0))+rsp],rdi
+ mov QWORD[((128+8))+rsp],rsi
+ mov QWORD[((128+16))+rsp],rdx
+ mov QWORD[((128+24))+rsp],r11
+ movaps XMMWORD[(128+32)+rsp],xmm6
+ movaps XMMWORD[(128+48)+rsp],xmm7
+ movaps XMMWORD[(128+64)+rsp],xmm8
+ movaps XMMWORD[(128+80)+rsp],xmm9
+ movaps XMMWORD[(128+96)+rsp],xmm10
+ movaps XMMWORD[(128+112)+rsp],xmm11
+$L$prologue_avx:
+
+ vzeroupper
+ mov rax,QWORD[rdi]
+ mov rbx,QWORD[8+rdi]
+ mov rcx,QWORD[16+rdi]
+ mov rdx,QWORD[24+rdi]
+ mov r8,QWORD[32+rdi]
+ mov r9,QWORD[40+rdi]
+ mov r10,QWORD[48+rdi]
+ mov r11,QWORD[56+rdi]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm11,XMMWORD[((K512+1280))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ lea rbp,[((K512+128))]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vpshufb xmm0,xmm0,xmm11
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm1,xmm1,xmm11
+ vmovdqu xmm4,XMMWORD[64+rsi]
+ vpshufb xmm2,xmm2,xmm11
+ vmovdqu xmm5,XMMWORD[80+rsi]
+ vpshufb xmm3,xmm3,xmm11
+ vmovdqu xmm6,XMMWORD[96+rsi]
+ vpshufb xmm4,xmm4,xmm11
+ vmovdqu xmm7,XMMWORD[112+rsi]
+ vpshufb xmm5,xmm5,xmm11
+ vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
+ vpshufb xmm6,xmm6,xmm11
+ vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
+ vpshufb xmm7,xmm7,xmm11
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
+ vmovdqa XMMWORD[rsp],xmm8
+ vpaddq xmm8,xmm4,XMMWORD[rbp]
+ vmovdqa XMMWORD[16+rsp],xmm9
+ vpaddq xmm9,xmm5,XMMWORD[32+rbp]
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ vmovdqa XMMWORD[48+rsp],xmm11
+ vpaddq xmm11,xmm7,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[64+rsp],xmm8
+ mov r14,rax
+ vmovdqa XMMWORD[80+rsp],xmm9
+ mov rdi,rbx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ xor rdi,rcx
+ vmovdqa XMMWORD[112+rsp],xmm11
+ mov r13,r8
+ jmp NEAR $L$avx_00_47
+
+ALIGN 16
+$L$avx_00_47:
+ add rbp,256
+ vpalignr xmm8,xmm1,xmm0,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm5,xmm4,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm0,xmm0,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm7,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm7,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm0,xmm0,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm7,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm0,xmm0,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[rsp],xmm10
+ vpalignr xmm8,xmm2,xmm1,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm6,xmm5,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm1,xmm1,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm0,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm0,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm1,xmm1,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm0,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm1,xmm1,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10
+ vpalignr xmm8,xmm3,xmm2,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm2,xmm2,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm1,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm2,xmm2,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm1,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm2,xmm2,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpalignr xmm8,xmm4,xmm3,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm3,xmm3,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm2,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm3,xmm3,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm2,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm3,xmm3,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10
+ vpalignr xmm8,xmm5,xmm4,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm4,xmm4,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm3,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm4,xmm4,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm3,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm4,xmm4,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10
+ vpalignr xmm8,xmm6,xmm5,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm5,xmm5,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm4,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm5,xmm5,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm4,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm5,xmm5,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10
+ vpalignr xmm8,xmm7,xmm6,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm6,xmm6,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm5,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm6,xmm6,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm5,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm6,xmm6,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ vpalignr xmm8,xmm0,xmm7,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm7,xmm7,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm6,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm7,xmm7,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm6,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm7,xmm7,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10
+ cmp BYTE[135+rbp],0
+ jne NEAR $L$avx_00_47
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ mov rdi,QWORD[((128+0))+rsp]
+ mov rax,r14
+
+ add rax,QWORD[rdi]
+ lea rsi,[128+rsi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_avx
+
+ mov rsi,QWORD[((128+24))+rsp]
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp]
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r13,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ mov rbp,QWORD[32+rsi]
+ mov rbx,QWORD[40+rsi]
+ lea rsp,[48+rsi]
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha512_block_data_order_avx:
EXTERN __imp_RtlVirtualUnwind
ALIGN 16
@@ -1903,9 +4190,23 @@ ALIGN 4
DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase
DD $L$SEH_end_sha512_block_data_order wrt ..imagebase
DD $L$SEH_info_sha512_block_data_order wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_sha512_block_data_order:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_xop:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase