diff options
Diffstat (limited to 'win-x86_64/crypto')
-rw-r--r-- | win-x86_64/crypto/bn/x86_64-mont5.asm | 9 | ||||
-rw-r--r-- | win-x86_64/crypto/ec/p256-x86_64-asm.asm | 285 | ||||
-rw-r--r-- | win-x86_64/crypto/rc4/rc4-md5-x86_64.asm | 1372 | ||||
-rw-r--r-- | win-x86_64/crypto/sha/sha1-x86_64.asm | 1152 | ||||
-rw-r--r-- | win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 | ||||
-rw-r--r-- | win-x86_64/crypto/sha/sha512-x86_64.asm | 2301 |
6 files changed, 4550 insertions, 1657 deletions
diff --git a/win-x86_64/crypto/bn/x86_64-mont5.asm b/win-x86_64/crypto/bn/x86_64-mont5.asm index 284318a..560b384 100644 --- a/win-x86_64/crypto/bn/x86_64-mont5.asm +++ b/win-x86_64/crypto/bn/x86_64-mont5.asm @@ -1616,6 +1616,15 @@ $L$8x_tail: ALIGN 32 $L$8x_tail_done: add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + xor rax,rax neg rsi diff --git a/win-x86_64/crypto/ec/p256-x86_64-asm.asm b/win-x86_64/crypto/ec/p256-x86_64-asm.asm index c9789f5..45ba686 100644 --- a/win-x86_64/crypto/ec/p256-x86_64-asm.asm +++ b/win-x86_64/crypto/ec/p256-x86_64-asm.asm @@ -11,10 +11,6 @@ ALIGN 64 $L$poly: DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 - -$L$RR: - DQ 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd - $L$One: DD 1,1,1,1,1,1,1,1 $L$Two: @@ -24,7 +20,6 @@ $L$Three: $L$ONE_mont: DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe -global ecp_nistz256_mul_by_2 ALIGN 64 ecp_nistz256_mul_by_2: @@ -78,266 +73,6 @@ $L$SEH_end_ecp_nistz256_mul_by_2: -global ecp_nistz256_div_by_2 - -ALIGN 32 -ecp_nistz256_div_by_2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_div_by_2: - mov rdi,rcx - mov rsi,rdx - - - push r12 - push r13 - - mov r8,QWORD[rsi] - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov rax,r8 - mov r11,QWORD[24+rsi] - lea rsi,[$L$poly] - - mov rdx,r9 - xor r13,r13 - add r8,QWORD[rsi] - mov rcx,r10 - adc r9,QWORD[8+rsi] - adc r10,QWORD[16+rsi] - mov r12,r11 - adc r11,QWORD[24+rsi] - adc r13,0 - xor rsi,rsi - test rax,1 - - cmovz r8,rax - cmovz r9,rdx - cmovz r10,rcx - cmovz r11,r12 - cmovz r13,rsi - - mov rax,r9 - shr r8,1 - shl rax,63 - mov rdx,r10 - shr r9,1 - or r8,rax - shl rdx,63 - mov rcx,r11 - shr r10,1 - or r9,rdx - shl rcx,63 - shr r11,1 - shl r13,63 - or r10,rcx - or r11,r13 - - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - - pop r13 - pop r12 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_div_by_2: - - - -global ecp_nistz256_mul_by_3 - -ALIGN 32 -ecp_nistz256_mul_by_3: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_mul_by_3: - mov rdi,rcx - mov rsi,rdx - - - push r12 - push r13 - - mov r8,QWORD[rsi] - xor r13,r13 - mov r9,QWORD[8+rsi] - add r8,r8 - mov r10,QWORD[16+rsi] - adc r9,r9 - mov r11,QWORD[24+rsi] - mov rax,r8 - adc r10,r10 - adc r11,r11 - mov rdx,r9 - adc r13,0 - - sub r8,-1 - mov rcx,r10 - sbb r9,QWORD[(($L$poly+8))] - sbb r10,0 - mov r12,r11 - sbb r11,QWORD[(($L$poly+24))] - test r13,r13 - - cmovz r8,rax - cmovz r9,rdx - cmovz r10,rcx - cmovz r11,r12 - - xor r13,r13 - add r8,QWORD[rsi] - adc r9,QWORD[8+rsi] - mov rax,r8 - adc r10,QWORD[16+rsi] - adc r11,QWORD[24+rsi] - mov rdx,r9 - adc r13,0 - - sub r8,-1 - mov rcx,r10 - sbb r9,QWORD[(($L$poly+8))] - sbb r10,0 - mov r12,r11 - sbb r11,QWORD[(($L$poly+24))] - test r13,r13 - - cmovz r8,rax - cmovz r9,rdx - mov QWORD[rdi],r8 - cmovz r10,rcx - mov QWORD[8+rdi],r9 - cmovz r11,r12 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - - pop r13 - pop r12 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_mul_by_3: - - - -global ecp_nistz256_add - -ALIGN 32 -ecp_nistz256_add: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_add: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - push r12 - push r13 - - mov r8,QWORD[rsi] - xor r13,r13 - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - lea rsi,[$L$poly] - - add r8,QWORD[rdx] - adc r9,QWORD[8+rdx] - mov rax,r8 - adc r10,QWORD[16+rdx] - adc r11,QWORD[24+rdx] - mov rdx,r9 - adc r13,0 - - sub r8,QWORD[rsi] - mov rcx,r10 - sbb r9,QWORD[8+rsi] - sbb r10,QWORD[16+rsi] - mov r12,r11 - sbb r11,QWORD[24+rsi] - test r13,r13 - - cmovz r8,rax - cmovz r9,rdx - mov QWORD[rdi],r8 - cmovz r10,rcx - mov QWORD[8+rdi],r9 - cmovz r11,r12 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - - pop r13 - pop r12 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_add: - - - -global ecp_nistz256_sub - -ALIGN 32 -ecp_nistz256_sub: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_sub: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - push r12 - push r13 - - mov r8,QWORD[rsi] - xor r13,r13 - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - lea rsi,[$L$poly] - - sub r8,QWORD[rdx] - sbb r9,QWORD[8+rdx] - mov rax,r8 - sbb r10,QWORD[16+rdx] - sbb r11,QWORD[24+rdx] - mov rdx,r9 - sbb r13,0 - - add r8,QWORD[rsi] - mov rcx,r10 - adc r9,QWORD[8+rsi] - adc r10,QWORD[16+rsi] - mov r12,r11 - adc r11,QWORD[24+rsi] - test r13,r13 - - cmovz r8,rax - cmovz r9,rdx - mov QWORD[rdi],r8 - cmovz r10,rcx - mov QWORD[8+rdi],r9 - cmovz r11,r12 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - - pop r13 - pop r12 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_sub: - - - global ecp_nistz256_neg ALIGN 32 @@ -395,26 +130,6 @@ $L$SEH_end_ecp_nistz256_neg: -global ecp_nistz256_to_mont - -ALIGN 32 -ecp_nistz256_to_mont: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_to_mont: - mov rdi,rcx - mov rsi,rdx - - - lea rdx,[$L$RR] - jmp NEAR $L$mul_mont -$L$SEH_end_ecp_nistz256_to_mont: - - - - - global ecp_nistz256_mul_mont diff --git a/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm b/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm deleted file mode 100644 index f1ea965..0000000 --- a/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm +++ /dev/null @@ -1,1372 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - -ALIGN 16 - -global rc4_md5_enc - -rc4_md5_enc: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rc4_md5_enc: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - cmp r9,0 - je NEAR $L$abort - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - sub rsp,40 -$L$body: - mov r11,rcx - mov r12,r9 - mov r13,rsi - mov r14,rdx - mov r15,r8 - xor rbp,rbp - xor rcx,rcx - - lea rdi,[8+rdi] - mov bpl,BYTE[((-8))+rdi] - mov cl,BYTE[((-4))+rdi] - - inc bpl - sub r14,r13 - mov eax,DWORD[rbp*4+rdi] - add cl,al - lea rsi,[rbp*4+rdi] - shl r12,6 - add r12,r15 - mov QWORD[16+rsp],r12 - - mov QWORD[24+rsp],r11 - mov r8d,DWORD[r11] - mov r9d,DWORD[4+r11] - mov r10d,DWORD[8+r11] - mov r11d,DWORD[12+r11] - jmp NEAR $L$oop - -ALIGN 16 -$L$oop: - mov DWORD[rsp],r8d - mov DWORD[4+rsp],r9d - mov DWORD[8+rsp],r10d - mov r12d,r11d - mov DWORD[12+rsp],r11d - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[r15] - add al,dl - mov ebx,DWORD[4+rsi] - add r8d,3614090360 - xor r12d,r11d - movzx eax,al - mov DWORD[rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[4+r15] - add bl,dl - mov eax,DWORD[8+rsi] - add r11d,3905402710 - xor r12d,r10d - movzx ebx,bl - mov DWORD[4+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[8+r15] - add al,dl - mov ebx,DWORD[12+rsi] - add r10d,606105819 - xor r12d,r9d - movzx eax,al - mov DWORD[8+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[12+r15] - add bl,dl - mov eax,DWORD[16+rsi] - add r9d,3250441966 - xor r12d,r8d - movzx ebx,bl - mov DWORD[12+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[16+r15] - add al,dl - mov ebx,DWORD[20+rsi] - add r8d,4118548399 - xor r12d,r11d - movzx eax,al - mov DWORD[16+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[20+r15] - add bl,dl - mov eax,DWORD[24+rsi] - add r11d,1200080426 - xor r12d,r10d - movzx ebx,bl - mov DWORD[20+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[24+r15] - add al,dl - mov ebx,DWORD[28+rsi] - add r10d,2821735955 - xor r12d,r9d - movzx eax,al - mov DWORD[24+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[28+r15] - add bl,dl - mov eax,DWORD[32+rsi] - add r9d,4249261313 - xor r12d,r8d - movzx ebx,bl - mov DWORD[28+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[32+r15] - add al,dl - mov ebx,DWORD[36+rsi] - add r8d,1770035416 - xor r12d,r11d - movzx eax,al - mov DWORD[32+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[36+r15] - add bl,dl - mov eax,DWORD[40+rsi] - add r11d,2336552879 - xor r12d,r10d - movzx ebx,bl - mov DWORD[36+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[40+r15] - add al,dl - mov ebx,DWORD[44+rsi] - add r10d,4294925233 - xor r12d,r9d - movzx eax,al - mov DWORD[40+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[44+r15] - add bl,dl - mov eax,DWORD[48+rsi] - add r9d,2304563134 - xor r12d,r8d - movzx ebx,bl - mov DWORD[44+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[48+r15] - add al,dl - mov ebx,DWORD[52+rsi] - add r8d,1804603682 - xor r12d,r11d - movzx eax,al - mov DWORD[48+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[52+r15] - add bl,dl - mov eax,DWORD[56+rsi] - add r11d,4254626195 - xor r12d,r10d - movzx ebx,bl - mov DWORD[52+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[56+r15] - add al,dl - mov ebx,DWORD[60+rsi] - add r10d,2792965006 - xor r12d,r9d - movzx eax,al - mov DWORD[56+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm2,XMMWORD[r13] - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[60+r15] - add bl,dl - mov eax,DWORD[64+rsi] - add r9d,1236535329 - xor r12d,r8d - movzx ebx,bl - mov DWORD[60+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - psllq xmm1,8 - pxor xmm2,xmm0 - pxor xmm2,xmm1 - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[4+r15] - add al,dl - mov ebx,DWORD[68+rsi] - add r8d,4129170786 - xor r12d,r10d - movzx eax,al - mov DWORD[64+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[24+r15] - add bl,dl - mov eax,DWORD[72+rsi] - add r11d,3225465664 - xor r12d,r9d - movzx ebx,bl - mov DWORD[68+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[44+r15] - add al,dl - mov ebx,DWORD[76+rsi] - add r10d,643717713 - xor r12d,r8d - movzx eax,al - mov DWORD[72+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[r15] - add bl,dl - mov eax,DWORD[80+rsi] - add r9d,3921069994 - xor r12d,r11d - movzx ebx,bl - mov DWORD[76+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[20+r15] - add al,dl - mov ebx,DWORD[84+rsi] - add r8d,3593408605 - xor r12d,r10d - movzx eax,al - mov DWORD[80+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[40+r15] - add bl,dl - mov eax,DWORD[88+rsi] - add r11d,38016083 - xor r12d,r9d - movzx ebx,bl - mov DWORD[84+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[60+r15] - add al,dl - mov ebx,DWORD[92+rsi] - add r10d,3634488961 - xor r12d,r8d - movzx eax,al - mov DWORD[88+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[16+r15] - add bl,dl - mov eax,DWORD[96+rsi] - add r9d,3889429448 - xor r12d,r11d - movzx ebx,bl - mov DWORD[92+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[36+r15] - add al,dl - mov ebx,DWORD[100+rsi] - add r8d,568446438 - xor r12d,r10d - movzx eax,al - mov DWORD[96+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[56+r15] - add bl,dl - mov eax,DWORD[104+rsi] - add r11d,3275163606 - xor r12d,r9d - movzx ebx,bl - mov DWORD[100+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[12+r15] - add al,dl - mov ebx,DWORD[108+rsi] - add r10d,4107603335 - xor r12d,r8d - movzx eax,al - mov DWORD[104+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[32+r15] - add bl,dl - mov eax,DWORD[112+rsi] - add r9d,1163531501 - xor r12d,r11d - movzx ebx,bl - mov DWORD[108+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[52+r15] - add al,dl - mov ebx,DWORD[116+rsi] - add r8d,2850285829 - xor r12d,r10d - movzx eax,al - mov DWORD[112+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[8+r15] - add bl,dl - mov eax,DWORD[120+rsi] - add r11d,4243563512 - xor r12d,r9d - movzx ebx,bl - mov DWORD[116+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[28+r15] - add al,dl - mov ebx,DWORD[124+rsi] - add r10d,1735328473 - xor r12d,r8d - movzx eax,al - mov DWORD[120+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm3,XMMWORD[16+r13] - add bpl,32 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[48+r15] - add bl,dl - mov eax,DWORD[rbp*4+rdi] - add r9d,2368359562 - xor r12d,r11d - movzx ebx,bl - mov DWORD[124+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - mov rsi,rcx - xor rcx,rcx - mov cl,sil - lea rsi,[rbp*4+rdi] - psllq xmm1,8 - pxor xmm3,xmm0 - pxor xmm3,xmm1 - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[20+r15] - add al,dl - mov ebx,DWORD[4+rsi] - add r8d,4294588738 - movzx eax,al - add r8d,r12d - mov DWORD[rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[32+r15] - add bl,dl - mov eax,DWORD[8+rsi] - add r11d,2272392833 - movzx ebx,bl - add r11d,r12d - mov DWORD[4+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[44+r15] - add al,dl - mov ebx,DWORD[12+rsi] - add r10d,1839030562 - movzx eax,al - add r10d,r12d - mov DWORD[8+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[56+r15] - add bl,dl - mov eax,DWORD[16+rsi] - add r9d,4259657740 - movzx ebx,bl - add r9d,r12d - mov DWORD[12+rsi],edx - add cl,al - rol r9d,23 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[4+r15] - add al,dl - mov ebx,DWORD[20+rsi] - add r8d,2763975236 - movzx eax,al - add r8d,r12d - mov DWORD[16+rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[16+r15] - add bl,dl - mov eax,DWORD[24+rsi] - add r11d,1272893353 - movzx ebx,bl - add r11d,r12d - mov DWORD[20+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[28+r15] - add al,dl - mov ebx,DWORD[28+rsi] - add r10d,4139469664 - movzx eax,al - add r10d,r12d - mov DWORD[24+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[40+r15] - add bl,dl - mov eax,DWORD[32+rsi] - add r9d,3200236656 - movzx ebx,bl - add r9d,r12d - mov DWORD[28+rsi],edx - add cl,al - rol r9d,23 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[52+r15] - add al,dl - mov ebx,DWORD[36+rsi] - add r8d,681279174 - movzx eax,al - add r8d,r12d - mov DWORD[32+rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[r15] - add bl,dl - mov eax,DWORD[40+rsi] - add r11d,3936430074 - movzx ebx,bl - add r11d,r12d - mov DWORD[36+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[12+r15] - add al,dl - mov ebx,DWORD[44+rsi] - add r10d,3572445317 - movzx eax,al - add r10d,r12d - mov DWORD[40+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[24+r15] - add bl,dl - mov eax,DWORD[48+rsi] - add r9d,76029189 - movzx ebx,bl - add r9d,r12d - mov DWORD[44+rsi],edx - add cl,al - rol r9d,23 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[36+r15] - add al,dl - mov ebx,DWORD[52+rsi] - add r8d,3654602809 - movzx eax,al - add r8d,r12d - mov DWORD[48+rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[48+r15] - add bl,dl - mov eax,DWORD[56+rsi] - add r11d,3873151461 - movzx ebx,bl - add r11d,r12d - mov DWORD[52+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[60+r15] - add al,dl - mov ebx,DWORD[60+rsi] - add r10d,530742520 - movzx eax,al - add r10d,r12d - mov DWORD[56+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm4,XMMWORD[32+r13] - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[8+r15] - add bl,dl - mov eax,DWORD[64+rsi] - add r9d,3299628645 - movzx ebx,bl - add r9d,r12d - mov DWORD[60+rsi],edx - add cl,al - rol r9d,23 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - psllq xmm1,8 - pxor xmm4,xmm0 - pxor xmm4,xmm1 - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[r15] - add al,dl - mov ebx,DWORD[68+rsi] - add r8d,4096336452 - movzx eax,al - xor r12d,r10d - mov DWORD[64+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[28+r15] - add bl,dl - mov eax,DWORD[72+rsi] - add r11d,1126891415 - movzx ebx,bl - xor r12d,r9d - mov DWORD[68+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[56+r15] - add al,dl - mov ebx,DWORD[76+rsi] - add r10d,2878612391 - movzx eax,al - xor r12d,r8d - mov DWORD[72+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[20+r15] - add bl,dl - mov eax,DWORD[80+rsi] - add r9d,4237533241 - movzx ebx,bl - xor r12d,r11d - mov DWORD[76+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[48+r15] - add al,dl - mov ebx,DWORD[84+rsi] - add r8d,1700485571 - movzx eax,al - xor r12d,r10d - mov DWORD[80+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[12+r15] - add bl,dl - mov eax,DWORD[88+rsi] - add r11d,2399980690 - movzx ebx,bl - xor r12d,r9d - mov DWORD[84+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[40+r15] - add al,dl - mov ebx,DWORD[92+rsi] - add r10d,4293915773 - movzx eax,al - xor r12d,r8d - mov DWORD[88+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[4+r15] - add bl,dl - mov eax,DWORD[96+rsi] - add r9d,2240044497 - movzx ebx,bl - xor r12d,r11d - mov DWORD[92+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[32+r15] - add al,dl - mov ebx,DWORD[100+rsi] - add r8d,1873313359 - movzx eax,al - xor r12d,r10d - mov DWORD[96+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[60+r15] - add bl,dl - mov eax,DWORD[104+rsi] - add r11d,4264355552 - movzx ebx,bl - xor r12d,r9d - mov DWORD[100+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[24+r15] - add al,dl - mov ebx,DWORD[108+rsi] - add r10d,2734768916 - movzx eax,al - xor r12d,r8d - mov DWORD[104+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[52+r15] - add bl,dl - mov eax,DWORD[112+rsi] - add r9d,1309151649 - movzx ebx,bl - xor r12d,r11d - mov DWORD[108+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[16+r15] - add al,dl - mov ebx,DWORD[116+rsi] - add r8d,4149444226 - movzx eax,al - xor r12d,r10d - mov DWORD[112+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[44+r15] - add bl,dl - mov eax,DWORD[120+rsi] - add r11d,3174756917 - movzx ebx,bl - xor r12d,r9d - mov DWORD[116+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[8+r15] - add al,dl - mov ebx,DWORD[124+rsi] - add r10d,718787259 - movzx eax,al - xor r12d,r8d - mov DWORD[120+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm5,XMMWORD[48+r13] - add bpl,32 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[36+r15] - add bl,dl - mov eax,DWORD[rbp*4+rdi] - add r9d,3951481745 - movzx ebx,bl - xor r12d,r11d - mov DWORD[124+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - mov rsi,rbp - xor rbp,rbp - mov bpl,sil - mov rsi,rcx - xor rcx,rcx - mov cl,sil - lea rsi,[rbp*4+rdi] - psllq xmm1,8 - pxor xmm5,xmm0 - pxor xmm5,xmm1 - add r8d,DWORD[rsp] - add r9d,DWORD[4+rsp] - add r10d,DWORD[8+rsp] - add r11d,DWORD[12+rsp] - - movdqu XMMWORD[r13*1+r14],xmm2 - movdqu XMMWORD[16+r13*1+r14],xmm3 - movdqu XMMWORD[32+r13*1+r14],xmm4 - movdqu XMMWORD[48+r13*1+r14],xmm5 - lea r15,[64+r15] - lea r13,[64+r13] - cmp r15,QWORD[16+rsp] - jb NEAR $L$oop - - mov r12,QWORD[24+rsp] - sub cl,al - mov DWORD[r12],r8d - mov DWORD[4+r12],r9d - mov DWORD[8+r12],r10d - mov DWORD[12+r12],r11d - sub bpl,1 - mov DWORD[((-8))+rdi],ebp - mov DWORD[((-4))+rdi],ecx - - mov r15,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r13,QWORD[56+rsp] - mov r12,QWORD[64+rsp] - mov rbp,QWORD[72+rsp] - mov rbx,QWORD[80+rsp] - lea rsp,[88+rsp] -$L$epilogue: -$L$abort: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rc4_md5_enc: -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - lea r10,[$L$body] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - lea r10,[$L$epilogue] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov r15,QWORD[40+rax] - mov r14,QWORD[48+rax] - mov r13,QWORD[56+rax] - mov r12,QWORD[64+rax] - mov rbp,QWORD[72+rax] - mov rbx,QWORD[80+rax] - lea rax,[88+rax] - - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$in_prologue: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_rc4_md5_enc wrt ..imagebase - DD $L$SEH_end_rc4_md5_enc wrt ..imagebase - DD $L$SEH_info_rc4_md5_enc wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_rc4_md5_enc: -DB 9,0,0,0 - DD se_handler wrt ..imagebase diff --git a/win-x86_64/crypto/sha/sha1-x86_64.asm b/win-x86_64/crypto/sha/sha1-x86_64.asm index 0f5361a..168f78d 100644 --- a/win-x86_64/crypto/sha/sha1-x86_64.asm +++ b/win-x86_64/crypto/sha/sha1-x86_64.asm @@ -24,6 +24,11 @@ $L$SEH_begin_sha1_block_data_order: mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] test r8d,512 jz NEAR $L$ialu + and r8d,268435456 + and r9d,1073741824 + or r8d,r9d + cmp r8d,1342177280 + je NEAR _avx_shortcut jmp NEAR _ssse3_shortcut ALIGN 16 @@ -2445,6 +2450,1146 @@ $L$epilogue_ssse3: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_ssse3: + +ALIGN 16 +sha1_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +_avx_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + vzeroupper + vmovaps XMMWORD[(-40-96)+rax],xmm6 + vmovaps XMMWORD[(-40-80)+rax],xmm7 + vmovaps XMMWORD[(-40-64)+rax],xmm8 + vmovaps XMMWORD[(-40-48)+rax],xmm9 + vmovaps XMMWORD[(-40-32)+rax],xmm10 + vmovaps XMMWORD[(-40-16)+rax],xmm11 +$L$prologue_avx: + mov r14,rax + and rsp,-64 + mov r8,rdi + mov r9,rsi + mov r10,rdx + + shl r10,6 + add r10,r9 + lea r11,[((K_XX_XX+64))] + + mov eax,DWORD[r8] + mov ebx,DWORD[4+r8] + mov ecx,DWORD[8+r8] + mov edx,DWORD[12+r8] + mov esi,ebx + mov ebp,DWORD[16+r8] + mov edi,ecx + xor edi,edx + and esi,edi + + vmovdqa xmm6,XMMWORD[64+r11] + vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm11 + vpaddd xmm5,xmm1,xmm11 + vpaddd xmm6,xmm2,xmm11 + vmovdqa XMMWORD[rsp],xmm4 + vmovdqa XMMWORD[16+rsp],xmm5 + vmovdqa XMMWORD[32+rsp],xmm6 + jmp NEAR $L$oop_avx +ALIGN 16 +$L$oop_avx: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov edi,eax + add ebp,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm8,xmm3,4 + add ebp,esi + and edi,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add ebp,eax + vpxor xmm8,xmm8,xmm2 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[4+rsp] + vpxor xmm4,xmm4,xmm8 + xor eax,ebx + shld ebp,ebp,5 + vmovdqa XMMWORD[48+rsp],xmm9 + add edx,edi + and esi,eax + vpsrld xmm8,xmm4,31 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpslldq xmm10,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov edi,edx + add ecx,DWORD[8+rsp] + xor ebp,eax + shld edx,edx,5 + vpsrld xmm9,xmm10,30 + vpor xmm4,xmm4,xmm8 + add ecx,esi + and edi,ebp + xor ebp,eax + add ecx,edx + vpslld xmm10,xmm10,2 + vpxor xmm4,xmm4,xmm9 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[12+rsp] + vpxor xmm4,xmm4,xmm10 + xor edx,ebp + shld ecx,ecx,5 + add ebx,edi + and esi,edx + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpalignr xmm5,xmm2,xmm1,8 + mov edi,ebx + add eax,DWORD[16+rsp] + vpaddd xmm9,xmm11,xmm4 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm8,xmm4,4 + add eax,esi + and edi,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm8,xmm8,xmm3 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[20+rsp] + vpxor xmm5,xmm5,xmm8 + xor ebx,ecx + shld eax,eax,5 + vmovdqa XMMWORD[rsp],xmm9 + add ebp,edi + and esi,ebx + vpsrld xmm8,xmm5,31 + xor ebx,ecx + add ebp,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm10,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov edi,ebp + add edx,DWORD[24+rsp] + xor eax,ebx + shld ebp,ebp,5 + vpsrld xmm9,xmm10,30 + vpor xmm5,xmm5,xmm8 + add edx,esi + and edi,eax + xor eax,ebx + add edx,ebp + vpslld xmm10,xmm10,2 + vpxor xmm5,xmm5,xmm9 + shrd ebp,ebp,7 + xor edi,ebx + mov esi,edx + add ecx,DWORD[28+rsp] + vpxor xmm5,xmm5,xmm10 + xor ebp,eax + shld edx,edx,5 + vmovdqa xmm11,XMMWORD[((-32))+r11] + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov edi,ecx + add ebx,DWORD[32+rsp] + vpaddd xmm9,xmm11,xmm5 + xor edx,ebp + shld ecx,ecx,5 + vpsrldq xmm8,xmm5,4 + add ebx,esi + and edi,edx + vpxor xmm6,xmm6,xmm2 + xor edx,ebp + add ebx,ecx + vpxor xmm8,xmm8,xmm4 + shrd ecx,ecx,7 + xor edi,ebp + mov esi,ebx + add eax,DWORD[36+rsp] + vpxor xmm6,xmm6,xmm8 + xor ecx,edx + shld ebx,ebx,5 + vmovdqa XMMWORD[16+rsp],xmm9 + add eax,edi + and esi,ecx + vpsrld xmm8,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm10,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov edi,eax + add ebp,DWORD[40+rsp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm9,xmm10,30 + vpor xmm6,xmm6,xmm8 + add ebp,esi + and edi,ebx + xor ebx,ecx + add ebp,eax + vpslld xmm10,xmm10,2 + vpxor xmm6,xmm6,xmm9 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[44+rsp] + vpxor xmm6,xmm6,xmm10 + xor eax,ebx + shld ebp,ebp,5 + add edx,edi + and esi,eax + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov edi,edx + add ecx,DWORD[48+rsp] + vpaddd xmm9,xmm11,xmm6 + xor ebp,eax + shld edx,edx,5 + vpsrldq xmm8,xmm6,4 + add ecx,esi + and edi,ebp + vpxor xmm7,xmm7,xmm3 + xor ebp,eax + add ecx,edx + vpxor xmm8,xmm8,xmm5 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[52+rsp] + vpxor xmm7,xmm7,xmm8 + xor edx,ebp + shld ecx,ecx,5 + vmovdqa XMMWORD[32+rsp],xmm9 + add ebx,edi + and esi,edx + vpsrld xmm8,xmm7,31 + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpslldq xmm10,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov edi,ebx + add eax,DWORD[56+rsp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm9,xmm10,30 + vpor xmm7,xmm7,xmm8 + add eax,esi + and edi,ecx + xor ecx,edx + add eax,ebx + vpslld xmm10,xmm10,2 + vpxor xmm7,xmm7,xmm9 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[60+rsp] + vpxor xmm7,xmm7,xmm10 + xor ebx,ecx + shld eax,eax,5 + add ebp,edi + and esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov edi,ebp + add edx,DWORD[rsp] + vpxor xmm0,xmm0,xmm1 + xor eax,ebx + shld ebp,ebp,5 + vpaddd xmm9,xmm11,xmm7 + add edx,esi + and edi,eax + vpxor xmm0,xmm0,xmm8 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor edi,ebx + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + mov esi,edx + add ecx,DWORD[4+rsp] + xor ebp,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov edi,ecx + add ebx,DWORD[8+rsp] + vpor xmm0,xmm0,xmm8 + xor edx,ebp + shld ecx,ecx,5 + add ebx,esi + and edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[12+rsp] + xor edi,ebp + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ebp,DWORD[16+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm0 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm1,xmm1,xmm8 + add edx,DWORD[20+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm1,xmm1,2 + add ecx,DWORD[24+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm1,xmm1,xmm8 + add ebx,DWORD[28+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD[32+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + add eax,esi + xor edi,edx + vpaddd xmm9,xmm11,xmm1 + vmovdqa xmm11,XMMWORD[r11] + shrd ecx,ecx,7 + add eax,ebx + vpxor xmm2,xmm2,xmm8 + add ebp,DWORD[36+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpslld xmm2,xmm2,2 + add edx,DWORD[40+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vpor xmm2,xmm2,xmm8 + add ecx,DWORD[44+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD[48+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + add ebx,esi + xor edi,ebp + vpaddd xmm9,xmm11,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm8 + add eax,DWORD[52+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add ebp,DWORD[56+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpor xmm3,xmm3,xmm8 + add edx,DWORD[60+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpalignr xmm8,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD[rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + add ecx,esi + xor edi,eax + vpaddd xmm9,xmm11,xmm3 + shrd ebp,ebp,7 + add ecx,edx + vpxor xmm4,xmm4,xmm8 + add ebx,DWORD[4+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm8,xmm4,30 + vmovdqa XMMWORD[48+rsp],xmm9 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD[8+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm8 + add ebp,DWORD[12+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpalignr xmm8,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD[16+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + vpxor xmm5,xmm5,xmm6 + add edx,esi + xor edi,ebx + vpaddd xmm9,xmm11,xmm4 + shrd eax,eax,7 + add edx,ebp + vpxor xmm5,xmm5,xmm8 + add ecx,DWORD[20+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm8,xmm5,30 + vmovdqa XMMWORD[rsp],xmm9 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD[24+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm8 + add eax,DWORD[28+rsp] + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm8,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add ebp,DWORD[32+rsp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + mov edi,eax + xor esi,ecx + vpaddd xmm9,xmm11,xmm5 + shld eax,eax,5 + add ebp,esi + vpxor xmm6,xmm6,xmm8 + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[36+rsp] + vpsrld xmm8,xmm6,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + vpslld xmm6,xmm6,2 + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[40+rsp] + and esi,eax + vpor xmm6,xmm6,xmm8 + xor eax,ebx + shrd ebp,ebp,7 + mov edi,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[44+rsp] + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + vpalignr xmm8,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD[48+rsp] + and esi,edx + xor edx,ebp + shrd ecx,ecx,7 + vpxor xmm7,xmm7,xmm0 + mov edi,ebx + xor esi,edx + vpaddd xmm9,xmm11,xmm6 + vmovdqa xmm11,XMMWORD[32+r11] + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm8 + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[52+rsp] + vpsrld xmm8,xmm7,30 + vmovdqa XMMWORD[32+rsp],xmm9 + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[56+rsp] + and esi,ebx + vpor xmm7,xmm7,xmm8 + xor ebx,ecx + shrd eax,eax,7 + mov edi,ebp + xor esi,ebx + shld ebp,ebp,5 + add edx,esi + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[60+rsp] + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD[rsp] + and esi,ebp + xor ebp,eax + shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + mov edi,ecx + xor esi,ebp + vpaddd xmm9,xmm11,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm8 + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[4+rsp] + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[8+rsp] + and esi,ecx + vpor xmm0,xmm0,xmm8 + xor ecx,edx + shrd ebx,ebx,7 + mov edi,eax + xor esi,ecx + shld eax,eax,5 + add ebp,esi + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[12+rsp] + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD[16+rsp] + and esi,eax + xor eax,ebx + shrd ebp,ebp,7 + vpxor xmm1,xmm1,xmm2 + mov edi,edx + xor esi,eax + vpaddd xmm9,xmm11,xmm0 + shld edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm8 + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[20+rsp] + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[24+rsp] + and esi,edx + vpor xmm1,xmm1,xmm8 + xor edx,ebp + shrd ecx,ecx,7 + mov edi,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[28+rsp] + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD[32+rsp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + mov edi,ebp + xor esi,ebx + vpaddd xmm9,xmm11,xmm1 + shld ebp,ebp,5 + add edx,esi + vpxor xmm2,xmm2,xmm8 + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[36+rsp] + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[40+rsp] + and esi,ebp + vpor xmm2,xmm2,xmm8 + xor ebp,eax + shrd edx,edx,7 + mov edi,ecx + xor esi,ebp + shld ecx,ecx,5 + add ebx,esi + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[44+rsp] + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + add eax,ebx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebp,DWORD[48+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm2 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm3,xmm3,xmm8 + add edx,DWORD[52+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm3,xmm3,2 + add ecx,DWORD[56+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm3,xmm3,xmm8 + add ebx,DWORD[60+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa XMMWORD[48+rsp],xmm9 + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[4+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[8+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[12+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + cmp r9,r10 + je NEAR $L$done_avx + vmovdqa xmm6,XMMWORD[64+r11] + vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + add ebx,DWORD[16+rsp] + xor esi,ebp + vpshufb xmm1,xmm1,xmm6 + mov edi,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm11 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vmovdqa XMMWORD[rsp],xmm4 + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + vpshufb xmm2,xmm2,xmm6 + mov edi,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm11 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vmovdqa XMMWORD[16+rsp],xmm5 + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov edi,ebp + shld ebp,ebp,5 + vpaddd xmm6,xmm2,xmm11 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vmovdqa XMMWORD[32+rsp],xmm6 + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + add edx,DWORD[12+r8] + mov DWORD[r8],eax + add ebp,DWORD[16+r8] + mov DWORD[4+r8],esi + mov ebx,esi + mov DWORD[8+r8],ecx + mov edi,ecx + mov DWORD[12+r8],edx + xor edi,edx + mov DWORD[16+r8],ebp + and esi,edi + jmp NEAR $L$oop_avx + +ALIGN 16 +$L$done_avx: + add ebx,DWORD[16+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + vzeroupper + + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + mov DWORD[r8],eax + add edx,DWORD[12+r8] + mov DWORD[4+r8],esi + add ebp,DWORD[16+r8] + mov DWORD[8+r8],ecx + mov DWORD[12+r8],edx + mov DWORD[16+r8],ebp + movaps xmm6,XMMWORD[((-40-96))+r14] + movaps xmm7,XMMWORD[((-40-80))+r14] + movaps xmm8,XMMWORD[((-40-64))+r14] + movaps xmm9,XMMWORD[((-40-48))+r14] + movaps xmm10,XMMWORD[((-40-32))+r14] + movaps xmm11,XMMWORD[((-40-16))+r14] + lea rsi,[r14] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha1_block_data_order_avx: ALIGN 64 K_XX_XX: DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2605,6 +3750,9 @@ ALIGN 4 DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha1_block_data_order: @@ -2614,3 +3762,7 @@ $L$SEH_info_sha1_block_data_order_ssse3: DB 9,0,0,0 DD ssse3_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha1_block_data_order_avx: +DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/win-x86_64/crypto/sha/sha256-x86_64.asm b/win-x86_64/crypto/sha/sha256-x86_64.asm index e6193c5..efaf9b5 100644 --- a/win-x86_64/crypto/sha/sha256-x86_64.asm +++ b/win-x86_64/crypto/sha/sha256-x86_64.asm @@ -23,6 +23,11 @@ $L$SEH_begin_sha256_block_data_order: mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] mov r11d,DWORD[8+r11] + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut push rbx @@ -2877,6 +2882,1082 @@ $L$epilogue_ssse3: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_sha256_block_data_order_ssse3: + +ALIGN 64 +sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$avx_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[((64+24))+rsp],r11 + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[((64+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha256_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -2982,6 +4063,9 @@ ALIGN 4 DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_block_data_order: @@ -2992,3 +4076,7 @@ $L$SEH_info_sha256_block_data_order_ssse3: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/win-x86_64/crypto/sha/sha512-x86_64.asm b/win-x86_64/crypto/sha/sha512-x86_64.asm index b76cc0e..71449cd 100644 --- a/win-x86_64/crypto/sha/sha512-x86_64.asm +++ b/win-x86_64/crypto/sha/sha512-x86_64.asm @@ -19,6 +19,17 @@ $L$SEH_begin_sha512_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] + mov r9d,DWORD[r11] + mov r10d,DWORD[4+r11] + mov r11d,DWORD[8+r11] + test r10d,2048 + jnz NEAR $L$xop_shortcut + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut push rbx push rbp push r12 @@ -1801,6 +1812,2282 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 + +ALIGN 64 +sha512_block_data_order_xop: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_xop: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$xop_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[((128+24))+rsp],r11 + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_xop: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_xop +ALIGN 16 +$L$loop_xop: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$xop_00_47 + +ALIGN 16 +$L$xop_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + ror r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r8 + xor r12,r10 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rax + vpaddq xmm0,xmm0,xmm11 + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax +DB 143,72,120,195,209,7 + xor r12,r10 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,223,3 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + ror r14,28 + vpsrlq xmm10,xmm7,6 + add rdx,r11 + add r11,rdi + vpaddq xmm0,xmm0,xmm8 + mov r13,rdx + add r14,r11 +DB 143,72,120,195,203,42 + ror r13,23 + mov r11,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + vpaddq xmm0,xmm0,xmm11 + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + ror r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rcx + xor r12,r8 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r10 + vpaddq xmm1,xmm1,xmm11 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 +DB 143,72,120,195,209,7 + xor r12,r8 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,216,3 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + ror r14,28 + vpsrlq xmm10,xmm0,6 + add rbx,r9 + add r9,rdi + vpaddq xmm1,xmm1,xmm8 + mov r13,rbx + add r14,r9 +DB 143,72,120,195,203,42 + ror r13,23 + mov r9,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + vpaddq xmm1,xmm1,xmm11 + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + ror r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rax + xor r12,rcx + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r8 + vpaddq xmm2,xmm2,xmm11 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 +DB 143,72,120,195,209,7 + xor r12,rcx + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,217,3 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + ror r14,28 + vpsrlq xmm10,xmm1,6 + add r11,rdx + add rdx,rdi + vpaddq xmm2,xmm2,xmm8 + mov r13,r11 + add r14,rdx +DB 143,72,120,195,203,42 + ror r13,23 + mov rdx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + vpaddq xmm2,xmm2,xmm11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + ror r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r10 + xor r12,rax + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rcx + vpaddq xmm3,xmm3,xmm11 + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx +DB 143,72,120,195,209,7 + xor r12,rax + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,218,3 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + ror r14,28 + vpsrlq xmm10,xmm2,6 + add r9,rbx + add rbx,rdi + vpaddq xmm3,xmm3,xmm8 + mov r13,r9 + add r14,rbx +DB 143,72,120,195,203,42 + ror r13,23 + mov rbx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + vpaddq xmm3,xmm3,xmm11 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + ror r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r8 + xor r12,r10 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rax + vpaddq xmm4,xmm4,xmm11 + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax +DB 143,72,120,195,209,7 + xor r12,r10 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,219,3 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + ror r14,28 + vpsrlq xmm10,xmm3,6 + add rdx,r11 + add r11,rdi + vpaddq xmm4,xmm4,xmm8 + mov r13,rdx + add r14,r11 +DB 143,72,120,195,203,42 + ror r13,23 + mov r11,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + vpaddq xmm4,xmm4,xmm11 + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + ror r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rcx + xor r12,r8 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r10 + vpaddq xmm5,xmm5,xmm11 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 +DB 143,72,120,195,209,7 + xor r12,r8 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,220,3 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + ror r14,28 + vpsrlq xmm10,xmm4,6 + add rbx,r9 + add r9,rdi + vpaddq xmm5,xmm5,xmm8 + mov r13,rbx + add r14,r9 +DB 143,72,120,195,203,42 + ror r13,23 + mov r9,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + vpaddq xmm5,xmm5,xmm11 + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + ror r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rax + xor r12,rcx + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r8 + vpaddq xmm6,xmm6,xmm11 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 +DB 143,72,120,195,209,7 + xor r12,rcx + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,221,3 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + ror r14,28 + vpsrlq xmm10,xmm5,6 + add r11,rdx + add rdx,rdi + vpaddq xmm6,xmm6,xmm8 + mov r13,r11 + add r14,rdx +DB 143,72,120,195,203,42 + ror r13,23 + mov rdx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + vpaddq xmm6,xmm6,xmm11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + ror r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r10 + xor r12,rax + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rcx + vpaddq xmm7,xmm7,xmm11 + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx +DB 143,72,120,195,209,7 + xor r12,rax + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,222,3 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + ror r14,28 + vpsrlq xmm10,xmm6,6 + add r9,rbx + add rbx,rdi + vpaddq xmm7,xmm7,xmm8 + mov r13,r9 + add r14,rbx +DB 143,72,120,195,203,42 + ror r13,23 + mov rbx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + vpaddq xmm7,xmm7,xmm11 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$xop_00_47 + ror r13,23 + mov rax,r14 + mov r12,r9 + ror r14,5 + xor r13,r8 + xor r12,r10 + ror r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + ror r14,6 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + ror r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + ror r13,23 + mov r11,r14 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + ror r13,23 + mov r10,r14 + mov r12,rdx + ror r14,5 + xor r13,rcx + xor r12,r8 + ror r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + ror r14,6 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + ror r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + ror r13,23 + mov r9,r14 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + ror r13,23 + mov r8,r14 + mov r12,rbx + ror r14,5 + xor r13,rax + xor r12,rcx + ror r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + ror r14,6 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + ror r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + ror r13,23 + mov rdx,r14 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + ror r13,23 + mov rcx,r14 + mov r12,r11 + ror r14,5 + xor r13,r10 + xor r12,rax + ror r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + ror r14,6 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + ror r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + ror r13,23 + mov rbx,r14 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + ror r13,23 + mov rax,r14 + mov r12,r9 + ror r14,5 + xor r13,r8 + xor r12,r10 + ror r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + ror r14,6 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + ror r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + ror r13,23 + mov r11,r14 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + ror r13,23 + mov r10,r14 + mov r12,rdx + ror r14,5 + xor r13,rcx + xor r12,r8 + ror r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + ror r14,6 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + ror r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + ror r13,23 + mov r9,r14 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + ror r13,23 + mov r8,r14 + mov r12,rbx + ror r14,5 + xor r13,rax + xor r12,rcx + ror r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + ror r14,6 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + ror r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + ror r13,23 + mov rdx,r14 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + ror r13,23 + mov rcx,r14 + mov r12,r11 + ror r14,5 + xor r13,r10 + xor r12,rax + ror r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + ror r14,6 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + ror r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + ror r13,23 + mov rbx,r14 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_xop + + mov rsi,QWORD[((128+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_xop: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha512_block_data_order_xop: + +ALIGN 64 +sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$avx_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[((128+24))+rsp],r11 + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm1,xmm1,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm2,xmm2,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[32+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm1,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm1,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm2,xmm2,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm1,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[40+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm2,xmm2,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm3,xmm3,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[48+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm2,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm2,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm3,xmm3,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm2,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[56+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm3,xmm3,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm4,xmm4,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[64+rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm3,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm3,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm4,xmm4,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm3,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[72+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm4,xmm4,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm5,xmm5,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[80+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm4,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm4,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm5,xmm5,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm4,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[88+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm5,xmm5,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm6,xmm6,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[96+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm5,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm5,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm6,xmm6,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm5,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[104+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm6,xmm6,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm7,xmm7,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[112+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm6,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm6,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm7,xmm7,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm6,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[120+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm7,xmm7,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_avx + + mov rsi,QWORD[((128+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha512_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -1903,9 +4190,23 @@ ALIGN 4 DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase DD $L$SEH_end_sha512_block_data_order wrt ..imagebase DD $L$SEH_info_sha512_block_data_order wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha512_block_data_order: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha512_block_data_order_xop: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase +$L$SEH_info_sha512_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase |