diff options
Diffstat (limited to 'win-x86_64/crypto/sha/sha256-x86_64.asm')
-rw-r--r-- | win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 |
1 files changed, 1088 insertions, 0 deletions
diff --git a/win-x86_64/crypto/sha/sha256-x86_64.asm b/win-x86_64/crypto/sha/sha256-x86_64.asm index e6193c5..efaf9b5 100644 --- a/win-x86_64/crypto/sha/sha256-x86_64.asm +++ b/win-x86_64/crypto/sha/sha256-x86_64.asm @@ -23,6 +23,11 @@ $L$SEH_begin_sha256_block_data_order: mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] mov r11d,DWORD[8+r11] + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut push rbx @@ -2877,6 +2882,1082 @@ $L$epilogue_ssse3: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_sha256_block_data_order_ssse3: + +ALIGN 64 +sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$avx_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[((64+24))+rsp],r11 + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[((64+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha256_block_data_order_avx: EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -2982,6 +4063,9 @@ ALIGN 4 DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_block_data_order: @@ -2992,3 +4076,7 @@ $L$SEH_info_sha256_block_data_order_ssse3: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase |