Diffstat (limited to 'linux-x86_64')
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont5.S        9
-rw-r--r--  linux-x86_64/crypto/ec/p256-x86_64-asm.S   241
-rw-r--r--  linux-x86_64/crypto/rc4/rc4-md5-x86_64.S  1262
-rw-r--r--  linux-x86_64/crypto/sha/sha1-x86_64.S     1121
-rw-r--r--  linux-x86_64/crypto/sha/sha256-x86_64.S   1062
-rw-r--r--  linux-x86_64/crypto/sha/sha512-x86_64.S   2241
6 files changed, 4433 insertions(+), 1503 deletions(-)
diff --git a/linux-x86_64/crypto/bn/x86_64-mont5.S b/linux-x86_64/crypto/bn/x86_64-mont5.S
index 02edc69..214064e 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont5.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont5.S
@@ -1561,6 +1561,15 @@ sqr8x_reduction:
.align 32
.L8x_tail_done:
addq (%rdx),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+
+
xorq %rax,%rax
negq %rsi
diff --git a/linux-x86_64/crypto/ec/p256-x86_64-asm.S b/linux-x86_64/crypto/ec/p256-x86_64-asm.S
index db00544..2884c69 100644
--- a/linux-x86_64/crypto/ec/p256-x86_64-asm.S
+++ b/linux-x86_64/crypto/ec/p256-x86_64-asm.S
@@ -8,10 +8,6 @@
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
-
-.LRR:
-.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
-
.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
@@ -21,8 +17,6 @@
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
-.globl ecp_nistz256_mul_by_2
-.hidden ecp_nistz256_mul_by_2
.type ecp_nistz256_mul_by_2,@function
.align 64
ecp_nistz256_mul_by_2:
@@ -66,228 +60,6 @@ ecp_nistz256_mul_by_2:
-.globl ecp_nistz256_div_by_2
-.hidden ecp_nistz256_div_by_2
-.type ecp_nistz256_div_by_2,@function
-.align 32
-ecp_nistz256_div_by_2:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq %r8,%rax
- movq 24(%rsi),%r11
- leaq .Lpoly(%rip),%rsi
-
- movq %r9,%rdx
- xorq %r13,%r13
- addq 0(%rsi),%r8
- movq %r10,%rcx
- adcq 8(%rsi),%r9
- adcq 16(%rsi),%r10
- movq %r11,%r12
- adcq 24(%rsi),%r11
- adcq $0,%r13
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- cmovzq %rcx,%r10
- cmovzq %r12,%r11
- cmovzq %rsi,%r13
-
- movq %r9,%rax
- shrq $1,%r8
- shlq $63,%rax
- movq %r10,%rdx
- shrq $1,%r9
- orq %rax,%r8
- shlq $63,%rdx
- movq %r11,%rcx
- shrq $1,%r10
- orq %rdx,%r9
- shlq $63,%rcx
- shrq $1,%r11
- shlq $63,%r13
- orq %rcx,%r10
- orq %r13,%r11
-
- movq %r8,0(%rdi)
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
-
-
-
-.globl ecp_nistz256_mul_by_3
-.hidden ecp_nistz256_mul_by_3
-.type ecp_nistz256_mul_by_3,@function
-.align 32
-ecp_nistz256_mul_by_3:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- xorq %r13,%r13
- movq 8(%rsi),%r9
- addq %r8,%r8
- movq 16(%rsi),%r10
- adcq %r9,%r9
- movq 24(%rsi),%r11
- movq %r8,%rax
- adcq %r10,%r10
- adcq %r11,%r11
- movq %r9,%rdx
- adcq $0,%r13
-
- subq $-1,%r8
- movq %r10,%rcx
- sbbq .Lpoly+8(%rip),%r9
- sbbq $0,%r10
- movq %r11,%r12
- sbbq .Lpoly+24(%rip),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- cmovzq %rcx,%r10
- cmovzq %r12,%r11
-
- xorq %r13,%r13
- addq 0(%rsi),%r8
- adcq 8(%rsi),%r9
- movq %r8,%rax
- adcq 16(%rsi),%r10
- adcq 24(%rsi),%r11
- movq %r9,%rdx
- adcq $0,%r13
-
- subq $-1,%r8
- movq %r10,%rcx
- sbbq .Lpoly+8(%rip),%r9
- sbbq $0,%r10
- movq %r11,%r12
- sbbq .Lpoly+24(%rip),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
-
-
-
-.globl ecp_nistz256_add
-.hidden ecp_nistz256_add
-.type ecp_nistz256_add,@function
-.align 32
-ecp_nistz256_add:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- xorq %r13,%r13
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- leaq .Lpoly(%rip),%rsi
-
- addq 0(%rdx),%r8
- adcq 8(%rdx),%r9
- movq %r8,%rax
- adcq 16(%rdx),%r10
- adcq 24(%rdx),%r11
- movq %r9,%rdx
- adcq $0,%r13
-
- subq 0(%rsi),%r8
- movq %r10,%rcx
- sbbq 8(%rsi),%r9
- sbbq 16(%rsi),%r10
- movq %r11,%r12
- sbbq 24(%rsi),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-.size ecp_nistz256_add,.-ecp_nistz256_add
-
-
-
-.globl ecp_nistz256_sub
-.hidden ecp_nistz256_sub
-.type ecp_nistz256_sub,@function
-.align 32
-ecp_nistz256_sub:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- xorq %r13,%r13
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- leaq .Lpoly(%rip),%rsi
-
- subq 0(%rdx),%r8
- sbbq 8(%rdx),%r9
- movq %r8,%rax
- sbbq 16(%rdx),%r10
- sbbq 24(%rdx),%r11
- movq %r9,%rdx
- sbbq $0,%r13
-
- addq 0(%rsi),%r8
- movq %r10,%rcx
- adcq 8(%rsi),%r9
- adcq 16(%rsi),%r10
- movq %r11,%r12
- adcq 24(%rsi),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-.size ecp_nistz256_sub,.-ecp_nistz256_sub
-
-
-
.globl ecp_nistz256_neg
.hidden ecp_nistz256_neg
.type ecp_nistz256_neg,@function
@@ -336,19 +108,6 @@ ecp_nistz256_neg:
-.globl ecp_nistz256_to_mont
-.hidden ecp_nistz256_to_mont
-.type ecp_nistz256_to_mont,@function
-.align 32
-ecp_nistz256_to_mont:
- leaq .LRR(%rip),%rdx
- jmp .Lmul_mont
-.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
-
-
-
-
-
.globl ecp_nistz256_mul_mont
diff --git a/linux-x86_64/crypto/rc4/rc4-md5-x86_64.S b/linux-x86_64/crypto/rc4/rc4-md5-x86_64.S
deleted file mode 100644
index 06c8d67..0000000
--- a/linux-x86_64/crypto/rc4/rc4-md5-x86_64.S
+++ /dev/null
@@ -1,1262 +0,0 @@
-#if defined(__x86_64__)
-.text
-.align 16
-
-.globl rc4_md5_enc
-.hidden rc4_md5_enc
-.type rc4_md5_enc,@function
-rc4_md5_enc:
- cmpq $0,%r9
- je .Labort
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $40,%rsp
-.Lbody:
- movq %rcx,%r11
- movq %r9,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- movq %r8,%r15
- xorq %rbp,%rbp
- xorq %rcx,%rcx
-
- leaq 8(%rdi),%rdi
- movb -8(%rdi),%bpl
- movb -4(%rdi),%cl
-
- incb %bpl
- subq %r13,%r14
- movl (%rdi,%rbp,4),%eax
- addb %al,%cl
- leaq (%rdi,%rbp,4),%rsi
- shlq $6,%r12
- addq %r15,%r12
- movq %r12,16(%rsp)
-
- movq %r11,24(%rsp)
- movl 0(%r11),%r8d
- movl 4(%r11),%r9d
- movl 8(%r11),%r10d
- movl 12(%r11),%r11d
- jmp .Loop
-
-.align 16
-.Loop:
- movl %r8d,0(%rsp)
- movl %r9d,4(%rsp)
- movl %r10d,8(%rsp)
- movl %r11d,%r12d
- movl %r11d,12(%rsp)
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 0(%r15),%r8d
- addb %dl,%al
- movl 4(%rsi),%ebx
- addl $3614090360,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,0(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 4(%r15),%r11d
- addb %dl,%bl
- movl 8(%rsi),%eax
- addl $3905402710,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,4(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 8(%r15),%r10d
- addb %dl,%al
- movl 12(%rsi),%ebx
- addl $606105819,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,8(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 12(%r15),%r9d
- addb %dl,%bl
- movl 16(%rsi),%eax
- addl $3250441966,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,12(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r11d,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 16(%r15),%r8d
- addb %dl,%al
- movl 20(%rsi),%ebx
- addl $4118548399,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,16(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 20(%r15),%r11d
- addb %dl,%bl
- movl 24(%rsi),%eax
- addl $1200080426,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,20(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 24(%r15),%r10d
- addb %dl,%al
- movl 28(%rsi),%ebx
- addl $2821735955,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,24(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 28(%r15),%r9d
- addb %dl,%bl
- movl 32(%rsi),%eax
- addl $4249261313,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,28(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r11d,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 32(%r15),%r8d
- addb %dl,%al
- movl 36(%rsi),%ebx
- addl $1770035416,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,32(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 36(%r15),%r11d
- addb %dl,%bl
- movl 40(%rsi),%eax
- addl $2336552879,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,36(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 40(%r15),%r10d
- addb %dl,%al
- movl 44(%rsi),%ebx
- addl $4294925233,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,40(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 44(%r15),%r9d
- addb %dl,%bl
- movl 48(%rsi),%eax
- addl $2304563134,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,44(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r11d,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 48(%r15),%r8d
- addb %dl,%al
- movl 52(%rsi),%ebx
- addl $1804603682,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,48(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 52(%r15),%r11d
- addb %dl,%bl
- movl 56(%rsi),%eax
- addl $4254626195,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,52(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 56(%r15),%r10d
- addb %dl,%al
- movl 60(%rsi),%ebx
- addl $2792965006,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,56(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu (%r13),%xmm2
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 60(%r15),%r9d
- addb %dl,%bl
- movl 64(%rsi),%eax
- addl $1236535329,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,60(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r10d,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- psllq $8,%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm1,%xmm2
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 4(%r15),%r8d
- addb %dl,%al
- movl 68(%rsi),%ebx
- addl $4129170786,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,64(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 24(%r15),%r11d
- addb %dl,%bl
- movl 72(%rsi),%eax
- addl $3225465664,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,68(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 44(%r15),%r10d
- addb %dl,%al
- movl 76(%rsi),%ebx
- addl $643717713,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,72(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 0(%r15),%r9d
- addb %dl,%bl
- movl 80(%rsi),%eax
- addl $3921069994,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,76(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r10d,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 20(%r15),%r8d
- addb %dl,%al
- movl 84(%rsi),%ebx
- addl $3593408605,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,80(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 40(%r15),%r11d
- addb %dl,%bl
- movl 88(%rsi),%eax
- addl $38016083,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,84(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 60(%r15),%r10d
- addb %dl,%al
- movl 92(%rsi),%ebx
- addl $3634488961,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,88(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 16(%r15),%r9d
- addb %dl,%bl
- movl 96(%rsi),%eax
- addl $3889429448,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,92(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r10d,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 36(%r15),%r8d
- addb %dl,%al
- movl 100(%rsi),%ebx
- addl $568446438,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,96(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 56(%r15),%r11d
- addb %dl,%bl
- movl 104(%rsi),%eax
- addl $3275163606,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,100(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 12(%r15),%r10d
- addb %dl,%al
- movl 108(%rsi),%ebx
- addl $4107603335,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,104(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 32(%r15),%r9d
- addb %dl,%bl
- movl 112(%rsi),%eax
- addl $1163531501,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,108(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r10d,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 52(%r15),%r8d
- addb %dl,%al
- movl 116(%rsi),%ebx
- addl $2850285829,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,112(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 8(%r15),%r11d
- addb %dl,%bl
- movl 120(%rsi),%eax
- addl $4243563512,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,116(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 28(%r15),%r10d
- addb %dl,%al
- movl 124(%rsi),%ebx
- addl $1735328473,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,120(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu 16(%r13),%xmm3
- addb $32,%bpl
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 48(%r15),%r9d
- addb %dl,%bl
- movl 0(%rdi,%rbp,4),%eax
- addl $2368359562,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,124(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r11d,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movq %rcx,%rsi
- xorq %rcx,%rcx
- movb %sil,%cl
- leaq (%rdi,%rbp,4),%rsi
- psllq $8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 20(%r15),%r8d
- addb %dl,%al
- movl 4(%rsi),%ebx
- addl $4294588738,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,0(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 32(%r15),%r11d
- addb %dl,%bl
- movl 8(%rsi),%eax
- addl $2272392833,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,4(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 44(%r15),%r10d
- addb %dl,%al
- movl 12(%rsi),%ebx
- addl $1839030562,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,8(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 56(%r15),%r9d
- addb %dl,%bl
- movl 16(%rsi),%eax
- addl $4259657740,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,12(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl %r11d,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 4(%r15),%r8d
- addb %dl,%al
- movl 20(%rsi),%ebx
- addl $2763975236,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,16(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 16(%r15),%r11d
- addb %dl,%bl
- movl 24(%rsi),%eax
- addl $1272893353,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,20(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 28(%r15),%r10d
- addb %dl,%al
- movl 28(%rsi),%ebx
- addl $4139469664,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,24(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 40(%r15),%r9d
- addb %dl,%bl
- movl 32(%rsi),%eax
- addl $3200236656,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,28(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl %r11d,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 52(%r15),%r8d
- addb %dl,%al
- movl 36(%rsi),%ebx
- addl $681279174,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,32(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 0(%r15),%r11d
- addb %dl,%bl
- movl 40(%rsi),%eax
- addl $3936430074,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,36(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 12(%r15),%r10d
- addb %dl,%al
- movl 44(%rsi),%ebx
- addl $3572445317,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,40(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 24(%r15),%r9d
- addb %dl,%bl
- movl 48(%rsi),%eax
- addl $76029189,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,44(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl %r11d,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 36(%r15),%r8d
- addb %dl,%al
- movl 52(%rsi),%ebx
- addl $3654602809,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,48(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 48(%r15),%r11d
- addb %dl,%bl
- movl 56(%rsi),%eax
- addl $3873151461,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,52(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 60(%r15),%r10d
- addb %dl,%al
- movl 60(%rsi),%ebx
- addl $530742520,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,56(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu 32(%r13),%xmm4
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 8(%r15),%r9d
- addb %dl,%bl
- movl 64(%rsi),%eax
- addl $3299628645,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,60(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl $-1,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- psllq $8,%xmm1
- pxor %xmm0,%xmm4
- pxor %xmm1,%xmm4
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 0(%r15),%r8d
- addb %dl,%al
- movl 68(%rsi),%ebx
- addl $4096336452,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,64(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 28(%r15),%r11d
- addb %dl,%bl
- movl 72(%rsi),%eax
- addl $1126891415,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,68(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 56(%r15),%r10d
- addb %dl,%al
- movl 76(%rsi),%ebx
- addl $2878612391,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,72(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 20(%r15),%r9d
- addb %dl,%bl
- movl 80(%rsi),%eax
- addl $4237533241,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,76(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 48(%r15),%r8d
- addb %dl,%al
- movl 84(%rsi),%ebx
- addl $1700485571,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,80(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 12(%r15),%r11d
- addb %dl,%bl
- movl 88(%rsi),%eax
- addl $2399980690,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,84(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 40(%r15),%r10d
- addb %dl,%al
- movl 92(%rsi),%ebx
- addl $4293915773,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,88(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 4(%r15),%r9d
- addb %dl,%bl
- movl 96(%rsi),%eax
- addl $2240044497,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,92(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 32(%r15),%r8d
- addb %dl,%al
- movl 100(%rsi),%ebx
- addl $1873313359,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,96(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 60(%r15),%r11d
- addb %dl,%bl
- movl 104(%rsi),%eax
- addl $4264355552,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,100(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 24(%r15),%r10d
- addb %dl,%al
- movl 108(%rsi),%ebx
- addl $2734768916,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,104(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 52(%r15),%r9d
- addb %dl,%bl
- movl 112(%rsi),%eax
- addl $1309151649,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,108(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 16(%r15),%r8d
- addb %dl,%al
- movl 116(%rsi),%ebx
- addl $4149444226,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,112(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 44(%r15),%r11d
- addb %dl,%bl
- movl 120(%rsi),%eax
- addl $3174756917,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,116(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 8(%r15),%r10d
- addb %dl,%al
- movl 124(%rsi),%ebx
- addl $718787259,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,120(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu 48(%r13),%xmm5
- addb $32,%bpl
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 36(%r15),%r9d
- addb %dl,%bl
- movl 0(%rdi,%rbp,4),%eax
- addl $3951481745,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,124(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movq %rbp,%rsi
- xorq %rbp,%rbp
- movb %sil,%bpl
- movq %rcx,%rsi
- xorq %rcx,%rcx
- movb %sil,%cl
- leaq (%rdi,%rbp,4),%rsi
- psllq $8,%xmm1
- pxor %xmm0,%xmm5
- pxor %xmm1,%xmm5
- addl 0(%rsp),%r8d
- addl 4(%rsp),%r9d
- addl 8(%rsp),%r10d
- addl 12(%rsp),%r11d
-
- movdqu %xmm2,(%r14,%r13,1)
- movdqu %xmm3,16(%r14,%r13,1)
- movdqu %xmm4,32(%r14,%r13,1)
- movdqu %xmm5,48(%r14,%r13,1)
- leaq 64(%r15),%r15
- leaq 64(%r13),%r13
- cmpq 16(%rsp),%r15
- jb .Loop
-
- movq 24(%rsp),%r12
- subb %al,%cl
- movl %r8d,0(%r12)
- movl %r9d,4(%r12)
- movl %r10d,8(%r12)
- movl %r11d,12(%r12)
- subb $1,%bpl
- movl %ebp,-8(%rdi)
- movl %ecx,-4(%rdi)
-
- movq 40(%rsp),%r15
- movq 48(%rsp),%r14
- movq 56(%rsp),%r13
- movq 64(%rsp),%r12
- movq 72(%rsp),%rbp
- movq 80(%rsp),%rbx
- leaq 88(%rsp),%rsp
-.Lepilogue:
-.Labort:
- .byte 0xf3,0xc3
-.size rc4_md5_enc,.-rc4_md5_enc
-#endif
diff --git a/linux-x86_64/crypto/sha/sha1-x86_64.S b/linux-x86_64/crypto/sha/sha1-x86_64.S
index 7668c2b..d830b53 100644
--- a/linux-x86_64/crypto/sha/sha1-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha1-x86_64.S
@@ -13,6 +13,11 @@ sha1_block_data_order:
movl OPENSSL_ia32cap_P+8(%rip),%r10d
testl $512,%r8d
jz .Lialu
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
jmp _ssse3_shortcut
.align 16
@@ -2408,6 +2413,1122 @@ _ssse3_shortcut:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ movq %rax,%r14
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r11
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa -64(%r11),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r11),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r11),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r11),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_avx
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa -64(%r11),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ leaq (%r14),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/linux-x86_64/crypto/sha/sha256-x86_64.S b/linux-x86_64/crypto/sha/sha256-x86_64.S
index f526de5..445b497 100644
--- a/linux-x86_64/crypto/sha/sha256-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha256-x86_64.S
@@ -12,6 +12,11 @@ sha256_block_data_order:
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
pushq %rbx
@@ -2841,4 +2846,1061 @@ sha256_block_data_order_ssse3:
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.type sha256_block_data_order_avx,@function
+.align 64
+sha256_block_data_order_avx:
+.Lavx_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+.Lprologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
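+/* Rounds 48..63: the W+K values for the last sixteen rounds were stored to
+   0..60(%rsp) on the final pass above, so only the scalar round function
+   remains. */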
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
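+/* Block finished: recover the context pointer saved at 64(%rsp), fold the
+   working variables back into the hash state, advance the input pointer by
+   64 bytes and loop while blocks remain (jb below compares against the end
+   pointer saved at 64+16(%rsp)). */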
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_avx
+
+ movq 64+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
#endif
diff --git a/linux-x86_64/crypto/sha/sha512-x86_64.S b/linux-x86_64/crypto/sha/sha512-x86_64.S
index ca3a3a1..d65743f 100644
--- a/linux-x86_64/crypto/sha/sha512-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha512-x86_64.S
@@ -8,6 +8,17 @@
.type sha512_block_data_order,@function
.align 16
sha512_block_data_order:
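+/* Capability dispatch: bit 11 of the second OPENSSL_ia32cap_P word selects
+   the XOP path; otherwise the AVX path is taken only when what OpenSSL
+   labels the "Intel CPU" bit (1<<30 of word 0) and the AVX (1<<28) and
+   SSSE3 (1<<9) bits of word 1 are all set. */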
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $2048,%r10d
+ jnz .Lxop_shortcut
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -1784,4 +1795,2234 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
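+/* XOP code path for the SHA-512 block transform, entered through
+   .Lxop_shortcut from the dispatch above.  Layout mirrors the scalar code:
+   a..h live in %rax,%rbx,%rcx,%rdx,%r8..%r11, the message schedule in
+   %xmm0-%xmm7, with a 128-byte W+K spill area and the saved
+   ctx/input/end/%rsp in the 64-byte-aligned stack frame. */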
+.type sha512_block_data_order_xop,@function
+.align 64
+sha512_block_data_order_xop:
+.Lxop_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+.Lprologue_xop:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_xop
+.align 16
+.Lloop_xop:
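+/* Per-block setup: load the 128-byte block, byte-swap it with vpshufb (mask
+   at K512+1280), pre-add the first sixteen round constants from K512 and
+   spill the W+K values to 0..112(%rsp). */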
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
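+/* Main loop: each pass runs sixteen rounds interleaved with expansion of the
+   next sixteen schedule words.  The .byte 143,... sequences are
+   hand-assembled XOP vprotq (64-bit vector rotate) instructions, emitted as
+   raw bytes so the file still assembles with toolchains that lack XOP
+   support. */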
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm0,%xmm0
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,223,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm7,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm0,%xmm0
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm1,%xmm1
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,216,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm0,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm1,%xmm1
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm2,%xmm2
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,217,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm1,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm2,%xmm2
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm3,%xmm3
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,218,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm2,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm3,%xmm3
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm4,%xmm4
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,219,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm3,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm4,%xmm4
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm5,%xmm5
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,220,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm4,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm5,%xmm5
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm6,%xmm6
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,221,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm5,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm6,%xmm6
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm7,%xmm7
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,222,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm6,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm7,%xmm7
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lxop_00_47
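+/* Final sixteen rounds: the W+K values are already in 0..120(%rsp), so only
+   the scalar round function runs from here to the end of the block. */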
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_xop
+
+ movq 128+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_xop:
+ .byte 0xf3,0xc3
+.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
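+/* AVX code path for the SHA-512 block transform, entered through
+   .Lavx_shortcut from the dispatch above.  Structurally identical to the XOP
+   path, but without vprotq the sigma rotations are assembled from
+   vpsrlq/vpsllq/vpxor chains. */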
+.type sha512_block_data_order_avx,@function
+.align 64
+sha512_block_data_order_avx:
+.Lavx_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+.Lprologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
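+/* Schedule-expansion loop: as in the XOP path, sixteen rounds per pass are
+   interleaved with expansion of sixteen schedule words, except that the
+   rotate-by-1/8/19/61 operations of sigma0 and sigma1 are built from
+   vpsrlq/vpsllq pairs combined with vpxor. */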
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lavx_00_47
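+/* Final sixteen rounds of the block, using the W+K values already stored at
+   0..120(%rsp). */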
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
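+
+ /* All 80 rounds are done: reload the hash-state pointer from
+  * 128+0(%rsp), add the working variables back into the state words at
+  * 0(%rdi)..56(%rdi), advance the input pointer %rsi by one 128-byte
+  * block, and loop back to .Lloop_avx while %rsi is still below the
+  * end-of-input address kept at 128+16(%rsp). */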
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_avx
+
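+ /* Epilogue: reload the stack pointer saved by the prologue at
+  * 128+24(%rsp), issue vzeroupper to avoid AVX/SSE transition
+  * penalties, restore %rbx, %rbp and %r12-%r15 from that frame along
+  * with the caller's %rsp, and return (0xf3,0xc3 encodes "rep ret"). */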
+ movq 128+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
#endif