author     Adam Langley <agl@google.com>    2015-01-22 14:27:53 -0800
committer  Adam Langley <agl@google.com>    2015-01-30 16:52:14 -0800
commit     d9e397b599b13d642138480a28c14db7a136bf05 (patch)
tree       34bab61dc4ce323b123ad4614dbc07e86ea2f9ef /mac-x86_64
Initial commit of BoringSSL for Android.
Diffstat (limited to 'mac-x86_64')
-rw-r--r--  mac-x86_64/crypto/aes/aes-x86_64.S            2535
-rw-r--r--  mac-x86_64/crypto/aes/aesni-x86_64.S          3178
-rw-r--r--  mac-x86_64/crypto/aes/bsaes-x86_64.S          2504
-rw-r--r--  mac-x86_64/crypto/aes/vpaes-x86_64.S           834
-rw-r--r--  mac-x86_64/crypto/bn/modexp512-x86_64.S       1776
-rw-r--r--  mac-x86_64/crypto/bn/rsaz-avx2.S                34
-rw-r--r--  mac-x86_64/crypto/bn/rsaz-x86_64.S            1126
-rw-r--r--  mac-x86_64/crypto/bn/x86_64-mont.S             726
-rw-r--r--  mac-x86_64/crypto/bn/x86_64-mont5.S           1822
-rw-r--r--  mac-x86_64/crypto/cpu-x86_64-asm.S             147
-rw-r--r--  mac-x86_64/crypto/md5/md5-x86_64.S             671
-rw-r--r--  mac-x86_64/crypto/modes/aesni-gcm-x86_64.S      19
-rw-r--r--  mac-x86_64/crypto/modes/ghash-x86_64.S        1328
-rw-r--r--  mac-x86_64/crypto/rc4/rc4-md5-x86_64.S        1262
-rw-r--r--  mac-x86_64/crypto/rc4/rc4-x86_64.S             622
-rw-r--r--  mac-x86_64/crypto/sha/sha1-x86_64.S           2425
-rw-r--r--  mac-x86_64/crypto/sha/sha256-x86_64.S         2843
-rw-r--r--  mac-x86_64/crypto/sha/sha512-x86_64.S         1786
18 files changed, 25638 insertions, 0 deletions
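
Note on the new AES assembly: aes-x86_64.S below exports _asm_AES_set_encrypt_key, _asm_AES_set_decrypt_key, _asm_AES_encrypt, _asm_AES_decrypt, and _asm_AES_cbc_encrypt. As a reading aid only, here is a minimal sketch of the C prototypes these entry points are conventionally given; the actual declarations live in BoringSSL's C sources, not in this patch, so the parameter names and types are assumptions inferred from the System V argument registers (rdi, rsi, rdx, rcx, r8, r9) used below and from the round count read at offset 240 of the key schedule.

/* Assumed prototypes -- inferred from the assembly, not taken from this patch. */
#include <stddef.h>
#include <stdint.h>
#include <openssl/aes.h>   /* AES_KEY */

int  asm_AES_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey);
int  asm_AES_set_decrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey);
void asm_AES_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void asm_AES_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void asm_AES_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                         const AES_KEY *key, uint8_t *ivec, int enc);
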
diff --git a/mac-x86_64/crypto/aes/aes-x86_64.S b/mac-x86_64/crypto/aes/aes-x86_64.S
new file mode 100644
index 0000000..02f378b
--- /dev/null
+++ b/mac-x86_64/crypto/aes/aes-x86_64.S
@@ -0,0 +1,2535 @@
+#if defined(__x86_64__)
+.text
+
+.p2align 4
+_x86_64_AES_encrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp L$enc_loop
+.p2align 4
+L$enc_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ shrl $16,%ecx
+ movzbl %ah,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movl 12(%r15),%edx
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rdi,8),%r12d
+ xorl 1(%r14,%rbp,8),%r8d
+
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz L$enc_loop
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl 2(%r14,%rsi,8),%r10d
+ movzbl 2(%r14,%rdi,8),%r11d
+ movzbl 2(%r14,%rbp,8),%r12d
+
+ movzbl %dl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl 2(%r14,%rsi,8),%r8d
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $65280,%edi
+ andl $65280,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%ecx
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ shrl $16,%edx
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+
+ andl $65280,%esi
+ andl $65280,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%eax
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 0(%r14,%rdi,8),%edi
+ movl 0(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $16711680,%edi
+ andl $16711680,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movl 0(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 2(%r14,%rbp,8),%ebp
+
+ andl $16711680,%esi
+ andl $4278190080,%edi
+ andl $4278190080,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movl 16+12(%r15),%edx
+ movl 2(%r14,%rsi,8),%esi
+ movl 2(%r14,%rdi,8),%edi
+ movl 16+0(%r15),%eax
+
+ andl $4278190080,%esi
+ andl $4278190080,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+
+
+.p2align 4
+_x86_64_AES_encrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp L$enc_loop_compact
+.p2align 4
+L$enc_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl %dl,%r8d
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ shrl $16,%ecx
+ movzbl %dh,%ebp
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+ movzbl (%r14,%r8,1),%r8d
+
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl %ah,%esi
+ movzbl (%r14,%rdi,1),%r13d
+ movzbl %cl,%edi
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+
+ shll $8,%r9d
+ shrl $16,%edx
+ shll $8,%r13d
+ xorl %r9d,%r10d
+ shrl $16,%eax
+ movzbl %dl,%r9d
+ shrl $16,%ebx
+ xorl %r13d,%r11d
+ shll $8,%ebp
+ movzbl %al,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %ebp,%r12d
+
+ shll $8,%esi
+ movzbl %bl,%ebp
+ shll $16,%edi
+ xorl %esi,%r8d
+ movzbl (%r14,%r9,1),%r9d
+ movzbl %dh,%esi
+ movzbl (%r14,%r13,1),%r13d
+ xorl %edi,%r10d
+
+ shrl $8,%ecx
+ movzbl %ah,%edi
+ shll $16,%r9d
+ shrl $8,%ebx
+ shll $16,%r13d
+ xorl %r9d,%r11d
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rcx,1),%edx
+ movzbl (%r14,%rbx,1),%ecx
+
+ shll $16,%ebp
+ xorl %r13d,%r12d
+ shll $24,%esi
+ xorl %ebp,%r8d
+ shll $24,%edi
+ xorl %esi,%r10d
+ shll $24,%edx
+ xorl %edi,%r11d
+ shll $24,%ecx
+ movl %r10d,%eax
+ movl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je L$enc_compact_done
+ movl $2155905152,%r10d
+ movl $2155905152,%r11d
+ andl %eax,%r10d
+ andl %ebx,%r11d
+ movl %r10d,%esi
+ movl %r11d,%edi
+ shrl $7,%r10d
+ leal (%rax,%rax,1),%r8d
+ shrl $7,%r11d
+ leal (%rbx,%rbx,1),%r9d
+ subl %r10d,%esi
+ subl %r11d,%edi
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %eax,%r10d
+ movl %ebx,%r11d
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl $2155905152,%r12d
+ roll $24,%eax
+ movl $2155905152,%ebp
+ roll $24,%ebx
+ andl %ecx,%r12d
+ andl %edx,%ebp
+ xorl %r8d,%eax
+ xorl %r9d,%ebx
+ movl %r12d,%esi
+ rorl $16,%r10d
+ movl %ebp,%edi
+ rorl $16,%r11d
+ leal (%rcx,%rcx,1),%r8d
+ shrl $7,%r12d
+ xorl %r10d,%eax
+ shrl $7,%ebp
+ xorl %r11d,%ebx
+ rorl $8,%r10d
+ leal (%rdx,%rdx,1),%r9d
+ rorl $8,%r11d
+ subl %r12d,%esi
+ subl %ebp,%edi
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+
+ andl $4278124286,%r8d
+ andl $4278124286,%r9d
+ andl $454761243,%esi
+ andl $454761243,%edi
+ movl %ecx,%r12d
+ movl %edx,%ebp
+ xorl %esi,%r8d
+ xorl %edi,%r9d
+
+ rorl $16,%r12d
+ xorl %r8d,%ecx
+ rorl $16,%ebp
+ xorl %r9d,%edx
+ roll $24,%ecx
+ movl 0(%r14),%esi
+ roll $24,%edx
+ xorl %r8d,%ecx
+ movl 64(%r14),%edi
+ xorl %r9d,%edx
+ movl 128(%r14),%r8d
+ xorl %r12d,%ecx
+ rorl $8,%r12d
+ xorl %ebp,%edx
+ rorl $8,%ebp
+ xorl %r12d,%ecx
+ movl 192(%r14),%r9d
+ xorl %ebp,%edx
+ jmp L$enc_loop_compact
+.p2align 4
+L$enc_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+
+.p2align 4
+.globl _asm_AES_encrypt
+.private_extern _asm_AES_encrypt
+
+.private_extern _asm_AES_encrypt
+_asm_AES_encrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+L$enc_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq L$AES_Te+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$enc_epilogue:
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_x86_64_AES_decrypt:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+
+ movl 240(%r15),%r13d
+ subl $1,%r13d
+ jmp L$dec_loop
+.p2align 4
+L$dec_loop:
+
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movl 0(%r14,%rsi,8),%r10d
+ movl 0(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r12d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movzbl %dl,%ebp
+ xorl 3(%r14,%rsi,8),%r10d
+ xorl 3(%r14,%rdi,8),%r11d
+ movl 0(%r14,%rbp,8),%r8d
+
+ movzbl %bh,%esi
+ shrl $16,%eax
+ movzbl %ch,%ebp
+ xorl 3(%r14,%rsi,8),%r12d
+ shrl $16,%edx
+ xorl 3(%r14,%rbp,8),%r8d
+
+ shrl $16,%ebx
+ leaq 16(%r15),%r15
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ xorl 2(%r14,%rsi,8),%r10d
+ xorl 2(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r12d
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ movzbl %bl,%ebp
+ xorl 1(%r14,%rsi,8),%r10d
+ xorl 1(%r14,%rdi,8),%r11d
+ xorl 2(%r14,%rbp,8),%r8d
+
+ movzbl %dh,%esi
+ movl 12(%r15),%edx
+ movzbl %ah,%ebp
+ xorl 1(%r14,%rsi,8),%r12d
+ movl 0(%r15),%eax
+ xorl 1(%r14,%rbp,8),%r8d
+
+ xorl %r10d,%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ xorl %r12d,%ecx
+ xorl %r11d,%ebx
+ xorl %r8d,%edx
+ subl $1,%r13d
+ jnz L$dec_loop
+ leaq 2048(%r14),%r14
+ movzbl %al,%esi
+ movzbl %bl,%edi
+ movzbl %cl,%ebp
+ movzbl (%r14,%rsi,1),%r10d
+ movzbl (%r14,%rdi,1),%r11d
+ movzbl (%r14,%rbp,1),%r12d
+
+ movzbl %dl,%esi
+ movzbl %dh,%edi
+ movzbl %ah,%ebp
+ movzbl (%r14,%rsi,1),%r8d
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $8,%edi
+ shll $8,%ebp
+
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+ shrl $16,%edx
+
+ movzbl %bh,%esi
+ movzbl %ch,%edi
+ shrl $16,%eax
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+
+ shll $8,%esi
+ shll $8,%edi
+ shrl $16,%ebx
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+ shrl $16,%ecx
+
+ movzbl %cl,%esi
+ movzbl %dl,%edi
+ movzbl %al,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $16,%edi
+ shll $16,%ebp
+
+ xorl %esi,%r10d
+ xorl %edi,%r11d
+ xorl %ebp,%r12d
+
+ movzbl %bl,%esi
+ movzbl %bh,%edi
+ movzbl %ch,%ebp
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movzbl (%r14,%rbp,1),%ebp
+
+ shll $16,%esi
+ shll $24,%edi
+ shll $24,%ebp
+
+ xorl %esi,%r8d
+ xorl %edi,%r10d
+ xorl %ebp,%r11d
+
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ movl 16+12(%r15),%edx
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%edi
+ movl 16+0(%r15),%eax
+
+ shll $24,%esi
+ shll $24,%edi
+
+ xorl %esi,%r12d
+ xorl %edi,%r8d
+
+ movl 16+4(%r15),%ebx
+ movl 16+8(%r15),%ecx
+ leaq -2048(%r14),%r14
+ xorl %r10d,%eax
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+.byte 0xf3,0xc3
+
+
+.p2align 4
+_x86_64_AES_decrypt_compact:
+ leaq 128(%r14),%r8
+ movl 0-128(%r8),%edi
+ movl 32-128(%r8),%ebp
+ movl 64-128(%r8),%r10d
+ movl 96-128(%r8),%r11d
+ movl 128-128(%r8),%edi
+ movl 160-128(%r8),%ebp
+ movl 192-128(%r8),%r10d
+ movl 224-128(%r8),%r11d
+ jmp L$dec_loop_compact
+
+.p2align 4
+L$dec_loop_compact:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+ leaq 16(%r15),%r15
+ movzbl %al,%r10d
+ movzbl %bl,%r11d
+ movzbl %cl,%r12d
+ movzbl %dl,%r8d
+ movzbl %dh,%esi
+ movzbl %ah,%edi
+ shrl $16,%edx
+ movzbl %bh,%ebp
+ movzbl (%r14,%r10,1),%r10d
+ movzbl (%r14,%r11,1),%r11d
+ movzbl (%r14,%r12,1),%r12d
+ movzbl (%r14,%r8,1),%r8d
+
+ movzbl (%r14,%rsi,1),%r9d
+ movzbl %ch,%esi
+ movzbl (%r14,%rdi,1),%r13d
+ movzbl (%r14,%rbp,1),%ebp
+ movzbl (%r14,%rsi,1),%esi
+
+ shrl $16,%ecx
+ shll $8,%r13d
+ shll $8,%r9d
+ movzbl %cl,%edi
+ shrl $16,%eax
+ xorl %r9d,%r10d
+ shrl $16,%ebx
+ movzbl %dl,%r9d
+
+ shll $8,%ebp
+ xorl %r13d,%r11d
+ shll $8,%esi
+ movzbl %al,%r13d
+ movzbl (%r14,%rdi,1),%edi
+ xorl %ebp,%r12d
+ movzbl %bl,%ebp
+
+ shll $16,%edi
+ xorl %esi,%r8d
+ movzbl (%r14,%r9,1),%r9d
+ movzbl %bh,%esi
+ movzbl (%r14,%rbp,1),%ebp
+ xorl %edi,%r10d
+ movzbl (%r14,%r13,1),%r13d
+ movzbl %ch,%edi
+
+ shll $16,%ebp
+ shll $16,%r9d
+ shll $16,%r13d
+ xorl %ebp,%r8d
+ movzbl %dh,%ebp
+ xorl %r9d,%r11d
+ shrl $8,%eax
+ xorl %r13d,%r12d
+
+ movzbl (%r14,%rsi,1),%esi
+ movzbl (%r14,%rdi,1),%ebx
+ movzbl (%r14,%rbp,1),%ecx
+ movzbl (%r14,%rax,1),%edx
+
+ movl %r10d,%eax
+ shll $24,%esi
+ shll $24,%ebx
+ shll $24,%ecx
+ xorl %esi,%eax
+ shll $24,%edx
+ xorl %r11d,%ebx
+ xorl %r12d,%ecx
+ xorl %r8d,%edx
+ cmpq 16(%rsp),%r15
+ je L$dec_compact_done
+
+ movq 256+0(%r14),%rsi
+ shlq $32,%rbx
+ shlq $32,%rdx
+ movq 256+8(%r14),%rdi
+ orq %rbx,%rax
+ orq %rdx,%rcx
+ movq 256+16(%r14),%rbp
+ movq %rsi,%r9
+ movq %rsi,%r12
+ andq %rax,%r9
+ andq %rcx,%r12
+ movq %r9,%rbx
+ movq %r12,%rdx
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r8
+ xorq %rdx,%r11
+ movq %rsi,%r10
+ movq %rsi,%r13
+
+ andq %r8,%r10
+ andq %r11,%r13
+ movq %r10,%rbx
+ movq %r13,%rdx
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r9
+ xorq %rdx,%r12
+ movq %rsi,%r10
+ movq %rsi,%r13
+
+ andq %r9,%r10
+ andq %r12,%r13
+ movq %r10,%rbx
+ movq %r13,%rdx
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ shrq $32,%rbx
+ xorq %r13,%r12
+ shrq $32,%rdx
+ xorq %r8,%r10
+ roll $8,%eax
+ xorq %r11,%r13
+ roll $8,%ecx
+ xorq %r9,%r10
+ roll $8,%ebx
+ xorq %r12,%r13
+
+ roll $8,%edx
+ xorl %r10d,%eax
+ shrq $32,%r10
+ xorl %r13d,%ecx
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ roll $24,%r8d
+ movq %r11,%r13
+ roll $24,%r11d
+ shrq $32,%r10
+ xorl %r8d,%eax
+ shrq $32,%r13
+ xorl %r11d,%ecx
+ roll $24,%r10d
+ movq %r9,%r8
+ roll $24,%r13d
+ movq %r12,%r11
+ shrq $32,%r8
+ xorl %r10d,%ebx
+ shrq $32,%r11
+ xorl %r13d,%edx
+
+ movq 0(%r14),%rsi
+ roll $16,%r9d
+ movq 64(%r14),%rdi
+ roll $16,%r12d
+ movq 128(%r14),%rbp
+ roll $16,%r8d
+ movq 192(%r14),%r10
+ xorl %r9d,%eax
+ roll $16,%r11d
+ xorl %r12d,%ecx
+ movq 256(%r14),%r13
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ jmp L$dec_loop_compact
+.p2align 4
+L$dec_compact_done:
+ xorl 0(%r15),%eax
+ xorl 4(%r15),%ebx
+ xorl 8(%r15),%ecx
+ xorl 12(%r15),%edx
+.byte 0xf3,0xc3
+
+.p2align 4
+.globl _asm_AES_decrypt
+.private_extern _asm_AES_decrypt
+
+.private_extern _asm_AES_decrypt
+_asm_AES_decrypt:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r10
+ leaq -63(%rdx),%rcx
+ andq $-64,%rsp
+ subq %rsp,%rcx
+ negq %rcx
+ andq $960,%rcx
+ subq %rcx,%rsp
+ subq $32,%rsp
+
+ movq %rsi,16(%rsp)
+ movq %r10,24(%rsp)
+L$dec_prologue:
+
+ movq %rdx,%r15
+ movl 240(%r15),%r13d
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+
+ shll $4,%r13d
+ leaq (%r15,%r13,1),%rbp
+ movq %r15,(%rsp)
+ movq %rbp,8(%rsp)
+
+
+ leaq L$AES_Td+2048(%rip),%r14
+ leaq 768(%rsp),%rbp
+ subq %r14,%rbp
+ andq $768,%rbp
+ leaq (%r14,%rbp,1),%r14
+ shrq $3,%rbp
+ addq %rbp,%r14
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 16(%rsp),%r9
+ movq 24(%rsp),%rsi
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$dec_epilogue:
+ .byte 0xf3,0xc3
+
+.p2align 4
+.globl _asm_AES_set_encrypt_key
+.private_extern _asm_AES_set_encrypt_key
+
+_asm_AES_set_encrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $8,%rsp
+L$enc_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+L$enc_key_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 4
+_x86_64_AES_set_encrypt_key:
+ movl %esi,%ecx
+ movq %rdi,%rsi
+ movq %rdx,%rdi
+
+ testq $-1,%rsi
+ jz L$badpointer
+ testq $-1,%rdi
+ jz L$badpointer
+
+ leaq L$AES_Te(%rip),%rbp
+ leaq 2048+128(%rbp),%rbp
+
+
+ movl 0-128(%rbp),%eax
+ movl 32-128(%rbp),%ebx
+ movl 64-128(%rbp),%r8d
+ movl 96-128(%rbp),%edx
+ movl 128-128(%rbp),%eax
+ movl 160-128(%rbp),%ebx
+ movl 192-128(%rbp),%r8d
+ movl 224-128(%rbp),%edx
+
+ cmpl $128,%ecx
+ je L$10rounds
+ cmpl $192,%ecx
+ je L$12rounds
+ cmpl $256,%ecx
+ je L$14rounds
+ movq $-2,%rax
+ jmp L$exit
+
+L$10rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rdx,8(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp L$10shortcut
+.p2align 2
+L$10loop:
+ movl 0(%rdi),%eax
+ movl 12(%rdi),%edx
+L$10shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,16(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,20(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,24(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,28(%rdi)
+ addl $1,%ecx
+ leaq 16(%rdi),%rdi
+ cmpl $10,%ecx
+ jl L$10loop
+
+ movl $10,80(%rdi)
+ xorq %rax,%rax
+ jmp L$exit
+
+L$12rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rdx,16(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp L$12shortcut
+.p2align 2
+L$12loop:
+ movl 0(%rdi),%eax
+ movl 20(%rdi),%edx
+L$12shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,24(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,28(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,32(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,36(%rdi)
+
+ cmpl $7,%ecx
+ je L$12break
+ addl $1,%ecx
+
+ xorl 16(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ leaq 24(%rdi),%rdi
+ jmp L$12loop
+L$12break:
+ movl $12,72(%rdi)
+ xorq %rax,%rax
+ jmp L$exit
+
+L$14rounds:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+
+ shrq $32,%rdx
+ xorl %ecx,%ecx
+ jmp L$14shortcut
+.p2align 2
+L$14loop:
+ movl 0(%rdi),%eax
+ movl 28(%rdi),%edx
+L$14shortcut:
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $8,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ xorl 1024-128(%rbp,%rcx,4),%eax
+ movl %eax,32(%rdi)
+ xorl 4(%rdi),%eax
+ movl %eax,36(%rdi)
+ xorl 8(%rdi),%eax
+ movl %eax,40(%rdi)
+ xorl 12(%rdi),%eax
+ movl %eax,44(%rdi)
+
+ cmpl $6,%ecx
+ je L$14break
+ addl $1,%ecx
+
+ movl %eax,%edx
+ movl 16(%rdi),%eax
+ movzbl %dl,%esi
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shrl $16,%edx
+ shll $8,%ebx
+ movzbl %dl,%esi
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ movzbl %dh,%esi
+ shll $16,%ebx
+ xorl %ebx,%eax
+
+ movzbl -128(%rbp,%rsi,1),%ebx
+ shll $24,%ebx
+ xorl %ebx,%eax
+
+ movl %eax,48(%rdi)
+ xorl 20(%rdi),%eax
+ movl %eax,52(%rdi)
+ xorl 24(%rdi),%eax
+ movl %eax,56(%rdi)
+ xorl 28(%rdi),%eax
+ movl %eax,60(%rdi)
+
+ leaq 32(%rdi),%rdi
+ jmp L$14loop
+L$14break:
+ movl $14,48(%rdi)
+ xorq %rax,%rax
+ jmp L$exit
+
+L$badpointer:
+ movq $-1,%rax
+L$exit:
+.byte 0xf3,0xc3
+
+.p2align 4
+.globl _asm_AES_set_decrypt_key
+.private_extern _asm_AES_set_decrypt_key
+
+_asm_AES_set_decrypt_key:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdx
+L$dec_key_prologue:
+
+ call _x86_64_AES_set_encrypt_key
+ movq (%rsp),%r8
+ cmpl $0,%eax
+ jne L$abort
+
+ movl 240(%r8),%r14d
+ xorq %rdi,%rdi
+ leaq (%rdi,%r14,4),%rcx
+ movq %r8,%rsi
+ leaq (%r8,%rcx,4),%rdi
+.p2align 2
+L$invert:
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 0(%rdi),%rcx
+ movq 8(%rdi),%rdx
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,0(%rsi)
+ movq %rdx,8(%rsi)
+ leaq 16(%rsi),%rsi
+ leaq -16(%rdi),%rdi
+ cmpq %rsi,%rdi
+ jne L$invert
+
+ leaq L$AES_Te+2048+1024(%rip),%rax
+
+ movq 40(%rax),%rsi
+ movq 48(%rax),%rdi
+ movq 56(%rax),%rbp
+
+ movq %r8,%r15
+ subl $1,%r14d
+.p2align 2
+L$permute:
+ leaq 16(%r15),%r15
+ movq 0(%r15),%rax
+ movq 8(%r15),%rcx
+ movq %rsi,%r9
+ movq %rsi,%r12
+ andq %rax,%r9
+ andq %rcx,%r12
+ movq %r9,%rbx
+ movq %r12,%rdx
+ shrq $7,%r9
+ leaq (%rax,%rax,1),%r8
+ shrq $7,%r12
+ leaq (%rcx,%rcx,1),%r11
+ subq %r9,%rbx
+ subq %r12,%rdx
+ andq %rdi,%r8
+ andq %rdi,%r11
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r8
+ xorq %rdx,%r11
+ movq %rsi,%r10
+ movq %rsi,%r13
+
+ andq %r8,%r10
+ andq %r11,%r13
+ movq %r10,%rbx
+ movq %r13,%rdx
+ shrq $7,%r10
+ leaq (%r8,%r8,1),%r9
+ shrq $7,%r13
+ leaq (%r11,%r11,1),%r12
+ subq %r10,%rbx
+ subq %r13,%rdx
+ andq %rdi,%r9
+ andq %rdi,%r12
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r9
+ xorq %rdx,%r12
+ movq %rsi,%r10
+ movq %rsi,%r13
+
+ andq %r9,%r10
+ andq %r12,%r13
+ movq %r10,%rbx
+ movq %r13,%rdx
+ shrq $7,%r10
+ xorq %rax,%r8
+ shrq $7,%r13
+ xorq %rcx,%r11
+ subq %r10,%rbx
+ subq %r13,%rdx
+ leaq (%r9,%r9,1),%r10
+ leaq (%r12,%r12,1),%r13
+ xorq %rax,%r9
+ xorq %rcx,%r12
+ andq %rdi,%r10
+ andq %rdi,%r13
+ andq %rbp,%rbx
+ andq %rbp,%rdx
+ xorq %rbx,%r10
+ xorq %rdx,%r13
+
+ xorq %r10,%rax
+ xorq %r13,%rcx
+ xorq %r10,%r8
+ xorq %r13,%r11
+ movq %rax,%rbx
+ movq %rcx,%rdx
+ xorq %r10,%r9
+ shrq $32,%rbx
+ xorq %r13,%r12
+ shrq $32,%rdx
+ xorq %r8,%r10
+ roll $8,%eax
+ xorq %r11,%r13
+ roll $8,%ecx
+ xorq %r9,%r10
+ roll $8,%ebx
+ xorq %r12,%r13
+
+ roll $8,%edx
+ xorl %r10d,%eax
+ shrq $32,%r10
+ xorl %r13d,%ecx
+ shrq $32,%r13
+ xorl %r10d,%ebx
+ xorl %r13d,%edx
+
+ movq %r8,%r10
+ roll $24,%r8d
+ movq %r11,%r13
+ roll $24,%r11d
+ shrq $32,%r10
+ xorl %r8d,%eax
+ shrq $32,%r13
+ xorl %r11d,%ecx
+ roll $24,%r10d
+ movq %r9,%r8
+ roll $24,%r13d
+ movq %r12,%r11
+ shrq $32,%r8
+ xorl %r10d,%ebx
+ shrq $32,%r11
+ xorl %r13d,%edx
+
+
+ roll $16,%r9d
+
+ roll $16,%r12d
+
+ roll $16,%r8d
+
+ xorl %r9d,%eax
+ roll $16,%r11d
+ xorl %r12d,%ecx
+
+ xorl %r8d,%ebx
+ xorl %r11d,%edx
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ subl $1,%r14d
+ jnz L$permute
+
+ xorq %rax,%rax
+L$abort:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbp
+ movq 48(%rsp),%rbx
+ addq $56,%rsp
+L$dec_key_epilogue:
+ .byte 0xf3,0xc3
+
+.p2align 4
+.globl _asm_AES_cbc_encrypt
+.private_extern _asm_AES_cbc_encrypt
+
+
+.private_extern _asm_AES_cbc_encrypt
+_asm_AES_cbc_encrypt:
+ cmpq $0,%rdx
+ je L$cbc_epilogue
+ pushfq
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+L$cbc_prologue:
+
+ cld
+ movl %r9d,%r9d
+
+ leaq L$AES_Te(%rip),%r14
+ cmpq $0,%r9
+ jne L$cbc_picked_te
+ leaq L$AES_Td(%rip),%r14
+L$cbc_picked_te:
+
+ movl _OPENSSL_ia32cap_P(%rip),%r10d
+ cmpq $512,%rdx
+ jb L$cbc_slow_prologue
+ testq $15,%rdx
+ jnz L$cbc_slow_prologue
+ btl $28,%r10d
+ jc L$cbc_slow_prologue
+
+
+ leaq -88-248(%rsp),%r15
+ andq $-64,%r15
+
+
+ movq %r14,%r10
+ leaq 2304(%r14),%r11
+ movq %r15,%r12
+ andq $4095,%r10
+ andq $4095,%r11
+ andq $4095,%r12
+
+ cmpq %r11,%r12
+ jb L$cbc_te_break_out
+ subq %r11,%r12
+ subq %r12,%r15
+ jmp L$cbc_te_ok
+L$cbc_te_break_out:
+ subq %r10,%r12
+ andq $4095,%r12
+ addq $320,%r12
+ subq %r12,%r15
+.p2align 2
+L$cbc_te_ok:
+
+ xchgq %rsp,%r15
+
+ movq %r15,16(%rsp)
+L$cbc_fast_body:
+ movq %rdi,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rdx,40(%rsp)
+ movq %rcx,48(%rsp)
+ movq %r8,56(%rsp)
+ movl $0,80+240(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+
+ movl 240(%r15),%eax
+
+ movq %r15,%r10
+ subq %r14,%r10
+ andq $4095,%r10
+ cmpq $2304,%r10
+ jb L$cbc_do_ecopy
+ cmpq $4096-248,%r10
+ jb L$cbc_skip_ecopy
+.p2align 2
+L$cbc_do_ecopy:
+ movq %r15,%rsi
+ leaq 80(%rsp),%rdi
+ leaq 80(%rsp),%r15
+ movl $30,%ecx
+.long 0x90A548F3
+ movl %eax,(%rdi)
+L$cbc_skip_ecopy:
+ movq %r15,0(%rsp)
+
+ movl $18,%ecx
+.p2align 2
+L$cbc_prefetch_te:
+ movq 0(%r14),%r10
+ movq 32(%r14),%r11
+ movq 64(%r14),%r12
+ movq 96(%r14),%r13
+ leaq 128(%r14),%r14
+ subl $1,%ecx
+ jnz L$cbc_prefetch_te
+ leaq -2304(%r14),%r14
+
+ cmpq $0,%rbx
+ je L$FAST_DECRYPT
+
+
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+.p2align 2
+L$cbc_fast_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_encrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ movq %r10,40(%rsp)
+ jnz L$cbc_fast_enc_loop
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp L$cbc_fast_cleanup
+
+
+.p2align 4
+L$FAST_DECRYPT:
+ cmpq %r8,%r9
+ je L$cbc_fast_dec_in_place
+
+ movq %rbp,64(%rsp)
+.p2align 2
+L$cbc_fast_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 64(%rsp),%rbp
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0(%rbp),%eax
+ xorl 4(%rbp),%ebx
+ xorl 8(%rbp),%ecx
+ xorl 12(%rbp),%edx
+ movq %r8,%rbp
+
+ subq $16,%r10
+ movq %r10,40(%rsp)
+ movq %rbp,64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jnz L$cbc_fast_dec_loop
+ movq 56(%rsp),%r12
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0(%r12)
+ movq %r11,8(%r12)
+ jmp L$cbc_fast_cleanup
+
+.p2align 4
+L$cbc_fast_dec_in_place:
+ movq 0(%rbp),%r10
+ movq 8(%rbp),%r11
+ movq %r10,0+64(%rsp)
+ movq %r11,8+64(%rsp)
+.p2align 2
+L$cbc_fast_dec_in_place_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+
+ call _x86_64_AES_decrypt
+
+ movq 24(%rsp),%r8
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jz L$cbc_fast_dec_in_place_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ movq %r10,40(%rsp)
+ jmp L$cbc_fast_dec_in_place_loop
+L$cbc_fast_dec_in_place_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+.p2align 2
+L$cbc_fast_cleanup:
+ cmpl $0,80+240(%rsp)
+ leaq 80(%rsp),%rdi
+ je L$cbc_exit
+ movl $30,%ecx
+ xorq %rax,%rax
+.long 0x90AB48F3
+
+ jmp L$cbc_exit
+
+
+.p2align 4
+L$cbc_slow_prologue:
+
+ leaq -88(%rsp),%rbp
+ andq $-64,%rbp
+
+ leaq -88-63(%rcx),%r10
+ subq %rbp,%r10
+ negq %r10
+ andq $960,%r10
+ subq %r10,%rbp
+
+ xchgq %rsp,%rbp
+
+ movq %rbp,16(%rsp)
+L$cbc_slow_body:
+
+
+
+
+ movq %r8,56(%rsp)
+ movq %r8,%rbp
+ movq %r9,%rbx
+ movq %rsi,%r9
+ movq %rdi,%r8
+ movq %rcx,%r15
+ movq %rdx,%r10
+
+ movl 240(%r15),%eax
+ movq %r15,0(%rsp)
+ shll $4,%eax
+ leaq (%r15,%rax,1),%rax
+ movq %rax,8(%rsp)
+
+
+ leaq 2048(%r14),%r14
+ leaq 768-8(%rsp),%rax
+ subq %r14,%rax
+ andq $768,%rax
+ leaq (%r14,%rax,1),%r14
+
+ cmpq $0,%rbx
+ je L$SLOW_DECRYPT
+
+
+ testq $-16,%r10
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+ jz L$cbc_slow_enc_tail
+
+.p2align 2
+L$cbc_slow_enc_loop:
+ xorl 0(%r8),%eax
+ xorl 4(%r8),%ebx
+ xorl 8(%r8),%ecx
+ xorl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_encrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ subq $16,%r10
+ testq $-16,%r10
+ jnz L$cbc_slow_enc_loop
+ testq $15,%r10
+ jnz L$cbc_slow_enc_tail
+ movq 56(%rsp),%rbp
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ jmp L$cbc_exit
+
+.p2align 2
+L$cbc_slow_enc_tail:
+ movq %rax,%r11
+ movq %rcx,%r12
+ movq %r10,%rcx
+ movq %r8,%rsi
+ movq %r9,%rdi
+.long 0x9066A4F3
+ movq $16,%rcx
+ subq %r10,%rcx
+ xorq %rax,%rax
+.long 0x9066AAF3
+ movq %r9,%r8
+ movq $16,%r10
+ movq %r11,%rax
+ movq %r12,%rcx
+ jmp L$cbc_slow_enc_loop
+
+.p2align 4
+L$SLOW_DECRYPT:
+ shrq $3,%rax
+ addq %rax,%r14
+
+ movq 0(%rbp),%r11
+ movq 8(%rbp),%r12
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+.p2align 2
+L$cbc_slow_dec_loop:
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movq 0(%rsp),%r15
+ movq %r8,24(%rsp)
+ movq %r9,32(%rsp)
+ movq %r10,40(%rsp)
+
+ call _x86_64_AES_decrypt_compact
+
+ movq 24(%rsp),%r8
+ movq 32(%rsp),%r9
+ movq 40(%rsp),%r10
+ xorl 0+64(%rsp),%eax
+ xorl 4+64(%rsp),%ebx
+ xorl 8+64(%rsp),%ecx
+ xorl 12+64(%rsp),%edx
+
+ movq 0(%r8),%r11
+ movq 8(%r8),%r12
+ subq $16,%r10
+ jc L$cbc_slow_dec_partial
+ jz L$cbc_slow_dec_done
+
+ movq %r11,0+64(%rsp)
+ movq %r12,8+64(%rsp)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ leaq 16(%r8),%r8
+ leaq 16(%r9),%r9
+ jmp L$cbc_slow_dec_loop
+L$cbc_slow_dec_done:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0(%r9)
+ movl %ebx,4(%r9)
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+
+ jmp L$cbc_exit
+
+.p2align 2
+L$cbc_slow_dec_partial:
+ movq 56(%rsp),%rdi
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+
+ movl %eax,0+64(%rsp)
+ movl %ebx,4+64(%rsp)
+ movl %ecx,8+64(%rsp)
+ movl %edx,12+64(%rsp)
+
+ movq %r9,%rdi
+ leaq 64(%rsp),%rsi
+ leaq 16(%r10),%rcx
+.long 0x9066A4F3
+ jmp L$cbc_exit
+
+.p2align 4
+L$cbc_exit:
+ movq 16(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$cbc_popfq:
+ popfq
+L$cbc_epilogue:
+ .byte 0xf3,0xc3
+
+.p2align 6
+L$AES_Te:
+.long 0xa56363c6,0xa56363c6
+.long 0x847c7cf8,0x847c7cf8
+.long 0x997777ee,0x997777ee
+.long 0x8d7b7bf6,0x8d7b7bf6
+.long 0x0df2f2ff,0x0df2f2ff
+.long 0xbd6b6bd6,0xbd6b6bd6
+.long 0xb16f6fde,0xb16f6fde
+.long 0x54c5c591,0x54c5c591
+.long 0x50303060,0x50303060
+.long 0x03010102,0x03010102
+.long 0xa96767ce,0xa96767ce
+.long 0x7d2b2b56,0x7d2b2b56
+.long 0x19fefee7,0x19fefee7
+.long 0x62d7d7b5,0x62d7d7b5
+.long 0xe6abab4d,0xe6abab4d
+.long 0x9a7676ec,0x9a7676ec
+.long 0x45caca8f,0x45caca8f
+.long 0x9d82821f,0x9d82821f
+.long 0x40c9c989,0x40c9c989
+.long 0x877d7dfa,0x877d7dfa
+.long 0x15fafaef,0x15fafaef
+.long 0xeb5959b2,0xeb5959b2
+.long 0xc947478e,0xc947478e
+.long 0x0bf0f0fb,0x0bf0f0fb
+.long 0xecadad41,0xecadad41
+.long 0x67d4d4b3,0x67d4d4b3
+.long 0xfda2a25f,0xfda2a25f
+.long 0xeaafaf45,0xeaafaf45
+.long 0xbf9c9c23,0xbf9c9c23
+.long 0xf7a4a453,0xf7a4a453
+.long 0x967272e4,0x967272e4
+.long 0x5bc0c09b,0x5bc0c09b
+.long 0xc2b7b775,0xc2b7b775
+.long 0x1cfdfde1,0x1cfdfde1
+.long 0xae93933d,0xae93933d
+.long 0x6a26264c,0x6a26264c
+.long 0x5a36366c,0x5a36366c
+.long 0x413f3f7e,0x413f3f7e
+.long 0x02f7f7f5,0x02f7f7f5
+.long 0x4fcccc83,0x4fcccc83
+.long 0x5c343468,0x5c343468
+.long 0xf4a5a551,0xf4a5a551
+.long 0x34e5e5d1,0x34e5e5d1
+.long 0x08f1f1f9,0x08f1f1f9
+.long 0x937171e2,0x937171e2
+.long 0x73d8d8ab,0x73d8d8ab
+.long 0x53313162,0x53313162
+.long 0x3f15152a,0x3f15152a
+.long 0x0c040408,0x0c040408
+.long 0x52c7c795,0x52c7c795
+.long 0x65232346,0x65232346
+.long 0x5ec3c39d,0x5ec3c39d
+.long 0x28181830,0x28181830
+.long 0xa1969637,0xa1969637
+.long 0x0f05050a,0x0f05050a
+.long 0xb59a9a2f,0xb59a9a2f
+.long 0x0907070e,0x0907070e
+.long 0x36121224,0x36121224
+.long 0x9b80801b,0x9b80801b
+.long 0x3de2e2df,0x3de2e2df
+.long 0x26ebebcd,0x26ebebcd
+.long 0x6927274e,0x6927274e
+.long 0xcdb2b27f,0xcdb2b27f
+.long 0x9f7575ea,0x9f7575ea
+.long 0x1b090912,0x1b090912
+.long 0x9e83831d,0x9e83831d
+.long 0x742c2c58,0x742c2c58
+.long 0x2e1a1a34,0x2e1a1a34
+.long 0x2d1b1b36,0x2d1b1b36
+.long 0xb26e6edc,0xb26e6edc
+.long 0xee5a5ab4,0xee5a5ab4
+.long 0xfba0a05b,0xfba0a05b
+.long 0xf65252a4,0xf65252a4
+.long 0x4d3b3b76,0x4d3b3b76
+.long 0x61d6d6b7,0x61d6d6b7
+.long 0xceb3b37d,0xceb3b37d
+.long 0x7b292952,0x7b292952
+.long 0x3ee3e3dd,0x3ee3e3dd
+.long 0x712f2f5e,0x712f2f5e
+.long 0x97848413,0x97848413
+.long 0xf55353a6,0xf55353a6
+.long 0x68d1d1b9,0x68d1d1b9
+.long 0x00000000,0x00000000
+.long 0x2cededc1,0x2cededc1
+.long 0x60202040,0x60202040
+.long 0x1ffcfce3,0x1ffcfce3
+.long 0xc8b1b179,0xc8b1b179
+.long 0xed5b5bb6,0xed5b5bb6
+.long 0xbe6a6ad4,0xbe6a6ad4
+.long 0x46cbcb8d,0x46cbcb8d
+.long 0xd9bebe67,0xd9bebe67
+.long 0x4b393972,0x4b393972
+.long 0xde4a4a94,0xde4a4a94
+.long 0xd44c4c98,0xd44c4c98
+.long 0xe85858b0,0xe85858b0
+.long 0x4acfcf85,0x4acfcf85
+.long 0x6bd0d0bb,0x6bd0d0bb
+.long 0x2aefefc5,0x2aefefc5
+.long 0xe5aaaa4f,0xe5aaaa4f
+.long 0x16fbfbed,0x16fbfbed
+.long 0xc5434386,0xc5434386
+.long 0xd74d4d9a,0xd74d4d9a
+.long 0x55333366,0x55333366
+.long 0x94858511,0x94858511
+.long 0xcf45458a,0xcf45458a
+.long 0x10f9f9e9,0x10f9f9e9
+.long 0x06020204,0x06020204
+.long 0x817f7ffe,0x817f7ffe
+.long 0xf05050a0,0xf05050a0
+.long 0x443c3c78,0x443c3c78
+.long 0xba9f9f25,0xba9f9f25
+.long 0xe3a8a84b,0xe3a8a84b
+.long 0xf35151a2,0xf35151a2
+.long 0xfea3a35d,0xfea3a35d
+.long 0xc0404080,0xc0404080
+.long 0x8a8f8f05,0x8a8f8f05
+.long 0xad92923f,0xad92923f
+.long 0xbc9d9d21,0xbc9d9d21
+.long 0x48383870,0x48383870
+.long 0x04f5f5f1,0x04f5f5f1
+.long 0xdfbcbc63,0xdfbcbc63
+.long 0xc1b6b677,0xc1b6b677
+.long 0x75dadaaf,0x75dadaaf
+.long 0x63212142,0x63212142
+.long 0x30101020,0x30101020
+.long 0x1affffe5,0x1affffe5
+.long 0x0ef3f3fd,0x0ef3f3fd
+.long 0x6dd2d2bf,0x6dd2d2bf
+.long 0x4ccdcd81,0x4ccdcd81
+.long 0x140c0c18,0x140c0c18
+.long 0x35131326,0x35131326
+.long 0x2fececc3,0x2fececc3
+.long 0xe15f5fbe,0xe15f5fbe
+.long 0xa2979735,0xa2979735
+.long 0xcc444488,0xcc444488
+.long 0x3917172e,0x3917172e
+.long 0x57c4c493,0x57c4c493
+.long 0xf2a7a755,0xf2a7a755
+.long 0x827e7efc,0x827e7efc
+.long 0x473d3d7a,0x473d3d7a
+.long 0xac6464c8,0xac6464c8
+.long 0xe75d5dba,0xe75d5dba
+.long 0x2b191932,0x2b191932
+.long 0x957373e6,0x957373e6
+.long 0xa06060c0,0xa06060c0
+.long 0x98818119,0x98818119
+.long 0xd14f4f9e,0xd14f4f9e
+.long 0x7fdcdca3,0x7fdcdca3
+.long 0x66222244,0x66222244
+.long 0x7e2a2a54,0x7e2a2a54
+.long 0xab90903b,0xab90903b
+.long 0x8388880b,0x8388880b
+.long 0xca46468c,0xca46468c
+.long 0x29eeeec7,0x29eeeec7
+.long 0xd3b8b86b,0xd3b8b86b
+.long 0x3c141428,0x3c141428
+.long 0x79dedea7,0x79dedea7
+.long 0xe25e5ebc,0xe25e5ebc
+.long 0x1d0b0b16,0x1d0b0b16
+.long 0x76dbdbad,0x76dbdbad
+.long 0x3be0e0db,0x3be0e0db
+.long 0x56323264,0x56323264
+.long 0x4e3a3a74,0x4e3a3a74
+.long 0x1e0a0a14,0x1e0a0a14
+.long 0xdb494992,0xdb494992
+.long 0x0a06060c,0x0a06060c
+.long 0x6c242448,0x6c242448
+.long 0xe45c5cb8,0xe45c5cb8
+.long 0x5dc2c29f,0x5dc2c29f
+.long 0x6ed3d3bd,0x6ed3d3bd
+.long 0xefacac43,0xefacac43
+.long 0xa66262c4,0xa66262c4
+.long 0xa8919139,0xa8919139
+.long 0xa4959531,0xa4959531
+.long 0x37e4e4d3,0x37e4e4d3
+.long 0x8b7979f2,0x8b7979f2
+.long 0x32e7e7d5,0x32e7e7d5
+.long 0x43c8c88b,0x43c8c88b
+.long 0x5937376e,0x5937376e
+.long 0xb76d6dda,0xb76d6dda
+.long 0x8c8d8d01,0x8c8d8d01
+.long 0x64d5d5b1,0x64d5d5b1
+.long 0xd24e4e9c,0xd24e4e9c
+.long 0xe0a9a949,0xe0a9a949
+.long 0xb46c6cd8,0xb46c6cd8
+.long 0xfa5656ac,0xfa5656ac
+.long 0x07f4f4f3,0x07f4f4f3
+.long 0x25eaeacf,0x25eaeacf
+.long 0xaf6565ca,0xaf6565ca
+.long 0x8e7a7af4,0x8e7a7af4
+.long 0xe9aeae47,0xe9aeae47
+.long 0x18080810,0x18080810
+.long 0xd5baba6f,0xd5baba6f
+.long 0x887878f0,0x887878f0
+.long 0x6f25254a,0x6f25254a
+.long 0x722e2e5c,0x722e2e5c
+.long 0x241c1c38,0x241c1c38
+.long 0xf1a6a657,0xf1a6a657
+.long 0xc7b4b473,0xc7b4b473
+.long 0x51c6c697,0x51c6c697
+.long 0x23e8e8cb,0x23e8e8cb
+.long 0x7cdddda1,0x7cdddda1
+.long 0x9c7474e8,0x9c7474e8
+.long 0x211f1f3e,0x211f1f3e
+.long 0xdd4b4b96,0xdd4b4b96
+.long 0xdcbdbd61,0xdcbdbd61
+.long 0x868b8b0d,0x868b8b0d
+.long 0x858a8a0f,0x858a8a0f
+.long 0x907070e0,0x907070e0
+.long 0x423e3e7c,0x423e3e7c
+.long 0xc4b5b571,0xc4b5b571
+.long 0xaa6666cc,0xaa6666cc
+.long 0xd8484890,0xd8484890
+.long 0x05030306,0x05030306
+.long 0x01f6f6f7,0x01f6f6f7
+.long 0x120e0e1c,0x120e0e1c
+.long 0xa36161c2,0xa36161c2
+.long 0x5f35356a,0x5f35356a
+.long 0xf95757ae,0xf95757ae
+.long 0xd0b9b969,0xd0b9b969
+.long 0x91868617,0x91868617
+.long 0x58c1c199,0x58c1c199
+.long 0x271d1d3a,0x271d1d3a
+.long 0xb99e9e27,0xb99e9e27
+.long 0x38e1e1d9,0x38e1e1d9
+.long 0x13f8f8eb,0x13f8f8eb
+.long 0xb398982b,0xb398982b
+.long 0x33111122,0x33111122
+.long 0xbb6969d2,0xbb6969d2
+.long 0x70d9d9a9,0x70d9d9a9
+.long 0x898e8e07,0x898e8e07
+.long 0xa7949433,0xa7949433
+.long 0xb69b9b2d,0xb69b9b2d
+.long 0x221e1e3c,0x221e1e3c
+.long 0x92878715,0x92878715
+.long 0x20e9e9c9,0x20e9e9c9
+.long 0x49cece87,0x49cece87
+.long 0xff5555aa,0xff5555aa
+.long 0x78282850,0x78282850
+.long 0x7adfdfa5,0x7adfdfa5
+.long 0x8f8c8c03,0x8f8c8c03
+.long 0xf8a1a159,0xf8a1a159
+.long 0x80898909,0x80898909
+.long 0x170d0d1a,0x170d0d1a
+.long 0xdabfbf65,0xdabfbf65
+.long 0x31e6e6d7,0x31e6e6d7
+.long 0xc6424284,0xc6424284
+.long 0xb86868d0,0xb86868d0
+.long 0xc3414182,0xc3414182
+.long 0xb0999929,0xb0999929
+.long 0x772d2d5a,0x772d2d5a
+.long 0x110f0f1e,0x110f0f1e
+.long 0xcbb0b07b,0xcbb0b07b
+.long 0xfc5454a8,0xfc5454a8
+.long 0xd6bbbb6d,0xd6bbbb6d
+.long 0x3a16162c,0x3a16162c
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.byte 0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5
+.byte 0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76
+.byte 0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0
+.byte 0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0
+.byte 0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc
+.byte 0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15
+.byte 0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a
+.byte 0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75
+.byte 0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0
+.byte 0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84
+.byte 0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b
+.byte 0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf
+.byte 0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85
+.byte 0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8
+.byte 0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5
+.byte 0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2
+.byte 0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17
+.byte 0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73
+.byte 0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88
+.byte 0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb
+.byte 0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c
+.byte 0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79
+.byte 0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9
+.byte 0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08
+.byte 0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6
+.byte 0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a
+.byte 0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e
+.byte 0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e
+.byte 0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94
+.byte 0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf
+.byte 0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68
+.byte 0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
+.long 0x00000001, 0x00000002, 0x00000004, 0x00000008
+.long 0x00000010, 0x00000020, 0x00000040, 0x00000080
+.long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
+.long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
+.p2align 6
+L$AES_Td:
+.long 0x50a7f451,0x50a7f451
+.long 0x5365417e,0x5365417e
+.long 0xc3a4171a,0xc3a4171a
+.long 0x965e273a,0x965e273a
+.long 0xcb6bab3b,0xcb6bab3b
+.long 0xf1459d1f,0xf1459d1f
+.long 0xab58faac,0xab58faac
+.long 0x9303e34b,0x9303e34b
+.long 0x55fa3020,0x55fa3020
+.long 0xf66d76ad,0xf66d76ad
+.long 0x9176cc88,0x9176cc88
+.long 0x254c02f5,0x254c02f5
+.long 0xfcd7e54f,0xfcd7e54f
+.long 0xd7cb2ac5,0xd7cb2ac5
+.long 0x80443526,0x80443526
+.long 0x8fa362b5,0x8fa362b5
+.long 0x495ab1de,0x495ab1de
+.long 0x671bba25,0x671bba25
+.long 0x980eea45,0x980eea45
+.long 0xe1c0fe5d,0xe1c0fe5d
+.long 0x02752fc3,0x02752fc3
+.long 0x12f04c81,0x12f04c81
+.long 0xa397468d,0xa397468d
+.long 0xc6f9d36b,0xc6f9d36b
+.long 0xe75f8f03,0xe75f8f03
+.long 0x959c9215,0x959c9215
+.long 0xeb7a6dbf,0xeb7a6dbf
+.long 0xda595295,0xda595295
+.long 0x2d83bed4,0x2d83bed4
+.long 0xd3217458,0xd3217458
+.long 0x2969e049,0x2969e049
+.long 0x44c8c98e,0x44c8c98e
+.long 0x6a89c275,0x6a89c275
+.long 0x78798ef4,0x78798ef4
+.long 0x6b3e5899,0x6b3e5899
+.long 0xdd71b927,0xdd71b927
+.long 0xb64fe1be,0xb64fe1be
+.long 0x17ad88f0,0x17ad88f0
+.long 0x66ac20c9,0x66ac20c9
+.long 0xb43ace7d,0xb43ace7d
+.long 0x184adf63,0x184adf63
+.long 0x82311ae5,0x82311ae5
+.long 0x60335197,0x60335197
+.long 0x457f5362,0x457f5362
+.long 0xe07764b1,0xe07764b1
+.long 0x84ae6bbb,0x84ae6bbb
+.long 0x1ca081fe,0x1ca081fe
+.long 0x942b08f9,0x942b08f9
+.long 0x58684870,0x58684870
+.long 0x19fd458f,0x19fd458f
+.long 0x876cde94,0x876cde94
+.long 0xb7f87b52,0xb7f87b52
+.long 0x23d373ab,0x23d373ab
+.long 0xe2024b72,0xe2024b72
+.long 0x578f1fe3,0x578f1fe3
+.long 0x2aab5566,0x2aab5566
+.long 0x0728ebb2,0x0728ebb2
+.long 0x03c2b52f,0x03c2b52f
+.long 0x9a7bc586,0x9a7bc586
+.long 0xa50837d3,0xa50837d3
+.long 0xf2872830,0xf2872830
+.long 0xb2a5bf23,0xb2a5bf23
+.long 0xba6a0302,0xba6a0302
+.long 0x5c8216ed,0x5c8216ed
+.long 0x2b1ccf8a,0x2b1ccf8a
+.long 0x92b479a7,0x92b479a7
+.long 0xf0f207f3,0xf0f207f3
+.long 0xa1e2694e,0xa1e2694e
+.long 0xcdf4da65,0xcdf4da65
+.long 0xd5be0506,0xd5be0506
+.long 0x1f6234d1,0x1f6234d1
+.long 0x8afea6c4,0x8afea6c4
+.long 0x9d532e34,0x9d532e34
+.long 0xa055f3a2,0xa055f3a2
+.long 0x32e18a05,0x32e18a05
+.long 0x75ebf6a4,0x75ebf6a4
+.long 0x39ec830b,0x39ec830b
+.long 0xaaef6040,0xaaef6040
+.long 0x069f715e,0x069f715e
+.long 0x51106ebd,0x51106ebd
+.long 0xf98a213e,0xf98a213e
+.long 0x3d06dd96,0x3d06dd96
+.long 0xae053edd,0xae053edd
+.long 0x46bde64d,0x46bde64d
+.long 0xb58d5491,0xb58d5491
+.long 0x055dc471,0x055dc471
+.long 0x6fd40604,0x6fd40604
+.long 0xff155060,0xff155060
+.long 0x24fb9819,0x24fb9819
+.long 0x97e9bdd6,0x97e9bdd6
+.long 0xcc434089,0xcc434089
+.long 0x779ed967,0x779ed967
+.long 0xbd42e8b0,0xbd42e8b0
+.long 0x888b8907,0x888b8907
+.long 0x385b19e7,0x385b19e7
+.long 0xdbeec879,0xdbeec879
+.long 0x470a7ca1,0x470a7ca1
+.long 0xe90f427c,0xe90f427c
+.long 0xc91e84f8,0xc91e84f8
+.long 0x00000000,0x00000000
+.long 0x83868009,0x83868009
+.long 0x48ed2b32,0x48ed2b32
+.long 0xac70111e,0xac70111e
+.long 0x4e725a6c,0x4e725a6c
+.long 0xfbff0efd,0xfbff0efd
+.long 0x5638850f,0x5638850f
+.long 0x1ed5ae3d,0x1ed5ae3d
+.long 0x27392d36,0x27392d36
+.long 0x64d90f0a,0x64d90f0a
+.long 0x21a65c68,0x21a65c68
+.long 0xd1545b9b,0xd1545b9b
+.long 0x3a2e3624,0x3a2e3624
+.long 0xb1670a0c,0xb1670a0c
+.long 0x0fe75793,0x0fe75793
+.long 0xd296eeb4,0xd296eeb4
+.long 0x9e919b1b,0x9e919b1b
+.long 0x4fc5c080,0x4fc5c080
+.long 0xa220dc61,0xa220dc61
+.long 0x694b775a,0x694b775a
+.long 0x161a121c,0x161a121c
+.long 0x0aba93e2,0x0aba93e2
+.long 0xe52aa0c0,0xe52aa0c0
+.long 0x43e0223c,0x43e0223c
+.long 0x1d171b12,0x1d171b12
+.long 0x0b0d090e,0x0b0d090e
+.long 0xadc78bf2,0xadc78bf2
+.long 0xb9a8b62d,0xb9a8b62d
+.long 0xc8a91e14,0xc8a91e14
+.long 0x8519f157,0x8519f157
+.long 0x4c0775af,0x4c0775af
+.long 0xbbdd99ee,0xbbdd99ee
+.long 0xfd607fa3,0xfd607fa3
+.long 0x9f2601f7,0x9f2601f7
+.long 0xbcf5725c,0xbcf5725c
+.long 0xc53b6644,0xc53b6644
+.long 0x347efb5b,0x347efb5b
+.long 0x7629438b,0x7629438b
+.long 0xdcc623cb,0xdcc623cb
+.long 0x68fcedb6,0x68fcedb6
+.long 0x63f1e4b8,0x63f1e4b8
+.long 0xcadc31d7,0xcadc31d7
+.long 0x10856342,0x10856342
+.long 0x40229713,0x40229713
+.long 0x2011c684,0x2011c684
+.long 0x7d244a85,0x7d244a85
+.long 0xf83dbbd2,0xf83dbbd2
+.long 0x1132f9ae,0x1132f9ae
+.long 0x6da129c7,0x6da129c7
+.long 0x4b2f9e1d,0x4b2f9e1d
+.long 0xf330b2dc,0xf330b2dc
+.long 0xec52860d,0xec52860d
+.long 0xd0e3c177,0xd0e3c177
+.long 0x6c16b32b,0x6c16b32b
+.long 0x99b970a9,0x99b970a9
+.long 0xfa489411,0xfa489411
+.long 0x2264e947,0x2264e947
+.long 0xc48cfca8,0xc48cfca8
+.long 0x1a3ff0a0,0x1a3ff0a0
+.long 0xd82c7d56,0xd82c7d56
+.long 0xef903322,0xef903322
+.long 0xc74e4987,0xc74e4987
+.long 0xc1d138d9,0xc1d138d9
+.long 0xfea2ca8c,0xfea2ca8c
+.long 0x360bd498,0x360bd498
+.long 0xcf81f5a6,0xcf81f5a6
+.long 0x28de7aa5,0x28de7aa5
+.long 0x268eb7da,0x268eb7da
+.long 0xa4bfad3f,0xa4bfad3f
+.long 0xe49d3a2c,0xe49d3a2c
+.long 0x0d927850,0x0d927850
+.long 0x9bcc5f6a,0x9bcc5f6a
+.long 0x62467e54,0x62467e54
+.long 0xc2138df6,0xc2138df6
+.long 0xe8b8d890,0xe8b8d890
+.long 0x5ef7392e,0x5ef7392e
+.long 0xf5afc382,0xf5afc382
+.long 0xbe805d9f,0xbe805d9f
+.long 0x7c93d069,0x7c93d069
+.long 0xa92dd56f,0xa92dd56f
+.long 0xb31225cf,0xb31225cf
+.long 0x3b99acc8,0x3b99acc8
+.long 0xa77d1810,0xa77d1810
+.long 0x6e639ce8,0x6e639ce8
+.long 0x7bbb3bdb,0x7bbb3bdb
+.long 0x097826cd,0x097826cd
+.long 0xf418596e,0xf418596e
+.long 0x01b79aec,0x01b79aec
+.long 0xa89a4f83,0xa89a4f83
+.long 0x656e95e6,0x656e95e6
+.long 0x7ee6ffaa,0x7ee6ffaa
+.long 0x08cfbc21,0x08cfbc21
+.long 0xe6e815ef,0xe6e815ef
+.long 0xd99be7ba,0xd99be7ba
+.long 0xce366f4a,0xce366f4a
+.long 0xd4099fea,0xd4099fea
+.long 0xd67cb029,0xd67cb029
+.long 0xafb2a431,0xafb2a431
+.long 0x31233f2a,0x31233f2a
+.long 0x3094a5c6,0x3094a5c6
+.long 0xc066a235,0xc066a235
+.long 0x37bc4e74,0x37bc4e74
+.long 0xa6ca82fc,0xa6ca82fc
+.long 0xb0d090e0,0xb0d090e0
+.long 0x15d8a733,0x15d8a733
+.long 0x4a9804f1,0x4a9804f1
+.long 0xf7daec41,0xf7daec41
+.long 0x0e50cd7f,0x0e50cd7f
+.long 0x2ff69117,0x2ff69117
+.long 0x8dd64d76,0x8dd64d76
+.long 0x4db0ef43,0x4db0ef43
+.long 0x544daacc,0x544daacc
+.long 0xdf0496e4,0xdf0496e4
+.long 0xe3b5d19e,0xe3b5d19e
+.long 0x1b886a4c,0x1b886a4c
+.long 0xb81f2cc1,0xb81f2cc1
+.long 0x7f516546,0x7f516546
+.long 0x04ea5e9d,0x04ea5e9d
+.long 0x5d358c01,0x5d358c01
+.long 0x737487fa,0x737487fa
+.long 0x2e410bfb,0x2e410bfb
+.long 0x5a1d67b3,0x5a1d67b3
+.long 0x52d2db92,0x52d2db92
+.long 0x335610e9,0x335610e9
+.long 0x1347d66d,0x1347d66d
+.long 0x8c61d79a,0x8c61d79a
+.long 0x7a0ca137,0x7a0ca137
+.long 0x8e14f859,0x8e14f859
+.long 0x893c13eb,0x893c13eb
+.long 0xee27a9ce,0xee27a9ce
+.long 0x35c961b7,0x35c961b7
+.long 0xede51ce1,0xede51ce1
+.long 0x3cb1477a,0x3cb1477a
+.long 0x59dfd29c,0x59dfd29c
+.long 0x3f73f255,0x3f73f255
+.long 0x79ce1418,0x79ce1418
+.long 0xbf37c773,0xbf37c773
+.long 0xeacdf753,0xeacdf753
+.long 0x5baafd5f,0x5baafd5f
+.long 0x146f3ddf,0x146f3ddf
+.long 0x86db4478,0x86db4478
+.long 0x81f3afca,0x81f3afca
+.long 0x3ec468b9,0x3ec468b9
+.long 0x2c342438,0x2c342438
+.long 0x5f40a3c2,0x5f40a3c2
+.long 0x72c31d16,0x72c31d16
+.long 0x0c25e2bc,0x0c25e2bc
+.long 0x8b493c28,0x8b493c28
+.long 0x41950dff,0x41950dff
+.long 0x7101a839,0x7101a839
+.long 0xdeb30c08,0xdeb30c08
+.long 0x9ce4b4d8,0x9ce4b4d8
+.long 0x90c15664,0x90c15664
+.long 0x6184cb7b,0x6184cb7b
+.long 0x70b632d5,0x70b632d5
+.long 0x745c6c48,0x745c6c48
+.long 0x4257b8d0,0x4257b8d0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 0x52,0x09,0x6a,0xd5,0x30,0x36,0xa5,0x38
+.byte 0xbf,0x40,0xa3,0x9e,0x81,0xf3,0xd7,0xfb
+.byte 0x7c,0xe3,0x39,0x82,0x9b,0x2f,0xff,0x87
+.byte 0x34,0x8e,0x43,0x44,0xc4,0xde,0xe9,0xcb
+.byte 0x54,0x7b,0x94,0x32,0xa6,0xc2,0x23,0x3d
+.byte 0xee,0x4c,0x95,0x0b,0x42,0xfa,0xc3,0x4e
+.byte 0x08,0x2e,0xa1,0x66,0x28,0xd9,0x24,0xb2
+.byte 0x76,0x5b,0xa2,0x49,0x6d,0x8b,0xd1,0x25
+.byte 0x72,0xf8,0xf6,0x64,0x86,0x68,0x98,0x16
+.byte 0xd4,0xa4,0x5c,0xcc,0x5d,0x65,0xb6,0x92
+.byte 0x6c,0x70,0x48,0x50,0xfd,0xed,0xb9,0xda
+.byte 0x5e,0x15,0x46,0x57,0xa7,0x8d,0x9d,0x84
+.byte 0x90,0xd8,0xab,0x00,0x8c,0xbc,0xd3,0x0a
+.byte 0xf7,0xe4,0x58,0x05,0xb8,0xb3,0x45,0x06
+.byte 0xd0,0x2c,0x1e,0x8f,0xca,0x3f,0x0f,0x02
+.byte 0xc1,0xaf,0xbd,0x03,0x01,0x13,0x8a,0x6b
+.byte 0x3a,0x91,0x11,0x41,0x4f,0x67,0xdc,0xea
+.byte 0x97,0xf2,0xcf,0xce,0xf0,0xb4,0xe6,0x73
+.byte 0x96,0xac,0x74,0x22,0xe7,0xad,0x35,0x85
+.byte 0xe2,0xf9,0x37,0xe8,0x1c,0x75,0xdf,0x6e
+.byte 0x47,0xf1,0x1a,0x71,0x1d,0x29,0xc5,0x89
+.byte 0x6f,0xb7,0x62,0x0e,0xaa,0x18,0xbe,0x1b
+.byte 0xfc,0x56,0x3e,0x4b,0xc6,0xd2,0x79,0x20
+.byte 0x9a,0xdb,0xc0,0xfe,0x78,0xcd,0x5a,0xf4
+.byte 0x1f,0xdd,0xa8,0x33,0x88,0x07,0xc7,0x31
+.byte 0xb1,0x12,0x10,0x59,0x27,0x80,0xec,0x5f
+.byte 0x60,0x51,0x7f,0xa9,0x19,0xb5,0x4a,0x0d
+.byte 0x2d,0xe5,0x7a,0x9f,0x93,0xc9,0x9c,0xef
+.byte 0xa0,0xe0,0x3b,0x4d,0xae,0x2a,0xf5,0xb0
+.byte 0xc8,0xeb,0xbb,0x3c,0x83,0x53,0x99,0x61
+.byte 0x17,0x2b,0x04,0x7e,0xba,0x77,0xd6,0x26
+.byte 0xe1,0x69,0x14,0x63,0x55,0x21,0x0c,0x7d
+.long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
+.long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
+.byte 65,69,83,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+#endif
diff --git a/mac-x86_64/crypto/aes/aesni-x86_64.S b/mac-x86_64/crypto/aes/aesni-x86_64.S
new file mode 100644
index 0000000..032c94d
--- /dev/null
+++ b/mac-x86_64/crypto/aes/aesni-x86_64.S
@@ -0,0 +1,3178 @@
+#if defined(__x86_64__)
+.text
+
+.globl _aesni_encrypt
+.private_extern _aesni_encrypt
+
+.p2align 4
+_aesni_encrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+L$oop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz L$oop_enc1_1
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+
+
+.globl _aesni_decrypt
+.private_extern _aesni_decrypt
+
+.p2align 4
+_aesni_decrypt:
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+L$oop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz L$oop_dec1_2
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt2:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt3:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$enc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt3:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+L$dec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt4:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+L$enc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt4:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+L$dec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt6:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ addq $16,%rax
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups -16(%rcx,%rax,1),%xmm0
+ jmp L$enc_loop6_enter
+.p2align 4
+L$enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+L$enc_loop6_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt6:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ addq $16,%rax
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups -16(%rcx,%rax,1),%xmm0
+ jmp L$dec_loop6_enter
+.p2align 4
+L$dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+L$dec_loop6_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_encrypt8:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ addq $16,%rax
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm8
+ pxor %xmm0,%xmm9
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups -16(%rcx,%rax,1),%xmm0
+ jmp L$enc_loop8_enter
+.p2align 4
+L$enc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+L$enc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$enc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_aesni_decrypt8:
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ addq $16,%rax
+ pxor %xmm0,%xmm7
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm8
+ pxor %xmm0,%xmm9
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups -16(%rcx,%rax,1),%xmm0
+ jmp L$dec_loop8_enter
+.p2align 4
+L$dec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+L$dec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$dec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ .byte 0xf3,0xc3
+
+.globl _aesni_ecb_encrypt
+.private_extern _aesni_ecb_encrypt
+
+.p2align 4
+_aesni_ecb_encrypt:
+ andq $-16,%rdx
+ jz L$ecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz L$ecb_decrypt
+
+ cmpq $128,%rdx
+ jb L$ecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp L$ecb_enc_loop8_enter
+.p2align 4
+L$ecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+L$ecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $128,%rdx
+ jnc L$ecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz L$ecb_ret
+
+L$ecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb L$ecb_enc_one
+ movups 16(%rdi),%xmm3
+ je L$ecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb L$ecb_enc_three
+ movups 48(%rdi),%xmm5
+ je L$ecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb L$ecb_enc_five
+ movups 80(%rdi),%xmm7
+ je L$ecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_two:
+ call _aesni_encrypt2
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp L$ecb_ret
+
+.p2align 4
+L$ecb_decrypt:
+ cmpq $128,%rdx
+ jb L$ecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ jmp L$ecb_dec_loop8_enter
+.p2align 4
+L$ecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+L$ecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $128,%rdx
+ jnc L$ecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $128,%rdx
+ jz L$ecb_ret
+
+L$ecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $32,%rdx
+ jb L$ecb_dec_one
+ movups 16(%rdi),%xmm3
+ je L$ecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $64,%rdx
+ jb L$ecb_dec_three
+ movups 48(%rdi),%xmm5
+ je L$ecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $96,%rdx
+ jb L$ecb_dec_five
+ movups 80(%rdi),%xmm7
+ je L$ecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_two:
+ call _aesni_decrypt2
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp L$ecb_ret
+.p2align 4
+L$ecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+
+L$ecb_ret:
+ .byte 0xf3,0xc3
+
+.globl _aesni_ccm64_encrypt_blocks
+.private_extern _aesni_ccm64_encrypt_blocks
+
+.p2align 4
+_aesni_ccm64_encrypt_blocks:
+ movl 240(%rcx),%eax
+ movdqu (%r8),%xmm6
+ movdqa L$increment64(%rip),%xmm9
+ movdqa L$bswap_mask(%rip),%xmm7
+
+ shll $4,%eax
+ movl $16,%r10d
+ leaq 0(%rcx),%r11
+ movdqu (%r9),%xmm3
+ movdqa %xmm6,%xmm2
+ leaq 32(%rcx,%rax,1),%rcx
+.byte 102,15,56,0,247
+ subq %rax,%r10
+ jmp L$ccm64_enc_outer
+.p2align 4
+L$ccm64_enc_outer:
+ movups (%r11),%xmm0
+ movq %r10,%rax
+ movups (%rdi),%xmm8
+
+ xorps %xmm0,%xmm2
+ movups 16(%r11),%xmm1
+ xorps %xmm8,%xmm0
+ xorps %xmm0,%xmm3
+ movups 32(%r11),%xmm0
+
+L$ccm64_enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq %xmm9,%xmm6
+ decq %rdx
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+
+ leaq 16(%rdi),%rdi
+ xorps %xmm2,%xmm8
+ movdqa %xmm6,%xmm2
+ movups %xmm8,(%rsi)
+.byte 102,15,56,0,215
+ leaq 16(%rsi),%rsi
+ jnz L$ccm64_enc_outer
+
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+
+.globl _aesni_ccm64_decrypt_blocks
+.private_extern _aesni_ccm64_decrypt_blocks
+
+.p2align 4
+_aesni_ccm64_decrypt_blocks:
+ movl 240(%rcx),%eax
+ movups (%r8),%xmm6
+ movdqu (%r9),%xmm3
+ movdqa L$increment64(%rip),%xmm9
+ movdqa L$bswap_mask(%rip),%xmm7
+
+ movaps %xmm6,%xmm2
+ movl %eax,%r10d
+ movq %rcx,%r11
+.byte 102,15,56,0,247
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_5:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_5
+.byte 102,15,56,221,209
+ shll $4,%r10d
+ movl $16,%eax
+ movups (%rdi),%xmm8
+ paddq %xmm9,%xmm6
+ leaq 16(%rdi),%rdi
+ subq %r10,%rax
+ leaq 32(%r11,%r10,1),%rcx
+ movq %rax,%r10
+ jmp L$ccm64_dec_outer
+.p2align 4
+L$ccm64_dec_outer:
+ xorps %xmm2,%xmm8
+ movdqa %xmm6,%xmm2
+ movups %xmm8,(%rsi)
+ leaq 16(%rsi),%rsi
+.byte 102,15,56,0,215
+
+ subq $1,%rdx
+ jz L$ccm64_dec_break
+
+ movups (%r11),%xmm0
+ movq %r10,%rax
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ xorps %xmm0,%xmm2
+ xorps %xmm8,%xmm3
+ movups 32(%r11),%xmm0
+ jmp L$ccm64_dec2_loop
+.p2align 4
+L$ccm64_dec2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ccm64_dec2_loop
+ movups (%rdi),%xmm8
+ paddq %xmm9,%xmm6
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leaq 16(%rdi),%rdi
+ jmp L$ccm64_dec_outer
+
+.p2align 4
+L$ccm64_dec_break:
+
+ movl 240(%r11),%eax
+ movups (%r11),%xmm0
+ movups 16(%r11),%xmm1
+ xorps %xmm0,%xmm8
+ leaq 32(%r11),%r11
+ xorps %xmm8,%xmm3
+L$oop_enc1_6:
+.byte 102,15,56,220,217
+ decl %eax
+ movups (%r11),%xmm1
+ leaq 16(%r11),%r11
+ jnz L$oop_enc1_6
+.byte 102,15,56,221,217
+ movups %xmm3,(%r9)
+ .byte 0xf3,0xc3
+
+.globl _aesni_ctr32_encrypt_blocks
+.private_extern _aesni_ctr32_encrypt_blocks
+
+.p2align 4
+_aesni_ctr32_encrypt_blocks:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $128,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+
+ cmpq $1,%rdx
+ je L$ctr32_one_shortcut
+
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%r11d
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %r11d,%eax
+ xorl %r11d,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %r11d,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %r11d,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
+ xorl %r11d,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %r11d,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ movl _OPENSSL_ia32cap_P+4(%rip),%r10d
+ xorl %r11d,%r9d
+ andl $71303168,%r10d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
+ jb L$ctr32_tail
+
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je L$ctr32_6x
+
+ leaq 128(%rcx),%rcx
+ subq $2,%rdx
+ jmp L$ctr32_loop8
+
+.p2align 4
+L$ctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %r11d
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp L$ctr32_loop6
+
+.p2align 4
+L$ctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %r11d,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %r11d,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %r11d,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call L$enc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc L$ctr32_loop6
+
+ addq $6,%rdx
+ jz L$ctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp L$ctr32_tail
+
+.p2align 5
+L$ctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
+.byte 102,15,56,220,209
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
+.byte 102,15,56,220,217
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
+.byte 102,15,56,220,225
+ xorl %r11d,%r9d
+ nop
+.byte 102,15,56,220,233
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %r11d,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %r11d,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb L$ctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je L$ctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp L$ctr32_enc_done
+
+.p2align 4
+L$ctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc L$ctr32_loop8
+
+ addq $8,%rdx
+ jz L$ctr32_done
+ leaq -128(%rcx),%rcx
+
+L$ctr32_tail:
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb L$ctr32_loop3
+ je L$ctr32_loop4
+
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
+
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+
+ call L$enc_loop8_enter
+
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb L$ctr32_done
+
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je L$ctr32_done
+
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz L$ctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp L$ctr32_done
+
+.p2align 5
+L$ctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz L$ctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb L$ctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je L$ctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
+ jmp L$ctr32_done
+
+.p2align 4
+L$ctr32_one_shortcut:
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm10
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_7:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_7
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ jmp L$ctr32_done
+
+.p2align 4
+L$ctr32_done:
+ leaq (%rbp),%rsp
+ popq %rbp
+L$ctr32_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _aesni_xts_encrypt
+.private_extern _aesni_xts_encrypt
+
+.p2align 4
+_aesni_xts_encrypt:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm2
+L$oop_enc1_8:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz L$oop_enc1_8
+.byte 102,15,56,221,209
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
+ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movups 16(%rcx,%r10,1),%xmm1
+
+ movdqa L$xts_magic(%rip),%xmm8
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
+ subq $96,%rdx
+ jc L$xts_enc_short
+
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
+ jmp L$xts_enc_grandloop
+
+.p2align 5
+L$xts_enc_grandloop:
+ movdqu 0(%rdi),%xmm2
+ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm10,%xmm2
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+.byte 102,15,56,220,209
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+.byte 102,15,56,220,217
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+.byte 102,15,56,220,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+.byte 102,15,56,220,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
+
+ pxor %xmm9,%xmm10
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,220,208
+ pxor %xmm9,%xmm13
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,220,216
+ pxor %xmm9,%xmm14
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp L$xts_enc_loop6
+.p2align 5
+L$xts_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -80(%rcx,%rax,1),%xmm0
+ jnz L$xts_enc_loop6
+
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
+.byte 102,15,56,220,217
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
+.byte 102,15,56,220,249
+ movups -64(%rcx),%xmm1
+
+ movdqa %xmm9,%xmm14
+.byte 102,15,56,220,208
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
+.byte 102,15,56,220,216
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+.byte 102,15,56,220,248
+ movups -48(%rcx),%xmm0
+
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
+.byte 102,15,56,220,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
+.byte 102,15,56,220,249
+ movups -32(%rcx),%xmm1
+
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,220,216
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,220,248
+
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,220,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,220,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,221,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,221,92,36,16
+.byte 102,15,56,221,100,36,32
+ pand %xmm8,%xmm9
+ movq %r10,%rax
+.byte 102,15,56,221,108,36,48
+.byte 102,15,56,221,116,36,64
+.byte 102,15,56,221,124,36,80
+ pxor %xmm9,%xmm15
+
+ leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc L$xts_enc_grandloop
+
+ movl $16+96,%eax
+ subl %r10d,%eax
+ movq %r11,%rcx
+ shrl $4,%eax
+
+L$xts_enc_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ addq $96,%rdx
+ jz L$xts_enc_done
+
+ pxor %xmm0,%xmm11
+ cmpq $32,%rdx
+ jb L$xts_enc_one
+ pxor %xmm0,%xmm12
+ je L$xts_enc_two
+
+ pxor %xmm0,%xmm13
+ cmpq $64,%rdx
+ jb L$xts_enc_three
+ pxor %xmm0,%xmm14
+ je L$xts_enc_four
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_encrypt6
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm15,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ jmp L$xts_enc_done
+
+.p2align 4
+L$xts_enc_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_9:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_9
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ jmp L$xts_enc_done
+
+.p2align 4
+L$xts_enc_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_encrypt2
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp L$xts_enc_done
+
+.p2align 4
+L$xts_enc_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_encrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp L$xts_enc_done
+
+.p2align 4
+L$xts_enc_four:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_encrypt4
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp L$xts_enc_done
+
+.p2align 4
+L$xts_enc_done:
+ andq $15,%r9
+ jz L$xts_enc_ret
+ movq %r9,%rdx
+
+L$xts_enc_steal:
+ movzbl (%rdi),%eax
+ movzbl -16(%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,-16(%rsi)
+ movb %cl,0(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz L$xts_enc_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups -16(%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_enc1_10:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_10
+.byte 102,15,56,221,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,-16(%rsi)
+
+L$xts_enc_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
+L$xts_enc_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _aesni_xts_decrypt
+.private_extern _aesni_xts_decrypt
+
+.p2align 4
+_aesni_xts_decrypt:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $112,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r9),%xmm2
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+ movups (%r8),%xmm0
+ movups 16(%r8),%xmm1
+ leaq 32(%r8),%r8
+ xorps %xmm0,%xmm2
+L$oop_enc1_11:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%r8),%xmm1
+ leaq 16(%r8),%r8
+ jnz L$oop_enc1_11
+.byte 102,15,56,221,209
+ xorl %eax,%eax
+ testq $15,%rdx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%rdx
+
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
+ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
+ movups 16(%rcx,%r10,1),%xmm1
+
+ movdqa L$xts_magic(%rip),%xmm8
+ movdqa %xmm2,%xmm15
+ pshufd $95,%xmm2,%xmm9
+ pxor %xmm0,%xmm1
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm10
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm11
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm12
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+ pxor %xmm0,%xmm13
+ pxor %xmm14,%xmm15
+ movdqa %xmm15,%xmm14
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
+ movaps %xmm1,96(%rsp)
+
+ subq $96,%rdx
+ jc L$xts_dec_short
+
+ movl $16+96,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ subq %r10,%rax
+ movups 16(%r11),%xmm1
+ movq %rax,%r10
+ leaq L$xts_magic(%rip),%r8
+ jmp L$xts_dec_grandloop
+
+.p2align 5
+L$xts_dec_grandloop:
+ movdqu 0(%rdi),%xmm2
+ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+ pxor %xmm10,%xmm2
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+.byte 102,15,56,222,209
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+.byte 102,15,56,222,217
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+.byte 102,15,56,222,225
+ movdqu 80(%rdi),%xmm7
+ pxor %xmm15,%xmm8
+ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+.byte 102,15,56,222,233
+ movups 32(%r11),%xmm0
+ leaq 96(%rdi),%rdi
+ pxor %xmm8,%xmm7
+
+ pxor %xmm9,%xmm10
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm12
+
+.byte 102,15,56,222,208
+ pxor %xmm9,%xmm13
+ movdqa %xmm11,16(%rsp)
+.byte 102,15,56,222,216
+ pxor %xmm9,%xmm14
+ movdqa %xmm12,32(%rsp)
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ movdqa %xmm8,80(%rsp)
+ pshufd $95,%xmm15,%xmm9
+ jmp L$xts_dec_loop6
+.p2align 5
+L$xts_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups -64(%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -80(%rcx,%rax,1),%xmm0
+ jnz L$xts_dec_loop6
+
+ movdqa (%r8),%xmm8
+ movdqa %xmm9,%xmm14
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ paddq %xmm15,%xmm15
+ psrad $31,%xmm14
+.byte 102,15,56,222,217
+ pand %xmm8,%xmm14
+ movups (%r11),%xmm10
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+ pxor %xmm14,%xmm15
+ movaps %xmm10,%xmm11
+.byte 102,15,56,222,249
+ movups -64(%rcx),%xmm1
+
+ movdqa %xmm9,%xmm14
+.byte 102,15,56,222,208
+ paddd %xmm9,%xmm9
+ pxor %xmm15,%xmm10
+.byte 102,15,56,222,216
+ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ pand %xmm8,%xmm14
+ movaps %xmm11,%xmm12
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movdqa %xmm9,%xmm14
+.byte 102,15,56,222,248
+ movups -48(%rcx),%xmm0
+
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm11
+ psrad $31,%xmm14
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movdqa %xmm13,48(%rsp)
+ pxor %xmm14,%xmm15
+.byte 102,15,56,222,241
+ movaps %xmm12,%xmm13
+ movdqa %xmm9,%xmm14
+.byte 102,15,56,222,249
+ movups -32(%rcx),%xmm1
+
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,208
+ pxor %xmm15,%xmm12
+ psrad $31,%xmm14
+.byte 102,15,56,222,216
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm14
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+ pxor %xmm14,%xmm15
+ movaps %xmm13,%xmm14
+.byte 102,15,56,222,248
+
+ movdqa %xmm9,%xmm0
+ paddd %xmm9,%xmm9
+.byte 102,15,56,222,209
+ pxor %xmm15,%xmm13
+ psrad $31,%xmm0
+.byte 102,15,56,222,217
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm15
+ movups (%r11),%xmm0
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+
+ pxor %xmm15,%xmm14
+.byte 102,15,56,223,84,36,0
+ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+.byte 102,15,56,223,92,36,16
+.byte 102,15,56,223,100,36,32
+ pand %xmm8,%xmm9
+ movq %r10,%rax
+.byte 102,15,56,223,108,36,48
+.byte 102,15,56,223,116,36,64
+.byte 102,15,56,223,124,36,80
+ pxor %xmm9,%xmm15
+
+ leaq 96(%rsi),%rsi
+ movups %xmm2,-96(%rsi)
+ movups %xmm3,-80(%rsi)
+ movups %xmm4,-64(%rsi)
+ movups %xmm5,-48(%rsi)
+ movups %xmm6,-32(%rsi)
+ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc L$xts_dec_grandloop
+
+ movl $16+96,%eax
+ subl %r10d,%eax
+ movq %r11,%rcx
+ shrl $4,%eax
+
+L$xts_dec_short:
+ movl %eax,%r10d
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+ addq $96,%rdx
+ jz L$xts_dec_done
+
+ pxor %xmm0,%xmm12
+ cmpq $32,%rdx
+ jb L$xts_dec_one
+ pxor %xmm0,%xmm13
+ je L$xts_dec_two
+
+ pxor %xmm0,%xmm14
+ cmpq $64,%rdx
+ jb L$xts_dec_three
+ je L$xts_dec_four
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+ pxor %xmm11,%xmm3
+ movdqu 64(%rdi),%xmm6
+ leaq 80(%rdi),%rdi
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm6
+
+ call _aesni_decrypt6
+
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ xorps %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ xorps %xmm14,%xmm6
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm14
+ movdqu %xmm5,48(%rsi)
+ pcmpgtd %xmm15,%xmm14
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ pshufd $19,%xmm14,%xmm11
+ andq $15,%r9
+ jz L$xts_dec_ret
+
+ movdqa %xmm15,%xmm10
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm11
+ pxor %xmm15,%xmm11
+ jmp L$xts_dec_done2
+
+.p2align 4
+L$xts_dec_one:
+ movups (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_12:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_12
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movdqa %xmm11,%xmm10
+ movups %xmm2,(%rsi)
+ movdqa %xmm12,%xmm11
+ leaq 16(%rsi),%rsi
+ jmp L$xts_dec_done
+
+.p2align 4
+L$xts_dec_two:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ leaq 32(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+
+ call _aesni_decrypt2
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm12,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm13,%xmm11
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ leaq 32(%rsi),%rsi
+ jmp L$xts_dec_done
+
+.p2align 4
+L$xts_dec_three:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ leaq 48(%rdi),%rdi
+ xorps %xmm10,%xmm2
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+
+ call _aesni_decrypt3
+
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+ movdqa %xmm14,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ leaq 48(%rsi),%rsi
+ jmp L$xts_dec_done
+
+.p2align 4
+L$xts_dec_four:
+ movups (%rdi),%xmm2
+ movups 16(%rdi),%xmm3
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+ leaq 64(%rdi),%rdi
+ xorps %xmm11,%xmm3
+ xorps %xmm12,%xmm4
+ xorps %xmm13,%xmm5
+
+ call _aesni_decrypt4
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+ pxor %xmm12,%xmm4
+ movdqu %xmm2,(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm3,16(%rsi)
+ movdqu %xmm4,32(%rsi)
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp L$xts_dec_done
+
+.p2align 4
+L$xts_dec_done:
+ andq $15,%r9
+ jz L$xts_dec_ret
+L$xts_dec_done2:
+ movq %r9,%rdx
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rdi),%xmm2
+ xorps %xmm11,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_13:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_13
+.byte 102,15,56,223,209
+ xorps %xmm11,%xmm2
+ movups %xmm2,(%rsi)
+
+L$xts_dec_steal:
+ movzbl 16(%rdi),%eax
+ movzbl (%rsi),%ecx
+ leaq 1(%rdi),%rdi
+ movb %al,(%rsi)
+ movb %cl,16(%rsi)
+ leaq 1(%rsi),%rsi
+ subq $1,%rdx
+ jnz L$xts_dec_steal
+
+ subq %r9,%rsi
+ movq %r11,%rcx
+ movl %r10d,%eax
+
+ movups (%rsi),%xmm2
+ xorps %xmm10,%xmm2
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_14:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_14
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+
+L$xts_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
+L$xts_dec_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _aesni_cbc_encrypt
+.private_extern _aesni_cbc_encrypt
+
+.p2align 4
+_aesni_cbc_encrypt:
+ testq %rdx,%rdx
+ jz L$cbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz L$cbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb L$cbc_enc_tail
+ subq $16,%rdx
+ jmp L$cbc_enc_loop
+.p2align 4
+L$cbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+L$oop_enc1_15:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_15
+.byte 102,15,56,221,209
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc L$cbc_enc_loop
+ addq $16,%rdx
+ jnz L$cbc_enc_tail
+ movups %xmm2,(%r8)
+ jmp L$cbc_ret
+
+L$cbc_enc_tail:
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long 0x9066A4F3
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long 0x9066AAF3
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp L$cbc_enc_loop
+
+.p2align 4
+L$cbc_decrypt:
+ leaq (%rsp),%rax
+ pushq %rbp
+ subq $16,%rsp
+ andq $-16,%rsp
+ leaq -8(%rax),%rbp
+ movups (%r8),%xmm10
+ movl %r10d,%eax
+ cmpq $80,%rdx
+ jbe L$cbc_dec_tail
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ movl _OPENSSL_ia32cap_P+4(%rip),%r9d
+ cmpq $112,%rdx
+ jbe L$cbc_dec_six_or_seven
+
+ andl $71303168,%r9d
+ subq $80,%rdx
+ cmpl $4194304,%r9d
+ je L$cbc_dec_loop6_enter
+ subq $32,%rdx
+ leaq 112(%rcx),%rcx
+ jmp L$cbc_dec_loop8_enter
+.p2align 4
+L$cbc_dec_loop8:
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+L$cbc_dec_loop8_enter:
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ xorq %r11,%r11
+ cmpq $112,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+ setnc %r11b
+ shlq $7,%r11
+.byte 102,68,15,56,222,201
+ addq %rdi,%r11
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je L$cbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%r11),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%r11),%xmm12
+ movdqu 32(%r11),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%r11),%xmm14
+ movdqu 64(%r11),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%r11),%xmm1
+ movups -112(%rcx),%xmm0
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+
+ subq $128,%rdx
+ ja L$cbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ leaq -112(%rcx),%rcx
+ addq $112,%rdx
+ jle L$cbc_dec_tail_collected
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ cmpq $80,%rdx
+ jbe L$cbc_dec_tail
+
+ movaps %xmm11,%xmm2
+L$cbc_dec_six_or_seven:
+ cmpq $96,%rdx
+ ja L$cbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+L$cbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %r11,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $96,%rdx
+ ja L$cbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $80,%rdx
+ jle L$cbc_dec_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
+L$cbc_dec_tail:
+ movups (%rdi),%xmm2
+ subq $16,%rdx
+ jbe L$cbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm2,%xmm11
+ subq $16,%rdx
+ jbe L$cbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm3,%xmm12
+ subq $16,%rdx
+ jbe L$cbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ movaps %xmm4,%xmm13
+ subq $16,%rdx
+ jbe L$cbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ subq $16,%rdx
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_one:
+ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+L$oop_dec1_16:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_16
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
+ jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_two:
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+ jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_three:
+ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ movdqa %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+ jmp L$cbc_dec_tail_collected
+.p2align 4
+L$cbc_dec_four:
+ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ movdqa %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+ jmp L$cbc_dec_tail_collected
+
+.p2align 4
+L$cbc_dec_tail_collected:
+ movups %xmm10,(%r8)
+ andq $15,%rdx
+ jnz L$cbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp L$cbc_dec_ret
+.p2align 4
+L$cbc_dec_tail_partial:
+ movaps %xmm2,(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
+
+L$cbc_dec_ret:
+ leaq (%rbp),%rsp
+ popq %rbp
+L$cbc_ret:
+ .byte 0xf3,0xc3
+
+.globl _aesni_set_decrypt_key
+.private_extern _aesni_set_decrypt_key
+
+.p2align 4
+_aesni_set_decrypt_key:
+.byte 0x48,0x83,0xEC,0x08
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz L$dec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
+L$dec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja L$dec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%rdi)
+L$dec_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+L$SEH_end_set_decrypt_key:
+
+.globl _aesni_set_encrypt_key
+.private_extern _aesni_set_encrypt_key
+
+.p2align 4
+_aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
+.byte 0x48,0x83,0xEC,0x08
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz L$enc_key_ret
+ testq %rdx,%rdx
+ jz L$enc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je L$14rounds
+ cmpl $192,%esi
+ je L$12rounds
+ cmpl $128,%esi
+ jne L$bad_keybits
+
+L$10rounds:
+ movl $9,%esi
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,200,1
+ call L$key_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call L$key_expansion_128
+.byte 102,15,58,223,200,4
+ call L$key_expansion_128
+.byte 102,15,58,223,200,8
+ call L$key_expansion_128
+.byte 102,15,58,223,200,16
+ call L$key_expansion_128
+.byte 102,15,58,223,200,32
+ call L$key_expansion_128
+.byte 102,15,58,223,200,64
+ call L$key_expansion_128
+.byte 102,15,58,223,200,128
+ call L$key_expansion_128
+.byte 102,15,58,223,200,27
+ call L$key_expansion_128
+.byte 102,15,58,223,200,54
+ call L$key_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,202,1
+ call L$key_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call L$key_expansion_192b
+.byte 102,15,58,223,202,4
+ call L$key_expansion_192a
+.byte 102,15,58,223,202,8
+ call L$key_expansion_192b
+.byte 102,15,58,223,202,16
+ call L$key_expansion_192a
+.byte 102,15,58,223,202,32
+ call L$key_expansion_192b
+.byte 102,15,58,223,202,64
+ call L$key_expansion_192a
+.byte 102,15,58,223,202,128
+ call L$key_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call L$key_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,2
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,2
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,4
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,4
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,8
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,8
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,16
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,16
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,32
+ call L$key_expansion_256a
+.byte 102,15,58,223,200,32
+ call L$key_expansion_256b
+.byte 102,15,58,223,202,64
+ call L$key_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp L$enc_key_ret
+
+.p2align 4
+L$bad_keybits:
+ movq $-2,%rax
+L$enc_key_ret:
+ addq $8,%rsp
+ .byte 0xf3,0xc3
+L$SEH_end_set_encrypt_key:
+
+.p2align 4
+L$key_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+L$key_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.p2align 4
+L$key_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+L$key_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+L$key_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ .byte 0xf3,0xc3
+
+.p2align 4
+L$key_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp L$key_expansion_192b_warm
+
+.p2align 4
+L$key_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+L$key_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.p2align 4
+L$key_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ .byte 0xf3,0xc3
+
+
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$increment32:
+.long 6,6,6,0
+L$increment64:
+.long 1,0,0,0
+L$xts_magic:
+.long 0x87,0,1,0
+L$increment1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+#endif
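The hunk above ends with the AES-NI key-schedule entry points. `_aesni_set_encrypt_key` validates its arguments (returning -1 for a NULL key or schedule pointer and -2 for a key size other than 128, 192 or 256 bits), expands the round keys with `aeskeygenassist` (the `.byte 102,15,58,223,...` sequences), and returns 0 on success; `_aesni_set_decrypt_key` runs the same expansion and then inverts the schedule in place with `aesimc` (`.byte 102,15,56,219,...`). A minimal C-level caller is sketched below; the C prototypes and the `AES_KEY` layout are assumptions in the usual OpenSSL style and are not stated anywhere in this diff.

    /* Hedged sketch only: the prototypes and AES_KEY layout below are assumed,
     * not taken from this diff.  Link against the assembly above to run it. */
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint32_t rd_key[60];   /* expanded round keys, 240 bytes */
        int rounds;            /* word the key setup writes at byte offset 240 */
    } AES_KEY;

    /* Exported by the assembly above (assumed C prototypes). */
    int aesni_set_encrypt_key(const uint8_t *userKey, int bits, AES_KEY *key);
    int aesni_set_decrypt_key(const uint8_t *userKey, int bits, AES_KEY *key);

    int main(void) {
        static const uint8_t k[16] = {0};        /* demo 128-bit key */
        AES_KEY enc, dec;
        int rc1 = aesni_set_encrypt_key(k, 128, &enc);
        int rc2 = aesni_set_decrypt_key(k, 128, &dec);
        /* Return codes visible in the asm: 0 ok, -1 NULL argument, -2 bad key size. */
        printf("set_encrypt_key=%d set_decrypt_key=%d\n", rc1, rc2);
        return (rc1 | rc2) != 0;
    }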
diff --git a/mac-x86_64/crypto/aes/bsaes-x86_64.S b/mac-x86_64/crypto/aes/bsaes-x86_64.S
new file mode 100644
index 0000000..c2d0477
--- /dev/null
+++ b/mac-x86_64/crypto/aes/bsaes-x86_64.S
@@ -0,0 +1,2504 @@
+#if defined(__x86_64__)
+.text
+
+
+
+
+
+.p2align 6
+_bsaes_encrypt8:
+ leaq L$BS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa 80(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm8,%xmm2
+.byte 102,68,15,56,0,255
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm3
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm5
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,223
+.byte 102,15,56,0,231
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+_bsaes_encrypt8_bitslice:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp L$enc_sbox
+.p2align 4
+L$enc_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+ pxor 32(%rax),%xmm1
+ pxor 48(%rax),%xmm2
+.byte 102,68,15,56,0,255
+.byte 102,15,56,0,199
+ pxor 64(%rax),%xmm3
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ pxor 96(%rax),%xmm5
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,223
+.byte 102,15,56,0,231
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ leaq 128(%rax),%rax
+L$enc_sbox:
+ pxor %xmm5,%xmm4
+ pxor %xmm0,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm1,%xmm5
+ pxor %xmm15,%xmm4
+
+ pxor %xmm2,%xmm5
+ pxor %xmm6,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm3,%xmm2
+ pxor %xmm4,%xmm3
+ pxor %xmm0,%xmm2
+
+ pxor %xmm6,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm6,%xmm10
+ movdqa %xmm0,%xmm9
+ movdqa %xmm4,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm11
+
+ pxor %xmm3,%xmm10
+ pxor %xmm1,%xmm9
+ pxor %xmm2,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm3,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm15,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm2,%xmm11
+ pxor %xmm15,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm12
+ movdqa %xmm4,%xmm11
+ pxor %xmm0,%xmm12
+ pxor %xmm5,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm1,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm3,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm0,%xmm13
+ pand %xmm2,%xmm11
+ movdqa %xmm6,%xmm14
+ pand %xmm15,%xmm12
+ pand %xmm4,%xmm13
+ por %xmm5,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm5,%xmm11
+ movdqa %xmm4,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm5,%xmm9
+ pxor %xmm4,%xmm5
+ pand %xmm14,%xmm4
+ pand %xmm13,%xmm5
+ pxor %xmm4,%xmm5
+ pxor %xmm9,%xmm4
+ pxor %xmm15,%xmm11
+ pxor %xmm2,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm2,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm2
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm2,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm11,%xmm5
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm2
+
+ movdqa %xmm6,%xmm11
+ movdqa %xmm0,%xmm7
+ pxor %xmm3,%xmm11
+ pxor %xmm1,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm3,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm1,%xmm3
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm1
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm3
+ pxor %xmm11,%xmm7
+ pxor %xmm1,%xmm3
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm1
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm6,%xmm10
+ pxor %xmm0,%xmm6
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm6
+ pxor %xmm0,%xmm6
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm6
+ pxor %xmm11,%xmm3
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm1
+ pxor %xmm15,%xmm6
+ pxor %xmm5,%xmm0
+ pxor %xmm6,%xmm3
+ pxor %xmm15,%xmm5
+ pxor %xmm0,%xmm15
+
+ pxor %xmm4,%xmm0
+ pxor %xmm1,%xmm4
+ pxor %xmm2,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm4,%xmm3
+
+ pxor %xmm2,%xmm5
+ decl %r10d
+ jl L$enc_done
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm3,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm5,%xmm10
+ pxor %xmm9,%xmm3
+ pshufd $147,%xmm2,%xmm11
+ pxor %xmm10,%xmm5
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm2
+ pshufd $147,%xmm1,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm1
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm2,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm5,%xmm11
+ pshufd $78,%xmm2,%xmm7
+ pxor %xmm1,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm3,%xmm10
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm1,%xmm5
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm12,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm14,%xmm6
+ pxor %xmm13,%xmm5
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm1
+ movdqa %xmm8,%xmm4
+ movdqa 48(%r11),%xmm7
+ jnz L$enc_loop
+ movdqa 64(%r11),%xmm7
+ jmp L$enc_loop
+.p2align 4
+L$enc_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm2,%xmm10
+ psrlq $1,%xmm2
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm2
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm2
+ pxor %xmm1,%xmm4
+ psllq $1,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $1,%xmm2
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm2
+ movdqa %xmm3,%xmm9
+ psrlq $1,%xmm3
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm5,%xmm3
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm15
+ pxor %xmm3,%xmm5
+ psllq $1,%xmm3
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm2,%xmm10
+ psrlq $2,%xmm2
+ pxor %xmm4,%xmm6
+ pxor %xmm1,%xmm2
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm2
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm2,%xmm1
+ psllq $2,%xmm2
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm2
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm5,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm5
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm5,%xmm9
+ psrlq $4,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $4,%xmm3
+ pxor %xmm4,%xmm5
+ pxor %xmm1,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm4
+ psllq $4,%xmm5
+ pxor %xmm3,%xmm1
+ psllq $4,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm2,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm2
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+
+
+
+.p2align 6
+_bsaes_decrypt8:
+ leaq L$BS0(%rip),%r11
+
+ movdqa (%rax),%xmm8
+ leaq 16(%rax),%rax
+ movdqa -48(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm8,%xmm2
+.byte 102,68,15,56,0,255
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm3
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm5
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,223
+.byte 102,15,56,0,231
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm3,%xmm10
+ psrlq $1,%xmm3
+ pxor %xmm6,%xmm5
+ pxor %xmm4,%xmm3
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm3
+ pxor %xmm5,%xmm6
+ psllq $1,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $1,%xmm3
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm3
+ movdqa %xmm1,%xmm9
+ psrlq $1,%xmm1
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm2,%xmm1
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm1
+ pand %xmm7,%xmm15
+ pxor %xmm1,%xmm2
+ psllq $1,%xmm1
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm1
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm4,%xmm9
+ psrlq $2,%xmm4
+ movdqa %xmm3,%xmm10
+ psrlq $2,%xmm3
+ pxor %xmm6,%xmm4
+ pxor %xmm5,%xmm3
+ pand %xmm8,%xmm4
+ pand %xmm8,%xmm3
+ pxor %xmm4,%xmm6
+ psllq $2,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $2,%xmm3
+ pxor %xmm9,%xmm4
+ pxor %xmm10,%xmm3
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm2,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm2
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm2,%xmm9
+ psrlq $4,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $4,%xmm1
+ pxor %xmm6,%xmm2
+ pxor %xmm5,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm6
+ psllq $4,%xmm2
+ pxor %xmm1,%xmm5
+ psllq $4,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm4
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm3
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ decl %r10d
+ jmp L$dec_sbox
+.p2align 4
+L$dec_loop:
+ pxor 0(%rax),%xmm15
+ pxor 16(%rax),%xmm0
+ pxor 32(%rax),%xmm1
+ pxor 48(%rax),%xmm2
+.byte 102,68,15,56,0,255
+.byte 102,15,56,0,199
+ pxor 64(%rax),%xmm3
+ pxor 80(%rax),%xmm4
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ pxor 96(%rax),%xmm5
+ pxor 112(%rax),%xmm6
+.byte 102,15,56,0,223
+.byte 102,15,56,0,231
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ leaq 128(%rax),%rax
+L$dec_sbox:
+ pxor %xmm3,%xmm2
+
+ pxor %xmm6,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm3,%xmm5
+ pxor %xmm5,%xmm6
+ pxor %xmm6,%xmm0
+
+ pxor %xmm0,%xmm15
+ pxor %xmm4,%xmm1
+ pxor %xmm15,%xmm2
+ pxor %xmm15,%xmm4
+ pxor %xmm2,%xmm0
+ movdqa %xmm2,%xmm10
+ movdqa %xmm6,%xmm9
+ movdqa %xmm0,%xmm8
+ movdqa %xmm3,%xmm12
+ movdqa %xmm4,%xmm11
+
+ pxor %xmm15,%xmm10
+ pxor %xmm3,%xmm9
+ pxor %xmm5,%xmm8
+ movdqa %xmm10,%xmm13
+ pxor %xmm15,%xmm12
+ movdqa %xmm9,%xmm7
+ pxor %xmm1,%xmm11
+ movdqa %xmm10,%xmm14
+
+ por %xmm8,%xmm9
+ por %xmm11,%xmm10
+ pxor %xmm7,%xmm14
+ pand %xmm11,%xmm13
+ pxor %xmm8,%xmm11
+ pand %xmm8,%xmm7
+ pand %xmm11,%xmm14
+ movdqa %xmm5,%xmm11
+ pxor %xmm1,%xmm11
+ pand %xmm11,%xmm12
+ pxor %xmm12,%xmm10
+ pxor %xmm12,%xmm9
+ movdqa %xmm2,%xmm12
+ movdqa %xmm0,%xmm11
+ pxor %xmm6,%xmm12
+ pxor %xmm4,%xmm11
+ movdqa %xmm12,%xmm8
+ pand %xmm11,%xmm12
+ por %xmm11,%xmm8
+ pxor %xmm12,%xmm7
+ pxor %xmm14,%xmm10
+ pxor %xmm13,%xmm9
+ pxor %xmm14,%xmm8
+ movdqa %xmm3,%xmm11
+ pxor %xmm13,%xmm7
+ movdqa %xmm15,%xmm12
+ pxor %xmm13,%xmm8
+ movdqa %xmm6,%xmm13
+ pand %xmm5,%xmm11
+ movdqa %xmm2,%xmm14
+ pand %xmm1,%xmm12
+ pand %xmm0,%xmm13
+ por %xmm4,%xmm14
+ pxor %xmm11,%xmm10
+ pxor %xmm12,%xmm9
+ pxor %xmm13,%xmm8
+ pxor %xmm14,%xmm7
+
+
+
+
+
+ movdqa %xmm10,%xmm11
+ pand %xmm8,%xmm10
+ pxor %xmm9,%xmm11
+
+ movdqa %xmm7,%xmm13
+ movdqa %xmm11,%xmm14
+ pxor %xmm10,%xmm13
+ pand %xmm13,%xmm14
+
+ movdqa %xmm8,%xmm12
+ pxor %xmm9,%xmm14
+ pxor %xmm7,%xmm12
+
+ pxor %xmm9,%xmm10
+
+ pand %xmm10,%xmm12
+
+ movdqa %xmm13,%xmm9
+ pxor %xmm7,%xmm12
+
+ pxor %xmm12,%xmm9
+ pxor %xmm12,%xmm8
+
+ pand %xmm7,%xmm9
+
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm8
+
+ pand %xmm14,%xmm13
+
+ pxor %xmm11,%xmm13
+ movdqa %xmm4,%xmm11
+ movdqa %xmm0,%xmm7
+ movdqa %xmm14,%xmm9
+ pxor %xmm13,%xmm9
+ pand %xmm4,%xmm9
+ pxor %xmm0,%xmm4
+ pand %xmm14,%xmm0
+ pand %xmm13,%xmm4
+ pxor %xmm0,%xmm4
+ pxor %xmm9,%xmm0
+ pxor %xmm1,%xmm11
+ pxor %xmm5,%xmm7
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm1,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm5,%xmm1
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm5
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm1
+ pxor %xmm11,%xmm7
+ pxor %xmm5,%xmm1
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm5
+ pxor %xmm11,%xmm4
+ pxor %xmm11,%xmm1
+ pxor %xmm7,%xmm0
+ pxor %xmm7,%xmm5
+
+ movdqa %xmm2,%xmm11
+ movdqa %xmm6,%xmm7
+ pxor %xmm15,%xmm11
+ pxor %xmm3,%xmm7
+ movdqa %xmm14,%xmm10
+ movdqa %xmm12,%xmm9
+ pxor %xmm13,%xmm10
+ pxor %xmm8,%xmm9
+ pand %xmm11,%xmm10
+ pand %xmm15,%xmm9
+ pxor %xmm7,%xmm11
+ pxor %xmm3,%xmm15
+ pand %xmm14,%xmm7
+ pand %xmm12,%xmm3
+ pand %xmm13,%xmm11
+ pand %xmm8,%xmm15
+ pxor %xmm11,%xmm7
+ pxor %xmm3,%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm9,%xmm3
+ pxor %xmm12,%xmm14
+ pxor %xmm8,%xmm13
+ movdqa %xmm14,%xmm10
+ pxor %xmm13,%xmm10
+ pand %xmm2,%xmm10
+ pxor %xmm6,%xmm2
+ pand %xmm14,%xmm6
+ pand %xmm13,%xmm2
+ pxor %xmm6,%xmm2
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm11,%xmm15
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm3
+ pxor %xmm6,%xmm0
+ pxor %xmm4,%xmm5
+
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm1
+ pxor %xmm6,%xmm4
+ pxor %xmm1,%xmm3
+ pxor %xmm15,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm5,%xmm2
+ pxor %xmm0,%xmm5
+ pxor %xmm3,%xmm2
+
+ pxor %xmm15,%xmm3
+ pxor %xmm2,%xmm6
+ decl %r10d
+ jl L$dec_done
+
+ pshufd $78,%xmm15,%xmm7
+ pshufd $78,%xmm2,%xmm13
+ pxor %xmm15,%xmm7
+ pshufd $78,%xmm4,%xmm14
+ pxor %xmm2,%xmm13
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm4,%xmm14
+ pshufd $78,%xmm5,%xmm9
+ pxor %xmm0,%xmm8
+ pshufd $78,%xmm3,%xmm10
+ pxor %xmm5,%xmm9
+ pxor %xmm13,%xmm15
+ pxor %xmm13,%xmm0
+ pshufd $78,%xmm1,%xmm11
+ pxor %xmm3,%xmm10
+ pxor %xmm7,%xmm5
+ pxor %xmm8,%xmm3
+ pshufd $78,%xmm6,%xmm12
+ pxor %xmm1,%xmm11
+ pxor %xmm14,%xmm0
+ pxor %xmm9,%xmm1
+ pxor %xmm6,%xmm12
+
+ pxor %xmm14,%xmm5
+ pxor %xmm13,%xmm3
+ pxor %xmm13,%xmm1
+ pxor %xmm10,%xmm6
+ pxor %xmm11,%xmm2
+ pxor %xmm14,%xmm1
+ pxor %xmm14,%xmm6
+ pxor %xmm12,%xmm4
+ pshufd $147,%xmm15,%xmm7
+ pshufd $147,%xmm0,%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $147,%xmm5,%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $147,%xmm3,%xmm10
+ pxor %xmm9,%xmm5
+ pshufd $147,%xmm1,%xmm11
+ pxor %xmm10,%xmm3
+ pshufd $147,%xmm6,%xmm12
+ pxor %xmm11,%xmm1
+ pshufd $147,%xmm2,%xmm13
+ pxor %xmm12,%xmm6
+ pshufd $147,%xmm4,%xmm14
+ pxor %xmm13,%xmm2
+ pxor %xmm14,%xmm4
+
+ pxor %xmm15,%xmm8
+ pxor %xmm4,%xmm7
+ pxor %xmm4,%xmm8
+ pshufd $78,%xmm15,%xmm15
+ pxor %xmm0,%xmm9
+ pshufd $78,%xmm0,%xmm0
+ pxor %xmm1,%xmm12
+ pxor %xmm7,%xmm15
+ pxor %xmm6,%xmm13
+ pxor %xmm8,%xmm0
+ pxor %xmm3,%xmm11
+ pshufd $78,%xmm1,%xmm7
+ pxor %xmm2,%xmm14
+ pshufd $78,%xmm6,%xmm8
+ pxor %xmm5,%xmm10
+ pshufd $78,%xmm3,%xmm1
+ pxor %xmm4,%xmm10
+ pshufd $78,%xmm4,%xmm6
+ pxor %xmm4,%xmm11
+ pshufd $78,%xmm2,%xmm3
+ pxor %xmm11,%xmm7
+ pshufd $78,%xmm5,%xmm2
+ pxor %xmm12,%xmm8
+ pxor %xmm1,%xmm10
+ pxor %xmm14,%xmm6
+ pxor %xmm3,%xmm13
+ movdqa %xmm7,%xmm3
+ pxor %xmm9,%xmm2
+ movdqa %xmm13,%xmm5
+ movdqa %xmm8,%xmm4
+ movdqa %xmm2,%xmm1
+ movdqa %xmm10,%xmm2
+ movdqa -16(%r11),%xmm7
+ jnz L$dec_loop
+ movdqa -32(%r11),%xmm7
+ jmp L$dec_loop
+.p2align 4
+L$dec_done:
+ movdqa 0(%r11),%xmm7
+ movdqa 16(%r11),%xmm8
+ movdqa %xmm2,%xmm9
+ psrlq $1,%xmm2
+ movdqa %xmm1,%xmm10
+ psrlq $1,%xmm1
+ pxor %xmm4,%xmm2
+ pxor %xmm6,%xmm1
+ pand %xmm7,%xmm2
+ pand %xmm7,%xmm1
+ pxor %xmm2,%xmm4
+ psllq $1,%xmm2
+ pxor %xmm1,%xmm6
+ psllq $1,%xmm1
+ pxor %xmm9,%xmm2
+ pxor %xmm10,%xmm1
+ movdqa %xmm5,%xmm9
+ psrlq $1,%xmm5
+ movdqa %xmm15,%xmm10
+ psrlq $1,%xmm15
+ pxor %xmm3,%xmm5
+ pxor %xmm0,%xmm15
+ pand %xmm7,%xmm5
+ pand %xmm7,%xmm15
+ pxor %xmm5,%xmm3
+ psllq $1,%xmm5
+ pxor %xmm15,%xmm0
+ psllq $1,%xmm15
+ pxor %xmm9,%xmm5
+ pxor %xmm10,%xmm15
+ movdqa 32(%r11),%xmm7
+ movdqa %xmm6,%xmm9
+ psrlq $2,%xmm6
+ movdqa %xmm1,%xmm10
+ psrlq $2,%xmm1
+ pxor %xmm4,%xmm6
+ pxor %xmm2,%xmm1
+ pand %xmm8,%xmm6
+ pand %xmm8,%xmm1
+ pxor %xmm6,%xmm4
+ psllq $2,%xmm6
+ pxor %xmm1,%xmm2
+ psllq $2,%xmm1
+ pxor %xmm9,%xmm6
+ pxor %xmm10,%xmm1
+ movdqa %xmm0,%xmm9
+ psrlq $2,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $2,%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm15
+ pand %xmm8,%xmm0
+ pand %xmm8,%xmm15
+ pxor %xmm0,%xmm3
+ psllq $2,%xmm0
+ pxor %xmm15,%xmm5
+ psllq $2,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa %xmm3,%xmm9
+ psrlq $4,%xmm3
+ movdqa %xmm5,%xmm10
+ psrlq $4,%xmm5
+ pxor %xmm4,%xmm3
+ pxor %xmm2,%xmm5
+ pand %xmm7,%xmm3
+ pand %xmm7,%xmm5
+ pxor %xmm3,%xmm4
+ psllq $4,%xmm3
+ pxor %xmm5,%xmm2
+ psllq $4,%xmm5
+ pxor %xmm9,%xmm3
+ pxor %xmm10,%xmm5
+ movdqa %xmm0,%xmm9
+ psrlq $4,%xmm0
+ movdqa %xmm15,%xmm10
+ psrlq $4,%xmm15
+ pxor %xmm6,%xmm0
+ pxor %xmm1,%xmm15
+ pand %xmm7,%xmm0
+ pand %xmm7,%xmm15
+ pxor %xmm0,%xmm6
+ psllq $4,%xmm0
+ pxor %xmm15,%xmm1
+ psllq $4,%xmm15
+ pxor %xmm9,%xmm0
+ pxor %xmm10,%xmm15
+ movdqa (%rax),%xmm7
+ pxor %xmm7,%xmm5
+ pxor %xmm7,%xmm3
+ pxor %xmm7,%xmm1
+ pxor %xmm7,%xmm6
+ pxor %xmm7,%xmm2
+ pxor %xmm7,%xmm4
+ pxor %xmm7,%xmm15
+ pxor %xmm7,%xmm0
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+_bsaes_key_convert:
+ leaq L$masks(%rip),%r11
+ movdqu (%rcx),%xmm7
+ leaq 16(%rcx),%rcx
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ movdqa 48(%r11),%xmm3
+ movdqa 64(%r11),%xmm4
+ pcmpeqd %xmm5,%xmm5
+
+ movdqu (%rcx),%xmm6
+ movdqa %xmm7,(%rax)
+ leaq 16(%rax),%rax
+ decl %r10d
+ jmp L$key_loop
+.p2align 4
+L$key_loop:
+.byte 102,15,56,0,244
+
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+
+ pand %xmm6,%xmm8
+ pand %xmm6,%xmm9
+ movdqa %xmm2,%xmm10
+ pcmpeqb %xmm0,%xmm8
+ psllq $4,%xmm0
+ movdqa %xmm3,%xmm11
+ pcmpeqb %xmm1,%xmm9
+ psllq $4,%xmm1
+
+ pand %xmm6,%xmm10
+ pand %xmm6,%xmm11
+ movdqa %xmm0,%xmm12
+ pcmpeqb %xmm2,%xmm10
+ psllq $4,%xmm2
+ movdqa %xmm1,%xmm13
+ pcmpeqb %xmm3,%xmm11
+ psllq $4,%xmm3
+
+ movdqa %xmm2,%xmm14
+ movdqa %xmm3,%xmm15
+ pxor %xmm5,%xmm8
+ pxor %xmm5,%xmm9
+
+ pand %xmm6,%xmm12
+ pand %xmm6,%xmm13
+ movdqa %xmm8,0(%rax)
+ pcmpeqb %xmm0,%xmm12
+ psrlq $4,%xmm0
+ movdqa %xmm9,16(%rax)
+ pcmpeqb %xmm1,%xmm13
+ psrlq $4,%xmm1
+ leaq 16(%rcx),%rcx
+
+ pand %xmm6,%xmm14
+ pand %xmm6,%xmm15
+ movdqa %xmm10,32(%rax)
+ pcmpeqb %xmm2,%xmm14
+ psrlq $4,%xmm2
+ movdqa %xmm11,48(%rax)
+ pcmpeqb %xmm3,%xmm15
+ psrlq $4,%xmm3
+ movdqu (%rcx),%xmm6
+
+ pxor %xmm5,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm12,64(%rax)
+ movdqa %xmm13,80(%rax)
+ movdqa %xmm14,96(%rax)
+ movdqa %xmm15,112(%rax)
+ leaq 128(%rax),%rax
+ decl %r10d
+ jnz L$key_loop
+
+ movdqa 80(%r11),%xmm7
+
+ .byte 0xf3,0xc3
+
+
+.globl _bsaes_cbc_encrypt
+.private_extern _bsaes_cbc_encrypt
+
+.p2align 4
+_bsaes_cbc_encrypt:
+ cmpl $0,%r9d
+ jne _asm_AES_cbc_encrypt
+ cmpq $128,%rdx
+ jb _asm_AES_cbc_encrypt
+
+ movq %rsp,%rax
+L$cbc_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movq %r8,%rbx
+ shrq $4,%r14
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ movdqu (%rbx),%xmm14
+ subq $8,%r14
+L$cbc_dec_loop:
+ movdqu 0(%r12),%xmm15
+ movdqu 16(%r12),%xmm0
+ movdqu 32(%r12),%xmm1
+ movdqu 48(%r12),%xmm2
+ movdqu 64(%r12),%xmm3
+ movdqu 80(%r12),%xmm4
+ movq %rsp,%rax
+ movdqu 96(%r12),%xmm5
+ movl %edx,%r10d
+ movdqu 112(%r12),%xmm6
+ movdqa %xmm14,32(%rbp)
+
+ call _bsaes_decrypt8
+
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm2
+ movdqu 112(%r12),%xmm14
+ pxor %xmm13,%xmm4
+ movdqu %xmm15,0(%r13)
+ leaq 128(%r12),%r12
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ subq $8,%r14
+ jnc L$cbc_dec_loop
+
+ addq $8,%r14
+ jz L$cbc_dec_done
+
+ movdqu 0(%r12),%xmm15
+ movq %rsp,%rax
+ movl %edx,%r10d
+ cmpq $2,%r14
+ jb L$cbc_dec_one
+ movdqu 16(%r12),%xmm0
+ je L$cbc_dec_two
+ movdqu 32(%r12),%xmm1
+ cmpq $4,%r14
+ jb L$cbc_dec_three
+ movdqu 48(%r12),%xmm2
+ je L$cbc_dec_four
+ movdqu 64(%r12),%xmm3
+ cmpq $6,%r14
+ jb L$cbc_dec_five
+ movdqu 80(%r12),%xmm4
+ je L$cbc_dec_six
+ movdqu 96(%r12),%xmm5
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm6
+ movdqu 96(%r12),%xmm14
+ pxor %xmm12,%xmm2
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_six:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm1
+ movdqu 80(%r12),%xmm14
+ pxor %xmm11,%xmm6
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_five:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm3
+ movdqu 64(%r12),%xmm14
+ pxor %xmm10,%xmm1
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_four:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm5
+ movdqu 48(%r12),%xmm14
+ pxor %xmm9,%xmm3
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_three:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm0
+ movdqu 32(%r12),%xmm14
+ pxor %xmm8,%xmm5
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_two:
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm14
+ pxor %xmm7,%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ jmp L$cbc_dec_done
+.p2align 4
+L$cbc_dec_one:
+ leaq (%r12),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call _asm_AES_decrypt
+ pxor 32(%rbp),%xmm14
+ movdqu %xmm14,(%r13)
+ movdqa %xmm15,%xmm14
+
+L$cbc_dec_done:
+ movdqu %xmm14,(%rbx)
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+L$cbc_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$cbc_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+L$cbc_dec_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _bsaes_ctr32_encrypt_blocks
+.private_extern _bsaes_ctr32_encrypt_blocks
+
+.p2align 4
+_bsaes_ctr32_encrypt_blocks:
+ movq %rsp,%rax
+L$ctr_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movdqu (%r8),%xmm0
+ movl 240(%rcx),%eax
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+ movdqa %xmm0,32(%rbp)
+ cmpq $8,%rdx
+ jb L$ctr_enc_short
+
+ movl %eax,%ebx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %ebx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ movdqa (%rsp),%xmm8
+ leaq L$ADD1(%rip),%r11
+ movdqa 32(%rbp),%xmm15
+ movdqa -32(%r11),%xmm7
+.byte 102,68,15,56,0,199
+.byte 102,68,15,56,0,255
+ movdqa %xmm8,(%rsp)
+ jmp L$ctr_enc_loop
+.p2align 4
+L$ctr_enc_loop:
+ movdqa %xmm15,32(%rbp)
+ movdqa %xmm15,%xmm0
+ movdqa %xmm15,%xmm1
+ paddd 0(%r11),%xmm0
+ movdqa %xmm15,%xmm2
+ paddd 16(%r11),%xmm1
+ movdqa %xmm15,%xmm3
+ paddd 32(%r11),%xmm2
+ movdqa %xmm15,%xmm4
+ paddd 48(%r11),%xmm3
+ movdqa %xmm15,%xmm5
+ paddd 64(%r11),%xmm4
+ movdqa %xmm15,%xmm6
+ paddd 80(%r11),%xmm5
+ paddd 96(%r11),%xmm6
+
+
+
+ movdqa (%rsp),%xmm8
+ leaq 16(%rsp),%rax
+ movdqa -16(%r11),%xmm7
+ pxor %xmm8,%xmm15
+ pxor %xmm8,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm8,%xmm2
+.byte 102,68,15,56,0,255
+.byte 102,15,56,0,199
+ pxor %xmm8,%xmm3
+ pxor %xmm8,%xmm4
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ pxor %xmm8,%xmm5
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,223
+.byte 102,15,56,0,231
+.byte 102,15,56,0,239
+.byte 102,15,56,0,247
+ leaq L$BS0(%rip),%r11
+ movl %ebx,%r10d
+
+ call _bsaes_encrypt8_bitslice
+
+ subq $8,%r14
+ jc L$ctr_enc_loop_done
+
+ movdqu 0(%r12),%xmm7
+ movdqu 16(%r12),%xmm8
+ movdqu 32(%r12),%xmm9
+ movdqu 48(%r12),%xmm10
+ movdqu 64(%r12),%xmm11
+ movdqu 80(%r12),%xmm12
+ movdqu 96(%r12),%xmm13
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ pxor %xmm15,%xmm7
+ movdqa 32(%rbp),%xmm15
+ pxor %xmm8,%xmm0
+ movdqu %xmm7,0(%r13)
+ pxor %xmm9,%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor %xmm10,%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor %xmm11,%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor %xmm12,%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor %xmm13,%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor %xmm14,%xmm4
+ movdqu %xmm1,96(%r13)
+ leaq L$ADD1(%rip),%r11
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+ paddd 112(%r11),%xmm15
+ jnz L$ctr_enc_loop
+
+ jmp L$ctr_enc_done
+.p2align 4
+L$ctr_enc_loop_done:
+ addq $8,%r14
+ movdqu 0(%r12),%xmm7
+ pxor %xmm7,%xmm15
+ movdqu %xmm15,0(%r13)
+ cmpq $2,%r14
+ jb L$ctr_enc_done
+ movdqu 16(%r12),%xmm8
+ pxor %xmm8,%xmm0
+ movdqu %xmm0,16(%r13)
+ je L$ctr_enc_done
+ movdqu 32(%r12),%xmm9
+ pxor %xmm9,%xmm3
+ movdqu %xmm3,32(%r13)
+ cmpq $4,%r14
+ jb L$ctr_enc_done
+ movdqu 48(%r12),%xmm10
+ pxor %xmm10,%xmm5
+ movdqu %xmm5,48(%r13)
+ je L$ctr_enc_done
+ movdqu 64(%r12),%xmm11
+ pxor %xmm11,%xmm2
+ movdqu %xmm2,64(%r13)
+ cmpq $6,%r14
+ jb L$ctr_enc_done
+ movdqu 80(%r12),%xmm12
+ pxor %xmm12,%xmm6
+ movdqu %xmm6,80(%r13)
+ je L$ctr_enc_done
+ movdqu 96(%r12),%xmm13
+ pxor %xmm13,%xmm1
+ movdqu %xmm1,96(%r13)
+ jmp L$ctr_enc_done
+
+.p2align 4
+L$ctr_enc_short:
+ leaq 32(%rbp),%rdi
+ leaq 48(%rbp),%rsi
+ leaq (%r15),%rdx
+ call _asm_AES_encrypt
+ movdqu (%r12),%xmm0
+ leaq 16(%r12),%r12
+ movl 44(%rbp),%eax
+ bswapl %eax
+ pxor 48(%rbp),%xmm0
+ incl %eax
+ movdqu %xmm0,(%r13)
+ bswapl %eax
+ leaq 16(%r13),%r13
+ movl %eax,44(%rsp)
+ decq %r14
+ jnz L$ctr_enc_short
+
+L$ctr_enc_done:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+L$ctr_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$ctr_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+L$ctr_enc_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _bsaes_xts_encrypt
+.private_extern _bsaes_xts_encrypt
+
+.p2align 4
+_bsaes_xts_encrypt:
+ movq %rsp,%rax
+L$xts_enc_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call _asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor %xmm6,%xmm7
+ movdqa %xmm7,(%rax)
+
+ andq $-16,%r14
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa L$xts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc L$xts_enc_short
+ jmp L$xts_enc_loop
+
+.p2align 4
+L$xts_enc_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm1,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa L$xts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc L$xts_enc_loop
+
+L$xts_enc_short:
+ addq $128,%r14
+ jz L$xts_enc_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je L$xts_enc_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je L$xts_enc_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je L$xts_enc_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je L$xts_enc_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je L$xts_enc_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je L$xts_enc_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ pxor 96(%rsp),%xmm1
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm1,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp L$xts_enc_done
+.p2align 4
+L$xts_enc_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm2,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp L$xts_enc_done
+.p2align 4
+L$xts_enc_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ pxor 64(%rsp),%xmm2
+ movdqu %xmm5,48(%r13)
+ movdqu %xmm2,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp L$xts_enc_done
+.p2align 4
+L$xts_enc_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm5
+ movdqu %xmm3,32(%r13)
+ movdqu %xmm5,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp L$xts_enc_done
+.p2align 4
+L$xts_enc_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm3
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm3,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp L$xts_enc_done
+.p2align 4
+L$xts_enc_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_encrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp L$xts_enc_done
+.p2align 4
+L$xts_enc_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call _asm_AES_encrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+L$xts_enc_done:
+ andl $15,%ebx
+ jz L$xts_enc_ret
+ movq %r13,%rdx
+
+L$xts_enc_steal:
+ movzbl (%r12),%eax
+ movzbl -16(%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,-16(%rdx)
+ movb %cl,0(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz L$xts_enc_steal
+
+ movdqu -16(%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call _asm_AES_encrypt
+ pxor 32(%rbp),%xmm6
+ movdqu %xmm6,-16(%r13)
+
+L$xts_enc_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+L$xts_enc_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$xts_enc_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+L$xts_enc_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _bsaes_xts_decrypt
+.private_extern _bsaes_xts_decrypt
+
+.p2align 4
+_bsaes_xts_decrypt:
+ movq %rsp,%rax
+L$xts_dec_prologue:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq -72(%rsp),%rsp
+ movq %rsp,%rbp
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %rcx,%r15
+
+ leaq (%r9),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r8),%rdx
+ call _asm_AES_encrypt
+
+ movl 240(%r15),%eax
+ movq %r14,%rbx
+
+ movl %eax,%edx
+ shlq $7,%rax
+ subq $96,%rax
+ subq %rax,%rsp
+
+ movq %rsp,%rax
+ movq %r15,%rcx
+ movl %edx,%r10d
+ call _bsaes_key_convert
+ pxor (%rsp),%xmm7
+ movdqa %xmm6,(%rax)
+ movdqa %xmm7,(%rsp)
+
+ xorl %eax,%eax
+ andq $-16,%r14
+ testl $15,%ebx
+ setnz %al
+ shlq $4,%rax
+ subq %rax,%r14
+
+ subq $128,%rsp
+ movdqa 32(%rbp),%xmm6
+
+ pxor %xmm14,%xmm14
+ movdqa L$xts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+
+ subq $128,%r14
+ jc L$xts_dec_short
+ jmp L$xts_dec_loop
+
+.p2align 4
+L$xts_dec_loop:
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqu 112(%r12),%xmm14
+ leaq 128(%r12),%r12
+ movdqa %xmm6,112(%rsp)
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ pxor %xmm14,%xmm6
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ pxor 112(%rsp),%xmm4
+ movdqu %xmm2,96(%r13)
+ movdqu %xmm4,112(%r13)
+ leaq 128(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ pxor %xmm14,%xmm14
+ movdqa L$xts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+
+ subq $128,%r14
+ jnc L$xts_dec_loop
+
+L$xts_dec_short:
+ addq $128,%r14
+ jz L$xts_dec_done
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm15
+ movdqa %xmm6,0(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm0
+ movdqa %xmm6,16(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 0(%r12),%xmm7
+ cmpq $16,%r14
+ je L$xts_dec_1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm1
+ movdqa %xmm6,32(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 16(%r12),%xmm8
+ cmpq $32,%r14
+ je L$xts_dec_2
+ pxor %xmm7,%xmm15
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm2
+ movdqa %xmm6,48(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 32(%r12),%xmm9
+ cmpq $48,%r14
+ je L$xts_dec_3
+ pxor %xmm8,%xmm0
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm3
+ movdqa %xmm6,64(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 48(%r12),%xmm10
+ cmpq $64,%r14
+ je L$xts_dec_4
+ pxor %xmm9,%xmm1
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm4
+ movdqa %xmm6,80(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 64(%r12),%xmm11
+ cmpq $80,%r14
+ je L$xts_dec_5
+ pxor %xmm10,%xmm2
+ pshufd $19,%xmm14,%xmm13
+ pxor %xmm14,%xmm14
+ movdqa %xmm6,%xmm5
+ movdqa %xmm6,96(%rsp)
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ pcmpgtd %xmm6,%xmm14
+ pxor %xmm13,%xmm6
+ movdqu 80(%r12),%xmm12
+ cmpq $96,%r14
+ je L$xts_dec_6
+ pxor %xmm11,%xmm3
+ movdqu 96(%r12),%xmm13
+ pxor %xmm12,%xmm4
+ movdqa %xmm6,112(%rsp)
+ leaq 112(%r12),%r12
+ pxor %xmm13,%xmm5
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ pxor 96(%rsp),%xmm2
+ movdqu %xmm6,80(%r13)
+ movdqu %xmm2,96(%r13)
+ leaq 112(%r13),%r13
+
+ movdqa 112(%rsp),%xmm6
+ jmp L$xts_dec_done
+.p2align 4
+L$xts_dec_6:
+ pxor %xmm11,%xmm3
+ leaq 96(%r12),%r12
+ pxor %xmm12,%xmm4
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ pxor 80(%rsp),%xmm6
+ movdqu %xmm1,64(%r13)
+ movdqu %xmm6,80(%r13)
+ leaq 96(%r13),%r13
+
+ movdqa 96(%rsp),%xmm6
+ jmp L$xts_dec_done
+.p2align 4
+L$xts_dec_5:
+ pxor %xmm10,%xmm2
+ leaq 80(%r12),%r12
+ pxor %xmm11,%xmm3
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ pxor 64(%rsp),%xmm1
+ movdqu %xmm3,48(%r13)
+ movdqu %xmm1,64(%r13)
+ leaq 80(%r13),%r13
+
+ movdqa 80(%rsp),%xmm6
+ jmp L$xts_dec_done
+.p2align 4
+L$xts_dec_4:
+ pxor %xmm9,%xmm1
+ leaq 64(%r12),%r12
+ pxor %xmm10,%xmm2
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ pxor 48(%rsp),%xmm3
+ movdqu %xmm5,32(%r13)
+ movdqu %xmm3,48(%r13)
+ leaq 64(%r13),%r13
+
+ movdqa 64(%rsp),%xmm6
+ jmp L$xts_dec_done
+.p2align 4
+L$xts_dec_3:
+ pxor %xmm8,%xmm0
+ leaq 48(%r12),%r12
+ pxor %xmm9,%xmm1
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ pxor 32(%rsp),%xmm5
+ movdqu %xmm0,16(%r13)
+ movdqu %xmm5,32(%r13)
+ leaq 48(%r13),%r13
+
+ movdqa 48(%rsp),%xmm6
+ jmp L$xts_dec_done
+.p2align 4
+L$xts_dec_2:
+ pxor %xmm7,%xmm15
+ leaq 32(%r12),%r12
+ pxor %xmm8,%xmm0
+ leaq 128(%rsp),%rax
+ movl %edx,%r10d
+
+ call _bsaes_decrypt8
+
+ pxor 0(%rsp),%xmm15
+ pxor 16(%rsp),%xmm0
+ movdqu %xmm15,0(%r13)
+ movdqu %xmm0,16(%r13)
+ leaq 32(%r13),%r13
+
+ movdqa 32(%rsp),%xmm6
+ jmp L$xts_dec_done
+.p2align 4
+L$xts_dec_1:
+ pxor %xmm15,%xmm7
+ leaq 16(%r12),%r12
+ movdqa %xmm7,32(%rbp)
+ leaq 32(%rbp),%rdi
+ leaq 32(%rbp),%rsi
+ leaq (%r15),%rdx
+ call _asm_AES_decrypt
+ pxor 32(%rbp),%xmm15
+
+
+
+
+
+ movdqu %xmm15,0(%r13)
+ leaq 16(%r13),%r13
+
+ movdqa 16(%rsp),%xmm6
+
+L$xts_dec_done:
+ andl $15,%ebx
+ jz L$xts_dec_ret
+
+ pxor %xmm14,%xmm14
+ movdqa L$xts_magic(%rip),%xmm12
+ pcmpgtd %xmm6,%xmm14
+ pshufd $19,%xmm14,%xmm13
+ movdqa %xmm6,%xmm5
+ paddq %xmm6,%xmm6
+ pand %xmm12,%xmm13
+ movdqu (%r12),%xmm15
+ pxor %xmm13,%xmm6
+
+ leaq 32(%rbp),%rdi
+ pxor %xmm6,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call _asm_AES_decrypt
+ pxor 32(%rbp),%xmm6
+ movq %r13,%rdx
+ movdqu %xmm6,(%r13)
+
+L$xts_dec_steal:
+ movzbl 16(%r12),%eax
+ movzbl (%rdx),%ecx
+ leaq 1(%r12),%r12
+ movb %al,(%rdx)
+ movb %cl,16(%rdx)
+ leaq 1(%rdx),%rdx
+ subl $1,%ebx
+ jnz L$xts_dec_steal
+
+ movdqu (%r13),%xmm15
+ leaq 32(%rbp),%rdi
+ pxor %xmm5,%xmm15
+ leaq 32(%rbp),%rsi
+ movdqa %xmm15,32(%rbp)
+ leaq (%r15),%rdx
+ call _asm_AES_decrypt
+ pxor 32(%rbp),%xmm5
+ movdqu %xmm5,(%r13)
+
+L$xts_dec_ret:
+ leaq (%rsp),%rax
+ pxor %xmm0,%xmm0
+L$xts_dec_bzero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ leaq 32(%rax),%rax
+ cmpq %rax,%rbp
+ ja L$xts_dec_bzero
+
+ leaq (%rbp),%rsp
+ movq 72(%rsp),%r15
+ movq 80(%rsp),%r14
+ movq 88(%rsp),%r13
+ movq 96(%rsp),%r12
+ movq 104(%rsp),%rbx
+ movq 112(%rsp),%rax
+ leaq 120(%rsp),%rsp
+ movq %rax,%rbp
+L$xts_dec_epilogue:
+ .byte 0xf3,0xc3
+
+
+.p2align 6
+_bsaes_const:
+L$M0ISR:
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+L$ISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+L$ISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+L$BS0:
+.quad 0x5555555555555555, 0x5555555555555555
+L$BS1:
+.quad 0x3333333333333333, 0x3333333333333333
+L$BS2:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+L$SR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+L$SRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+L$M0SR:
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+L$SWPUP:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+L$SWPUPM0SR:
+.quad 0x0a0d02060c03070b, 0x0004080f05090e01
+L$ADD1:
+.quad 0x0000000000000000, 0x0000000100000000
+L$ADD2:
+.quad 0x0000000000000000, 0x0000000200000000
+L$ADD3:
+.quad 0x0000000000000000, 0x0000000300000000
+L$ADD4:
+.quad 0x0000000000000000, 0x0000000400000000
+L$ADD5:
+.quad 0x0000000000000000, 0x0000000500000000
+L$ADD6:
+.quad 0x0000000000000000, 0x0000000600000000
+L$ADD7:
+.quad 0x0000000000000000, 0x0000000700000000
+L$ADD8:
+.quad 0x0000000000000000, 0x0000000800000000
+L$xts_magic:
+.long 0x87,0,1,0
+L$masks:
+.quad 0x0101010101010101, 0x0101010101010101
+.quad 0x0202020202020202, 0x0202020202020202
+.quad 0x0404040404040404, 0x0404040404040404
+.quad 0x0808080808080808, 0x0808080808080808
+L$M0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+L$63:
+.quad 0x6363636363636363, 0x6363636363636363
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0
+.p2align 6
+
+#endif
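
Note on the XTS tail above: L$xts_dec_1 through L$xts_dec_6, L$xts_dec_done and L$xts_dec_steal keep advancing the tweak by multiplying it by x in GF(2^128), using the pcmpgtd/pshufd $19/pand/pxor sequence against L$xts_magic (0x87,0,1,0), and finish a partial last block with the byte-swapping ciphertext-stealing loop. A minimal scalar C sketch of the tweak-doubling step under the same little-endian two-limb layout as xmm6; the function name is illustrative, not part of the file.

#include <stdint.h>

/* Multiply the 128-bit XTS tweak by x in GF(2^128): shift left one bit
 * and, when a bit falls off the top, fold in the reduction constant
 * 0x87 (x^128 = x^7 + x^2 + x + 1).  This is what the SIMD
 * pcmpgtd/pshufd/pand/pxor pattern computes 128 bits at a time. */
static void xts_double_tweak(uint64_t t[2]) {        /* t[0] = low half */
    uint64_t carry = t[1] >> 63;                     /* bit 127, shifted out */
    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}
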
diff --git a/mac-x86_64/crypto/aes/vpaes-x86_64.S b/mac-x86_64/crypto/aes/vpaes-x86_64.S
new file mode 100644
index 0000000..711ea43
--- /dev/null
+++ b/mac-x86_64/crypto/aes/vpaes-x86_64.S
@@ -0,0 +1,834 @@
+#if defined(__x86_64__)
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_encrypt_core:
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa L$k_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa L$k_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ leaq L$k_mc_backward(%rip),%r10
+ jmp L$enc_entry
+
+.p2align 4
+L$enc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $48,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+
+L$enc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
+ pxor %xmm1,%xmm3
+ jnz L$enc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_decrypt_core:
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa L$k_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa L$k_dipt+16(%rip),%xmm0
+ xorq $48,%r11
+ leaq L$k_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $48,%r11
+ pxor %xmm5,%xmm2
+ movdqa L$k_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp L$dec_entry
+
+.p2align 4
+L$dec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 0(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
+
+L$dec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
+ jnz L$dec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_core:
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa L$k_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq L$k_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq L$k_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz L$schedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp L$schedule_go
+
+L$schedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $48,%r8
+
+L$schedule_go:
+ cmpl $192,%esi
+ ja L$schedule_256
+ je L$schedule_192
+
+
+
+
+
+
+
+
+
+
+L$schedule_128:
+ movl $10,%esi
+
+L$oop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp L$oop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+L$schedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+L$oop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp L$oop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+L$schedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+L$oop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz L$schedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp L$oop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+L$schedule_mangle_last:
+
+ leaq L$k_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz L$schedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq L$k_opt(%rip),%r11
+ addq $32,%rdx
+
+L$schedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor L$k_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm1
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_round:
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor L$k_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_transform:
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa L$k_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz L$schedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor L$k_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp L$schedule_mangle_both
+.p2align 4
+L$schedule_mangle_dec:
+
+ leaq L$k_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+L$schedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $48,%r8
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3
+
+
+
+
+
+.globl _vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+
+.p2align 4
+_vpaes_set_encrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $48,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+
+
+.globl _vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+
+.p2align 4
+_vpaes_set_decrypt_key:
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+
+
+.globl _vpaes_encrypt
+.private_extern _vpaes_encrypt
+
+.p2align 4
+_vpaes_encrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+
+
+.globl _vpaes_decrypt
+.private_extern _vpaes_decrypt
+
+.p2align 4
+_vpaes_decrypt:
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+
+.globl _vpaes_cbc_encrypt
+.private_extern _vpaes_cbc_encrypt
+
+.p2align 4
+_vpaes_cbc_encrypt:
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc L$cbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je L$cbc_dec_loop
+ jmp L$cbc_enc_loop
+.p2align 4
+L$cbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc L$cbc_enc_loop
+ jmp L$cbc_done
+.p2align 4
+L$cbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc L$cbc_dec_loop
+L$cbc_done:
+ movdqu %xmm6,(%r8)
+L$cbc_abort:
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+
+.p2align 4
+_vpaes_preheat:
+ leaq L$k_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ .byte 0xf3,0xc3
+
+
+
+
+
+
+
+.p2align 6
+_vpaes_consts:
+L$k_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+L$k_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+L$k_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+L$k_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+L$k_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+L$k_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+L$k_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+L$k_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+L$k_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+L$k_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+L$k_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+L$k_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+L$k_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+L$k_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+L$k_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+L$k_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+L$k_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+L$k_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+L$k_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+L$k_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+L$k_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+L$k_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+L$k_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.p2align 6
+
+#endif
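
Note on the vpaes routines above: instead of data-dependent S-box loads, each byte is split into its low and high nibble with the 0x0F mask in L$k_s0F (the pandn/psrld $4/pand preamble of _vpaes_encrypt_core and friends), and two 16-entry pshufb lookups are combined; the .byte 102,15,56,0,... sequences encode pshufb. A byte-at-a-time C sketch of that lookup pattern; the function name and the two tables are placeholders, not the real vpaes constants.

#include <stdint.h>

/* One byte of the nibble-split lookup that pshufb performs on 16 bytes
 * at once: index one 16-entry table with the low nibble, another with
 * the high nibble, and combine the results. */
static uint8_t nibble_lookup(uint8_t x,
                             const uint8_t lo_tab[16],
                             const uint8_t hi_tab[16]) {
    return (uint8_t)(lo_tab[x & 0x0F] ^ hi_tab[(x >> 4) & 0x0F]);
}
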
diff --git a/mac-x86_64/crypto/bn/modexp512-x86_64.S b/mac-x86_64/crypto/bn/modexp512-x86_64.S
new file mode 100644
index 0000000..beb133e
--- /dev/null
+++ b/mac-x86_64/crypto/bn/modexp512-x86_64.S
@@ -0,0 +1,1776 @@
+#if defined(__x86_64__)
+.text
+
+
+.p2align 4
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+
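
MULADD_128x512 above folds two 64-bit multiplier words (the incoming %rbp and then 8(%rdi)) into the 512-bit accumulator held in r8..r15, one mulq/addq/adcq group per limb with the running carry kept in %rbx. A C sketch of a single such pass, assuming 64-bit limbs and a compiler that provides __uint128_t; the function name is illustrative.

#include <stdint.h>

/* Accumulate a[0..7] * b into acc[0..7] and return the carry-out limb.
 * Each loop iteration corresponds to one mulq/addq/adcq group above:
 * the low product goes into the limb, the high product plus carries
 * becomes the carry for the next limb. */
static uint64_t muladd_512x64(uint64_t acc[8], const uint64_t a[8], uint64_t b) {
    uint64_t carry = 0;
    for (int i = 0; i < 8; i++) {
        __uint128_t t = (__uint128_t)a[i] * b + acc[i] + carry;
        acc[i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
    return carry;   /* becomes the new top limb, as %rdx feeds %r8/%r9 above */
}
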
+
+.p2align 4
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+
+
+.p2align 4
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
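
sqr_reduce above squares a 512-bit value by computing each cross product a[i]*a[j] (i < j) once, doubling the accumulated cross terms with the addq/adcq chains, then adding the diagonal squares a[i]^2, before tail-jumping to mont_reduce. A limb-level C sketch of that layout, assuming __uint128_t; it mirrors the structure, not the register allocation, and the function name is illustrative.

#include <stdint.h>
#include <string.h>

/* Square a[0..7] into out[0..15]: cross products once, doubled, plus
 * the diagonal squares. */
static void sqr_512_sketch(uint64_t out[16], const uint64_t a[8]) {
    memset(out, 0, 16 * sizeof(uint64_t));

    /* 1. cross products a[i]*a[j] for i < j */
    for (int i = 0; i < 8; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < 8; j++) {
            __uint128_t t = (__uint128_t)a[i] * a[j] + out[i + j] + carry;
            out[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        out[i + 8] = carry;
    }

    /* 2. double the cross-product sum (the addq %r10,%r10 / adcq chain) */
    uint64_t carry = 0;
    for (int k = 0; k < 16; k++) {
        uint64_t top = out[k] >> 63;
        out[k] = (out[k] << 1) | carry;
        carry = top;
    }

    /* 3. add the diagonal squares a[i]^2 at positions 2i and 2i+1 */
    carry = 0;
    for (int i = 0; i < 8; i++) {
        __uint128_t sq = (__uint128_t)a[i] * a[i];
        __uint128_t t = (__uint128_t)out[2 * i] + (uint64_t)sq + carry;
        out[2 * i] = (uint64_t)t;
        t = (__uint128_t)out[2 * i + 1] + (uint64_t)(sq >> 64) + (uint64_t)(t >> 64);
        out[2 * i + 1] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
}
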
+
+
+.globl _mod_exp_512
+.private_extern _mod_exp_512
+
+_mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+L$body:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
+
+
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+#endif
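
mod_exp_512 above precomputes g^0..g^31 in Montgomery form as a scattered table of 16-bit words (loop_0/loop_1), seeds the accumulator with the entry selected by the top exponent bits (loop_2), and then walks the 512-bit exponent in 5-bit windows: five sqr_reduce calls followed by one mont_mul_a3b per window (main_loop_a3b/sqr_2), with loop_3 gathering the selected table entry. A C sketch of that left-to-right fixed-window shape; fe512, mont_sqr, mont_mul and the function name are stand-ins for this sketch, not entry points of the file.

#include <stdint.h>

typedef struct { uint64_t limb[8]; } fe512;          /* one 512-bit value */

/* Left-to-right fixed-window exponentiation with 5-bit windows:
 * window[k] holds bits 5k..5k+4 of the exponent (values 0..31),
 * table[w] holds g^w in Montgomery form. */
static void exp_fixed_window(fe512 *acc, const fe512 table[32],
                             const uint8_t *window, int nwin,
                             void (*mont_sqr)(fe512 *),
                             void (*mont_mul)(fe512 *, const fe512 *)) {
    *acc = table[window[nwin - 1] & 31];             /* seed with the top window */
    for (int i = nwin - 2; i >= 0; i--) {
        for (int s = 0; s < 5; s++)
            mont_sqr(acc);                           /* acc = acc^32 */
        mont_mul(acc, &table[window[i] & 31]);       /* fold in 5 more bits */
    }
}
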
diff --git a/mac-x86_64/crypto/bn/rsaz-avx2.S b/mac-x86_64/crypto/bn/rsaz-avx2.S
new file mode 100644
index 0000000..8ba2019
--- /dev/null
+++ b/mac-x86_64/crypto/bn/rsaz-avx2.S
@@ -0,0 +1,34 @@
+#if defined(__x86_64__)
+.text
+
+.globl _rsaz_avx2_eligible
+.private_extern _rsaz_avx2_eligible
+
+_rsaz_avx2_eligible:
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+
+
+.globl _rsaz_1024_sqr_avx2
+.private_extern _rsaz_1024_sqr_avx2
+.globl _rsaz_1024_mul_avx2
+.private_extern _rsaz_1024_mul_avx2
+.globl _rsaz_1024_norm2red_avx2
+.private_extern _rsaz_1024_norm2red_avx2
+.globl _rsaz_1024_red2norm_avx2
+.private_extern _rsaz_1024_red2norm_avx2
+.globl _rsaz_1024_scatter5_avx2
+.private_extern _rsaz_1024_scatter5_avx2
+.globl _rsaz_1024_gather5_avx2
+.private_extern _rsaz_1024_gather5_avx2
+
+_rsaz_1024_sqr_avx2:
+_rsaz_1024_mul_avx2:
+_rsaz_1024_norm2red_avx2:
+_rsaz_1024_red2norm_avx2:
+_rsaz_1024_scatter5_avx2:
+_rsaz_1024_gather5_avx2:
+.byte 0x0f,0x0b
+ .byte 0xf3,0xc3
+
+#endif
diff --git a/mac-x86_64/crypto/bn/rsaz-x86_64.S b/mac-x86_64/crypto/bn/rsaz-x86_64.S
new file mode 100644
index 0000000..5e9e82f
--- /dev/null
+++ b/mac-x86_64/crypto/bn/rsaz-x86_64.S
@@ -0,0 +1,1126 @@
+#if defined(__x86_64__)
+.text
+
+
+
+.globl _rsaz_512_sqr
+.private_extern _rsaz_512_sqr
+
+.p2align 5
+_rsaz_512_sqr:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $128+24,%rsp
+L$sqr_body:
+ movq %rdx,%rbp
+ movq (%rsi),%rdx
+ movq 8(%rsi),%rax
+ movq %rcx,128(%rsp)
+ jmp L$oop_sqr
+
+.p2align 5
+L$oop_sqr:
+ movl %r8d,128+8(%rsp)
+
+ movq %rdx,%rbx
+ mulq %rdx
+ movq %rax,%r8
+ movq 16(%rsi),%rax
+ movq %rdx,%r9
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 24(%rsi),%rax
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 40(%rsi),%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 48(%rsi),%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 56(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ addq %rax,%r14
+ movq %rbx,%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ addq %r8,%r8
+ movq %r9,%rcx
+ adcq %r9,%r9
+
+ mulq %rax
+ movq %rax,(%rsp)
+ addq %rdx,%r8
+ adcq $0,%r9
+
+ movq %r8,8(%rsp)
+ shrq $63,%rcx
+
+
+ movq 8(%rsi),%r8
+ movq 16(%rsi),%rax
+ mulq %r8
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r11
+ movq 32(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r11
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r12
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r12
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r13
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r13
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r14
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r14
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r15
+ movq %r8,%rax
+ adcq $0,%rdx
+ addq %rbx,%r15
+ movq %rdx,%r8
+ movq %r10,%rdx
+ adcq $0,%r8
+
+ addq %rdx,%rdx
+ leaq (%rcx,%r10,2),%r10
+ movq %r11,%rbx
+ adcq %r11,%r11
+
+ mulq %rax
+ addq %rax,%r9
+ adcq %rdx,%r10
+ adcq $0,%r11
+
+ movq %r9,16(%rsp)
+ movq %r10,24(%rsp)
+ shrq $63,%rbx
+
+
+ movq 16(%rsi),%r9
+ movq 24(%rsi),%rax
+ mulq %r9
+ addq %rax,%r12
+ movq 32(%rsi),%rax
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ addq %rax,%r13
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %rcx,%r13
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ addq %rax,%r14
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %rcx,%r14
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ movq %r12,%r10
+ leaq (%rbx,%r12,2),%r12
+ addq %rax,%r15
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %rcx,%r15
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ shrq $63,%r10
+ addq %rax,%r8
+ movq %r9,%rax
+ adcq $0,%rdx
+ addq %rcx,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ movq %r13,%rcx
+ leaq (%r10,%r13,2),%r13
+
+ mulq %rax
+ addq %rax,%r11
+ adcq %rdx,%r12
+ adcq $0,%r13
+
+ movq %r11,32(%rsp)
+ movq %r12,40(%rsp)
+ shrq $63,%rcx
+
+
+ movq 24(%rsi),%r10
+ movq 32(%rsi),%rax
+ mulq %r10
+ addq %rax,%r14
+ movq 40(%rsi),%rax
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r10
+ addq %rax,%r15
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r15
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r10
+ movq %r14,%r12
+ leaq (%rcx,%r14,2),%r14
+ addq %rax,%r8
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r8
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r10
+ shrq $63,%r12
+ addq %rax,%r9
+ movq %r10,%rax
+ adcq $0,%rdx
+ addq %rbx,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ movq %r15,%rbx
+ leaq (%r12,%r15,2),%r15
+
+ mulq %rax
+ addq %rax,%r13
+ adcq %rdx,%r14
+ adcq $0,%r15
+
+ movq %r13,48(%rsp)
+ movq %r14,56(%rsp)
+ shrq $63,%rbx
+
+
+ movq 32(%rsi),%r11
+ movq 40(%rsi),%rax
+ mulq %r11
+ addq %rax,%r8
+ movq 48(%rsi),%rax
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r11
+ addq %rax,%r9
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ movq %r8,%r12
+ leaq (%rbx,%r8,2),%r8
+ addq %rcx,%r9
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r11
+ shrq $63,%r12
+ addq %rax,%r10
+ movq %r11,%rax
+ adcq $0,%rdx
+ addq %rcx,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ movq %r9,%rcx
+ leaq (%r12,%r9,2),%r9
+
+ mulq %rax
+ addq %rax,%r15
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+ movq %r15,64(%rsp)
+ movq %r8,72(%rsp)
+ shrq $63,%rcx
+
+
+ movq 40(%rsi),%r12
+ movq 48(%rsi),%rax
+ mulq %r12
+ addq %rax,%r10
+ movq 56(%rsi),%rax
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r12
+ addq %rax,%r11
+ movq %r12,%rax
+ movq %r10,%r15
+ leaq (%rcx,%r10,2),%r10
+ adcq $0,%rdx
+ shrq $63,%r15
+ addq %rbx,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ movq %r11,%rbx
+ leaq (%r15,%r11,2),%r11
+
+ mulq %rax
+ addq %rax,%r9
+ adcq %rdx,%r10
+ adcq $0,%r11
+
+ movq %r9,80(%rsp)
+ movq %r10,88(%rsp)
+
+
+ movq 48(%rsi),%r13
+ movq 56(%rsi),%rax
+ mulq %r13
+ addq %rax,%r12
+ movq %r13,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ xorq %r14,%r14
+ shlq $1,%rbx
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+
+ mulq %rax
+ addq %rax,%r11
+ adcq %rdx,%r12
+ adcq $0,%r13
+
+ movq %r11,96(%rsp)
+ movq %r12,104(%rsp)
+
+
+ movq 56(%rsi),%rax
+ mulq %rax
+ addq %rax,%r13
+ adcq $0,%rdx
+
+ addq %rdx,%r14
+
+ movq %r13,112(%rsp)
+ movq %r14,120(%rsp)
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8,%rdx
+ movq %r9,%rax
+ movl 128+8(%rsp),%r8d
+ movq %rdi,%rsi
+
+ decl %r8d
+ jnz L$oop_sqr
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$sqr_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _rsaz_512_mul
+.private_extern _rsaz_512_mul
+
+.p2align 5
+_rsaz_512_mul:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $128+24,%rsp
+L$mul_body:
+.byte 102,72,15,110,199
+.byte 102,72,15,110,201
+ movq %r8,128(%rsp)
+ movq (%rdx),%rbx
+ movq %rdx,%rbp
+ call __rsaz_512_mul
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$mul_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _rsaz_512_mul_gather4
+.private_extern _rsaz_512_mul_gather4
+
+.p2align 5
+_rsaz_512_mul_gather4:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ subq $128+24,%rsp
+L$mul_gather4_body:
+ movl 64(%rdx,%r9,4),%eax
+.byte 102,72,15,110,199
+ movl (%rdx,%r9,4),%ebx
+.byte 102,72,15,110,201
+ movq %r8,128(%rsp)
+
+ shlq $32,%rax
+ orq %rax,%rbx
+ movq (%rsi),%rax
+ movq 8(%rsi),%rcx
+ leaq 128(%rdx,%r9,4),%rbp
+ mulq %rbx
+ movq %rax,(%rsp)
+ movq %rcx,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ movd (%rbp),%xmm4
+ addq %rax,%r8
+ movq 16(%rsi),%rax
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ movd 64(%rbp),%xmm5
+ addq %rax,%r9
+ movq 24(%rsi),%rax
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ pslldq $4,%xmm5
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ por %xmm5,%xmm4
+ addq %rax,%r11
+ movq 40(%rsi),%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 48(%rsi),%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ leaq 128(%rbp),%rbp
+ addq %rax,%r13
+ movq 56(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+.byte 102,72,15,126,227
+ addq %rax,%r14
+ movq (%rsi),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 8(%rsp),%rdi
+ movl $7,%ecx
+ jmp L$oop_mul_gather
+
+.p2align 5
+L$oop_mul_gather:
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ movd (%rbp),%xmm4
+ addq %rax,%r9
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ movd 64(%rbp),%xmm5
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ pslldq $4,%xmm5
+ addq %rax,%r11
+ movq 32(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ por %xmm5,%xmm4
+ addq %rax,%r12
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+.byte 102,72,15,126,227
+ addq %rax,%r15
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 128(%rbp),%rbp
+ leaq 8(%rdi),%rdi
+
+ decl %ecx
+ jnz L$oop_mul_gather
+
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$mul_gather4_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _rsaz_512_mul_scatter4
+.private_extern _rsaz_512_mul_scatter4
+
+.p2align 5
+_rsaz_512_mul_scatter4:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ subq $128+24,%rsp
+L$mul_scatter4_body:
+ leaq (%r8,%r9,4),%r8
+.byte 102,72,15,110,199
+.byte 102,72,15,110,202
+.byte 102,73,15,110,208
+ movq %rcx,128(%rsp)
+
+ movq %rdi,%rbp
+ movq (%rdi),%rbx
+ call __rsaz_512_mul
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+.byte 102,72,15,126,214
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ movl %r8d,0(%rsi)
+ shrq $32,%r8
+ movl %r9d,128(%rsi)
+ shrq $32,%r9
+ movl %r10d,256(%rsi)
+ shrq $32,%r10
+ movl %r11d,384(%rsi)
+ shrq $32,%r11
+ movl %r12d,512(%rsi)
+ shrq $32,%r12
+ movl %r13d,640(%rsi)
+ shrq $32,%r13
+ movl %r14d,768(%rsi)
+ shrq $32,%r14
+ movl %r15d,896(%rsi)
+ shrq $32,%r15
+ movl %r8d,64(%rsi)
+ movl %r9d,192(%rsi)
+ movl %r10d,320(%rsi)
+ movl %r11d,448(%rsi)
+ movl %r12d,576(%rsi)
+ movl %r13d,704(%rsi)
+ movl %r14d,832(%rsi)
+ movl %r15d,960(%rsi)
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$mul_scatter4_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _rsaz_512_mul_by_one
+.private_extern _rsaz_512_mul_by_one
+
+.p2align 5
+_rsaz_512_mul_by_one:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $128+24,%rsp
+L$mul_by_one_body:
+ movq %rdx,%rbp
+ movq %rcx,128(%rsp)
+
+ movq (%rsi),%r8
+ pxor %xmm0,%xmm0
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ movq 32(%rsi),%r12
+ movq 40(%rsi),%r13
+ movq 48(%rsi),%r14
+ movq 56(%rsi),%r15
+
+ movdqa %xmm0,(%rsp)
+ movdqa %xmm0,16(%rsp)
+ movdqa %xmm0,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,80(%rsp)
+ movdqa %xmm0,96(%rsp)
+ call __rsaz_512_reduce
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$mul_by_one_epilogue:
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+__rsaz_512_reduce:
+ movq %r8,%rbx
+ imulq 128+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp L$reduction_loop
+
+.p2align 5
+L$reduction_loop:
+ mulq %rbx
+ movq 8(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq 128+8(%rsp),%rsi
+
+
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jne L$reduction_loop
+
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+__rsaz_512_subtract:
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ movq 0(%rbp),%r8
+ movq 8(%rbp),%r9
+ negq %r8
+ notq %r9
+ andq %rcx,%r8
+ movq 16(%rbp),%r10
+ andq %rcx,%r9
+ notq %r10
+ movq 24(%rbp),%r11
+ andq %rcx,%r10
+ notq %r11
+ movq 32(%rbp),%r12
+ andq %rcx,%r11
+ notq %r12
+ movq 40(%rbp),%r13
+ andq %rcx,%r12
+ notq %r13
+ movq 48(%rbp),%r14
+ andq %rcx,%r13
+ notq %r14
+ movq 56(%rbp),%r15
+ andq %rcx,%r14
+ notq %r15
+ andq %rcx,%r15
+
+ addq (%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+__rsaz_512_mul:
+ leaq 8(%rsp),%rdi
+
+ movq (%rsi),%rax
+ mulq %rbx
+ movq %rax,(%rdi)
+ movq 8(%rsi),%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r8
+ movq 16(%rsi),%rax
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 24(%rsi),%rax
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 40(%rsi),%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 48(%rsi),%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 56(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ addq %rax,%r14
+ movq (%rsi),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 8(%rbp),%rbp
+ leaq 8(%rdi),%rdi
+
+ movl $7,%ecx
+ jmp L$oop_mul
+
+.p2align 5
+L$oop_mul:
+ movq (%rbp),%rbx
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ leaq 8(%rbp),%rbp
+ adcq $0,%r14
+
+ mulq %rbx
+ addq %rax,%r15
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 8(%rdi),%rdi
+
+ decl %ecx
+ jnz L$oop_mul
+
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ .byte 0xf3,0xc3
+
+.globl _rsaz_512_scatter4
+.private_extern _rsaz_512_scatter4
+
+.p2align 4
+_rsaz_512_scatter4:
+ leaq (%rdi,%rdx,4),%rdi
+ movl $8,%r9d
+ jmp L$oop_scatter
+.p2align 4
+L$oop_scatter:
+ movq (%rsi),%rax
+ leaq 8(%rsi),%rsi
+ movl %eax,(%rdi)
+ shrq $32,%rax
+ movl %eax,64(%rdi)
+ leaq 128(%rdi),%rdi
+ decl %r9d
+ jnz L$oop_scatter
+ .byte 0xf3,0xc3
+
+
+.globl _rsaz_512_gather4
+.private_extern _rsaz_512_gather4
+
+.p2align 4
+_rsaz_512_gather4:
+ leaq (%rsi,%rdx,4),%rsi
+ movl $8,%r9d
+ jmp L$oop_gather
+.p2align 4
+L$oop_gather:
+ movl (%rsi),%eax
+ movl 64(%rsi),%r8d
+ leaq 128(%rsi),%rsi
+ shlq $32,%r8
+ orq %r8,%rax
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ decl %r9d
+ jnz L$oop_gather
+ .byte 0xf3,0xc3
+
+#endif
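
The scatter4/gather4 routines above lay each 512-bit value out column-wise in 32-bit halves (stores at base+power*4 with offsets i*128 and i*128+64), so a later gather of one precomputed power walks the whole table with a fixed stride. A minimal C sketch of that layout, assuming a hypothetical uint32_t tab[16][16] table (16 powers, 16 half-words each); it illustrates what the stores compute, it is not the shipped code:

#include <stdint.h>

/* scatter: spread one 512-bit value (8 x 64-bit words) into column `power` */
static void scatter4_sketch(uint32_t tab[16][16], const uint64_t val[8], int power) {
    for (int i = 0; i < 8; i++) {
        tab[2 * i][power]     = (uint32_t)val[i];         /* byte offset power*4 + i*128      */
        tab[2 * i + 1][power] = (uint32_t)(val[i] >> 32); /* byte offset power*4 + i*128 + 64 */
    }
}

/* gather: reassemble the value stored for one power from the same columns */
static void gather4_sketch(uint64_t out[8], const uint32_t tab[16][16], int power) {
    for (int i = 0; i < 8; i++)
        out[i] = (uint64_t)tab[2 * i][power] |
                 ((uint64_t)tab[2 * i + 1][power] << 32);
}
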
diff --git a/mac-x86_64/crypto/bn/x86_64-mont.S b/mac-x86_64/crypto/bn/x86_64-mont.S
new file mode 100644
index 0000000..6b9bc05
--- /dev/null
+++ b/mac-x86_64/crypto/bn/x86_64-mont.S
@@ -0,0 +1,726 @@
+#if defined(__x86_64__)
+.text
+
+
+
+.globl _bn_mul_mont
+.private_extern _bn_mul_mont
+
+.p2align 4
+_bn_mul_mont:
+ testl $3,%r9d
+ jnz L$mul_enter
+ cmpl $8,%r9d
+ jb L$mul_enter
+ cmpq %rsi,%rdx
+ jne L$mul4x_enter
+ testl $7,%r9d
+ jz L$sqr8x_enter
+ jmp L$mul4x_enter
+
+.p2align 4
+L$mul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+L$mul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$1st_enter
+
+.p2align 4
+L$1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp L$outer
+.p2align 4
+L$outer:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$inner_enter
+
+.p2align 4
+L$inner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$inner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$inner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb L$outer
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp L$sub
+.p2align 4
+L$sub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz L$sub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ movq %r9,%r15
+.p2align 4
+L$copy:
+ movq (%rsp,%r14,8),%rsi
+ movq (%rdi,%r14,8),%rcx
+ xorq %rcx,%rsi
+ andq %rax,%rsi
+ xorq %rcx,%rsi
+ movq %r14,(%rsp,%r14,8)
+ movq %rsi,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz L$copy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$mul_epilogue:
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+bn_mul4x_mont:
+L$mul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+L$mul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp L$1st4x
+.p2align 4
+L$1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb L$1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.p2align 2
+L$outer4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp L$inner4x
+.p2align 4
+L$inner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb L$inner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jb L$outer4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp L$sub4x
+.p2align 4
+L$sub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz L$sub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rax,%xmm0
+ punpcklqdq %xmm0,%xmm0
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+
+ movq %r9,%r15
+ pxor %xmm5,%xmm5
+ jmp L$copy4x
+.p2align 4
+L$copy4x:
+ movdqu (%rsp,%r14,1),%xmm2
+ movdqu 16(%rsp,%r14,1),%xmm4
+ movdqu (%rdi,%r14,1),%xmm1
+ movdqu 16(%rdi,%r14,1),%xmm3
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ pand %xmm0,%xmm2
+ pand %xmm0,%xmm4
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ movdqu %xmm2,(%rdi,%r14,1)
+ movdqu %xmm4,16(%rdi,%r14,1)
+ movdqa %xmm5,(%rsp,%r14,1)
+ movdqa %xmm5,16(%rsp,%r14,1)
+
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz L$copy4x
+
+ shlq $2,%r9
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$mul4x_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+
+.p2align 5
+bn_sqr8x_mont:
+L$sqr8x_enter:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shlq $3+2,%r10
+ negq %r9
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,4),%r11
+ movq (%r8),%r8
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$sqr8x_sp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,4),%rsp
+ jmp L$sqr8x_sp_done
+
+.p2align 5
+L$sqr8x_sp_alt:
+ leaq 4096-64(,%r9,4),%r10
+ leaq -64(%rsp,%r9,4),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+L$sqr8x_sp_done:
+ andq $-64,%rsp
+ movq %r9,%r10
+ negq %r9
+
+ leaq 64(%rsp,%r9,2),%r11
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+L$sqr8x_body:
+
+ movq %r9,%rbp
+.byte 102,73,15,110,211
+ shrq $3+2,%rbp
+ movl _OPENSSL_ia32cap_P+8(%rip),%eax
+ jmp L$sqr8x_copy_n
+
+.p2align 5
+L$sqr8x_copy_n:
+ movq 0(%rcx),%xmm0
+ movq 8(%rcx),%xmm1
+ movq 16(%rcx),%xmm3
+ movq 24(%rcx),%xmm4
+ leaq 32(%rcx),%rcx
+ movdqa %xmm0,0(%r11)
+ movdqa %xmm1,16(%r11)
+ movdqa %xmm3,32(%r11)
+ movdqa %xmm4,48(%r11)
+ leaq 64(%r11),%r11
+ decq %rbp
+ jnz L$sqr8x_copy_n
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,73,15,110,218
+ call _bn_sqr8x_internal
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ leaq 64(%rsp,%r9,2),%rdx
+ shrq $3+2,%r9
+ movq 40(%rsp),%rsi
+ jmp L$sqr8x_zero
+
+.p2align 5
+L$sqr8x_zero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ movdqa %xmm0,32(%rax)
+ movdqa %xmm0,48(%rax)
+ leaq 64(%rax),%rax
+ movdqa %xmm0,0(%rdx)
+ movdqa %xmm0,16(%rdx)
+ movdqa %xmm0,32(%rdx)
+ movdqa %xmm0,48(%rdx)
+ leaq 64(%rdx),%rdx
+ decq %r9
+ jnz L$sqr8x_zero
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$sqr8x_epilogue:
+ .byte 0xf3,0xc3
+
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 4
+#endif
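
bn_mul_mont and its 4x/8x variants above are unrolled forms of word-serial (CIOS-style) Montgomery multiplication: for each word of b, accumulate a*b[i], pick m = t[0]*n0 so the low word cancels, add m*n and drop one word, then finish with a masked copy (L$sub/L$copy) that subtracts the modulus without a secret-dependent branch. A compact C sketch of the same algorithm, assuming GCC/Clang's unsigned __int128 and plain arrays in place of the stack frame the assembly builds; n0 here is -np[0]^-1 mod 2^64, the value the fifth argument points at (the `movq (%r8),%r8` above):

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

/* rp = ap*bp*R^-1 mod np, with R = 2^(64*num); num >= 1 assumed */
static void mont_mul_sketch(uint64_t *rp, const uint64_t *ap, const uint64_t *bp,
                            const uint64_t *np, uint64_t n0, int num) {
    uint64_t t[num + 1], r[num], t_hi = 0, borrow = 0;
    memset(t, 0, sizeof(t));

    for (int i = 0; i < num; i++) {
        u128 c = 0;
        for (int j = 0; j < num; j++) {        /* t += ap * bp[i] */
            c += (u128)ap[j] * bp[i] + t[j];
            t[j] = (uint64_t)c;
            c >>= 64;
        }
        c += t[num];
        t[num] = (uint64_t)c;
        t_hi = (uint64_t)(c >> 64);

        uint64_t m = t[0] * n0;                /* makes t + m*np divisible by 2^64 */
        c = ((u128)m * np[0] + t[0]) >> 64;
        for (int j = 1; j < num; j++) {        /* t = (t + m*np) >> 64 */
            c += (u128)m * np[j] + t[j];
            t[j - 1] = (uint64_t)c;
            c >>= 64;
        }
        c += t[num];
        t[num - 1] = (uint64_t)c;
        t[num] = t_hi + (uint64_t)(c >> 64);
    }

    /* conditional subtraction, as in L$sub/L$copy: keep t when t < np */
    for (int j = 0; j < num; j++) {
        u128 d = (u128)t[j] - np[j] - borrow;
        r[j] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
    }
    uint64_t keep_t = (uint64_t)0 - (uint64_t)(borrow > t[num]); /* all-ones iff t < np */
    for (int j = 0; j < num; j++)
        rp[j] = (t[j] & keep_t) | (r[j] & ~keep_t);
}
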
diff --git a/mac-x86_64/crypto/bn/x86_64-mont5.S b/mac-x86_64/crypto/bn/x86_64-mont5.S
new file mode 100644
index 0000000..2e8f469
--- /dev/null
+++ b/mac-x86_64/crypto/bn/x86_64-mont5.S
@@ -0,0 +1,1822 @@
+#if defined(__x86_64__)
+.text
+
+
+
+.globl _bn_mul_mont_gather5
+.private_extern _bn_mul_mont_gather5
+
+.p2align 6
+_bn_mul_mont_gather5:
+ testl $7,%r9d
+ jnz L$mul_enter
+ jmp L$mul4x_enter
+
+.p2align 4
+L$mul_enter:
+ movl %r9d,%r9d
+ movq %rsp,%rax
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+L$mul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq L$magic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$1st_enter
+
+.p2align 4
+L$1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$1st
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp L$outer
+.p2align 4
+L$outer:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp L$inner_enter
+
+.p2align 4
+L$inner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+L$inner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne L$inner
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb L$outer
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp L$sub
+.p2align 4
+L$sub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz L$sub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ movq %r9,%r15
+.p2align 4
+L$copy:
+ movq (%rsp,%r14,8),%rsi
+ movq (%rdi,%r14,8),%rcx
+ xorq %rcx,%rsi
+ andq %rax,%rsi
+ xorq %rcx,%rsi
+ movq %r14,(%rsp,%r14,8)
+ movq %rsi,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz L$copy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$mul_epilogue:
+ .byte 0xf3,0xc3
+
+
+.p2align 5
+bn_mul4x_mont_gather5:
+L$mul4x_enter:
+.byte 0x67
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.byte 0x67
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shll $3+2,%r10d
+ negq %r9
+
+
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$mul4xsp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,2),%rsp
+ jmp L$mul4xsp_done
+
+.p2align 5
+L$mul4xsp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+L$mul4xsp_done:
+ andq $-64,%rsp
+ negq %r9
+
+ movq %rax,40(%rsp)
+L$mul4x_body:
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$mul4x_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+mul4x_internal:
+ shlq $5,%r9
+ movl 8(%rax),%r10d
+ leaq 256(%rdx,%r9,1),%r13
+ shrq $5,%r9
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq L$magic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%rdx,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ addq $7,%r11
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+ andq $7,%r11
+
+ movq -96(%r12),%xmm0
+ leaq 256(%r12),%r14
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+.byte 0x67
+ por %xmm1,%xmm0
+ movq -96(%r14),%xmm1
+.byte 0x67
+ pand %xmm7,%xmm3
+.byte 0x67
+ por %xmm2,%xmm0
+ movq -32(%r14),%xmm2
+.byte 0x67
+ pand %xmm4,%xmm1
+.byte 0x67
+ por %xmm3,%xmm0
+ movq 32(%r14),%xmm3
+
+.byte 102,72,15,126,195
+ movq 96(%r14),%xmm0
+ movq %r13,16+8(%rsp)
+ movq %rdi,56+8(%rsp)
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+ leaq (%rsi,%r9,1),%rsi
+ negq %r9
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ pand %xmm5,%xmm2
+ pand %xmm6,%xmm3
+ por %xmm2,%xmm1
+
+ imulq %r10,%rbp
+
+
+
+
+
+
+
+ leaq 64+8(%rsp,%r11,8),%r14
+ movq %rdx,%r11
+
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ leaq 512(%r12),%r12
+ por %xmm1,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+ jmp L$1st4x
+
+.p2align 5
+L$1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz L$1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ leaq (%rcx,%r9,2),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ jmp L$outer4x
+
+.p2align 5
+L$outer4x:
+ movq (%r14,%r9,1),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+
+ imulq %r10,%rbp
+.byte 0x67
+ movq %rdx,%r11
+ movq %rdi,(%r14)
+
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq (%r14,%r9,1),%r14
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp L$inner4x
+
+.p2align 5
+L$inner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ addq (%r14),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz L$inner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq %rbp,%rax
+ movq -16(%rcx),%rbp
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ movq %rdi,-16(%r14)
+ leaq (%rcx,%r9,2),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%r14),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ cmpq 16+8(%rsp),%r12
+ jb L$outer4x
+ subq %r13,%rbp
+ adcq %r15,%r15
+ orq %r15,%rdi
+ xorq $1,%rdi
+ leaq (%r14,%r9,1),%rbx
+ leaq (%rcx,%rdi,8),%rbp
+ movq %r9,%rcx
+ sarq $3+2,%rcx
+ movq 56+8(%rsp),%rdi
+ jmp L$sqr4x_sub
+
+.globl _bn_power5
+.private_extern _bn_power5
+
+.p2align 5
+_bn_power5:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shll $3+2,%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$pwr_sp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,2),%rsp
+ jmp L$pwr_sp_done
+
+.p2align 5
+L$pwr_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+L$pwr_sp_done:
+ andq $-64,%rsp
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+L$power5_body:
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq %rsi,%rdi
+ movq 40(%rsp),%rax
+ leaq 32(%rsp),%r8
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$power5_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _bn_sqr8x_internal
+.private_extern _bn_sqr8x_internal
+.private_extern _bn_sqr8x_internal
+
+.p2align 5
+_bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r11,-16(%rdi,%rbp,1)
+ movq %rdx,%r10
+
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ leaq (%rbp),%rcx
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp L$sqr4x_1st
+
+.p2align 5
+L$sqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq 16(%rsi,%rcx,1),%rbx
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %r10,8(%rdi,%rcx,1)
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 24(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,16(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+ leaq 32(%rcx),%rcx
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne L$sqr4x_1st
+
+ mulq %r15
+ addq %rax,%r13
+ leaq 16(%rbp),%rbp
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+ jmp L$sqr4x_outer
+
+.p2align 5
+L$sqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq -24(%rdi,%rbp,1),%r10
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r10,-24(%rdi,%rbp,1)
+ movq %rdx,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -16(%rdi,%rbp,1),%r11
+ movq %rdx,%r10
+ adcq $0,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ xorq %r12,%r12
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -8(%rdi,%rbp,1),%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rbp,1)
+
+ leaq (%rbp),%rcx
+ jmp L$sqr4x_inner
+
+.p2align 5
+L$sqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+
+.byte 0x67
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %r11,(%rdi,%rcx,1)
+ movq %rbx,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ leaq 16(%rcx),%rcx
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne L$sqr4x_inner
+
+.byte 0x67
+ mulq %r15
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ addq $16,%rbp
+ jnz L$sqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq %r10,-24(%rdi)
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ movq -8(%rsi),%rbx
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,-16(%rdi)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi)
+
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 48+8(%rsp),%rdi
+ xorq %r10,%r10
+ movq 8(%rdi),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ jmp L$sqr4x_shift_n_add
+
+.p2align 5
+L$sqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ addq $32,%rbp
+ jnz L$sqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+.byte 0x67
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+.byte 102,72,15,126,213
+sqr8x_reduction:
+ xorq %rax,%rax
+ leaq (%rbp,%r9,2),%rcx
+ leaq 48+8(%rsp,%r9,2),%rdx
+ movq %rcx,0+8(%rsp)
+ leaq 48+8(%rsp,%r9,1),%rdi
+ movq %rdx,8+8(%rsp)
+ negq %r9
+ jmp L$8x_reduction_loop
+
+.p2align 5
+L$8x_reduction_loop:
+ leaq (%rdi,%r9,1),%rdi
+.byte 0x66
+ movq 0(%rdi),%rbx
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,(%rdx)
+ leaq 64(%rdi),%rdi
+
+.byte 0x67
+ movq %rbx,%r8
+ imulq 32+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp L$8x_reduce
+
+.p2align 5
+L$8x_reduce:
+ mulq %rbx
+ movq 16(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rbx,48-8+8(%rsp,%rcx,8)
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq 32+8(%rsp),%rsi
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 64(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 80(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 96(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 112(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz L$8x_reduce
+
+ leaq 128(%rbp),%rbp
+ xorq %rax,%rax
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae L$8x_no_tail
+
+.byte 0x66
+ addq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movq 48+56+8(%rsp),%rbx
+ movl $8,%ecx
+ movq 0(%rbp),%rax
+ jmp L$8x_tail
+
+.p2align 5
+L$8x_tail:
+ mulq %rbx
+ addq %rax,%r8
+ movq 16(%rbp),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ leaq 8(%rdi),%rdi
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 64(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 80(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 96(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 112(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq 48-16+8(%rsp,%rcx,8),%rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq 0(%rbp),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz L$8x_tail
+
+ leaq 128(%rbp),%rbp
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae L$8x_tail_done
+
+ movq 48+56+8(%rsp),%rbx
+ negq %rsi
+ movq 0(%rbp),%rax
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movl $8,%ecx
+ jmp L$8x_tail
+
+.p2align 5
+L$8x_tail_done:
+ addq (%rdx),%r8
+ xorq %rax,%rax
+
+ negq %rsi
+L$8x_no_tail:
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+ movq -16(%rbp),%rcx
+ xorq %rsi,%rsi
+
+.byte 102,72,15,126,213
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+.byte 102,73,15,126,217
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+ leaq 64(%rdi),%rdi
+
+ cmpq %rdx,%rdi
+ jb L$8x_reduction_loop
+
+ subq %r15,%rcx
+ leaq (%rdi,%r9,1),%rbx
+ adcq %rsi,%rsi
+ movq %r9,%rcx
+ orq %rsi,%rax
+.byte 102,72,15,126,207
+ xorq $1,%rax
+.byte 102,72,15,126,206
+ leaq (%rbp,%rax,8),%rbp
+ sarq $3+2,%rcx
+ jmp L$sqr4x_sub
+
+.p2align 5
+L$sqr4x_sub:
+.byte 0x66
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ sbbq 0(%rbp),%r12
+ movq 16(%rbx),%r14
+ sbbq 16(%rbp),%r13
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 32(%rbp),%r14
+ movq %r12,0(%rdi)
+ sbbq 48(%rbp),%r15
+ leaq 64(%rbp),%rbp
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+
+ incq %rcx
+ jnz L$sqr4x_sub
+ movq %r9,%r10
+ negq %r9
+ .byte 0xf3,0xc3
+
+.globl _bn_from_montgomery
+.private_extern _bn_from_montgomery
+
+.p2align 5
+_bn_from_montgomery:
+ testl $7,%r9d
+ jz bn_from_mont8x
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+bn_from_mont8x:
+.byte 0x67
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.byte 0x67
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shll $3+2,%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb L$from_sp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,2),%rsp
+ jmp L$from_sp_done
+
+.p2align 5
+L$from_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+L$from_sp_done:
+ andq $-64,%rsp
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+L$from_body:
+ movq %r9,%r11
+ leaq 48(%rsp),%rax
+ pxor %xmm0,%xmm0
+ jmp L$mul_by_1
+
+.p2align 5
+L$mul_by_1:
+ movdqu (%rsi),%xmm1
+ movdqu 16(%rsi),%xmm2
+ movdqu 32(%rsi),%xmm3
+ movdqa %xmm0,(%rax,%r9,1)
+ movdqu 48(%rsi),%xmm4
+ movdqa %xmm0,16(%rax,%r9,1)
+.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
+ movdqa %xmm1,(%rax)
+ movdqa %xmm0,32(%rax,%r9,1)
+ movdqa %xmm2,16(%rax)
+ movdqa %xmm0,48(%rax,%r9,1)
+ movdqa %xmm3,32(%rax)
+ movdqa %xmm4,48(%rax)
+ leaq 64(%rax),%rax
+ subq $64,%r11
+ jnz L$mul_by_1
+
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 0x67
+ movq %rcx,%rbp
+.byte 102,73,15,110,218
+ call sqr8x_reduction
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ movq 40(%rsp),%rsi
+ jmp L$from_mont_zero
+
+.p2align 5
+L$from_mont_zero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ movdqa %xmm0,32(%rax)
+ movdqa %xmm0,48(%rax)
+ leaq 64(%rax),%rax
+ subq $32,%r9
+ jnz L$from_mont_zero
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$from_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _bn_scatter5
+.private_extern _bn_scatter5
+
+.p2align 4
+_bn_scatter5:
+ cmpl $0,%esi
+ jz L$scatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+L$scatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subl $1,%esi
+ jnz L$scatter
+L$scatter_epilogue:
+ .byte 0xf3,0xc3
+
+
+.globl _bn_gather5
+.private_extern _bn_gather5
+
+.p2align 4
+_bn_gather5:
+ movl %ecx,%r11d
+ shrl $3,%ecx
+ andq $7,%r11
+ notl %ecx
+ leaq L$magic_masks(%rip),%rax
+ andl $3,%ecx
+ leaq 128(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp L$gather
+.p2align 4
+L$gather:
+ movq -128(%rdx),%xmm0
+ movq -64(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 0(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 64(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+.byte 0x67,0x67
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subl $1,%esi
+ jnz L$gather
+ .byte 0xf3,0xc3
+L$SEH_end_bn_gather5:
+
+.p2align 6
+L$magic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif
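
The gather5 paths above use the L$magic_masks constants to blend candidate table qwords with pand/por rather than loading only the entry the secret window value names. A rough C sketch of the underlying mask-select idea, written as a generic loop over all 32 window entries; it shows the masking technique only and does not reproduce this code's exact addressing:

#include <stdint.h>

/* Select entry `idx` (0..31) of a window table of `num`-word values by
 * masking every candidate instead of branching or indexing on `idx`. */
static void gather_sketch(uint64_t *out, const uint64_t *table, int num, uint64_t idx) {
    for (int j = 0; j < num; j++)
        out[j] = 0;
    for (uint64_t k = 0; k < 32; k++) {
        uint64_t mask = (uint64_t)0 - (((k ^ idx) - 1) >> 63); /* all-ones iff k == idx */
        for (int j = 0; j < num; j++)
            out[j] |= table[k * num + j] & mask;
    }
}
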
diff --git a/mac-x86_64/crypto/cpu-x86_64-asm.S b/mac-x86_64/crypto/cpu-x86_64-asm.S
new file mode 100644
index 0000000..faf4e2f
--- /dev/null
+++ b/mac-x86_64/crypto/cpu-x86_64-asm.S
@@ -0,0 +1,147 @@
+#if defined(__x86_64__)
+.text
+
+.globl _OPENSSL_ia32_cpuid
+.private_extern _OPENSSL_ia32_cpuid
+
+.p2align 4
+_OPENSSL_ia32_cpuid:
+
+
+ movq %rdi,%rdi
+ movq %rbx,%r8
+
+ xorl %eax,%eax
+ movl %eax,8(%rdi)
+ cpuid
+ movl %eax,%r11d
+
+ xorl %eax,%eax
+ cmpl $1970169159,%ebx
+ setne %al
+ movl %eax,%r9d
+ cmpl $1231384169,%edx
+ setne %al
+ orl %eax,%r9d
+ cmpl $1818588270,%ecx
+ setne %al
+ orl %eax,%r9d
+ jz L$intel
+
+ cmpl $1752462657,%ebx
+ setne %al
+ movl %eax,%r10d
+ cmpl $1769238117,%edx
+ setne %al
+ orl %eax,%r10d
+ cmpl $1145913699,%ecx
+ setne %al
+ orl %eax,%r10d
+ jnz L$intel
+
+
+
+
+ movl $2147483648,%eax
+ cpuid
+
+
+ cmpl $2147483649,%eax
+ jb L$intel
+ movl %eax,%r10d
+ movl $2147483649,%eax
+ cpuid
+
+
+ orl %ecx,%r9d
+ andl $2049,%r9d
+
+ cmpl $2147483656,%r10d
+ jb L$intel
+
+ movl $2147483656,%eax
+ cpuid
+
+ movzbq %cl,%r10
+ incq %r10
+
+ movl $1,%eax
+ cpuid
+
+ btl $28,%edx
+ jnc L$generic
+ shrl $16,%ebx
+ cmpb %r10b,%bl
+ ja L$generic
+ andl $4026531839,%edx
+ jmp L$generic
+
+L$intel:
+ cmpl $4,%r11d
+ movl $-1,%r10d
+ jb L$nocacheinfo
+
+ movl $4,%eax
+ movl $0,%ecx
+ cpuid
+ movl %eax,%r10d
+ shrl $14,%r10d
+ andl $4095,%r10d
+
+ cmpl $7,%r11d
+ jb L$nocacheinfo
+
+ movl $7,%eax
+ xorl %ecx,%ecx
+ cpuid
+ movl %ebx,8(%rdi)
+
+L$nocacheinfo:
+ movl $1,%eax
+ cpuid
+
+ andl $3220176895,%edx
+ cmpl $0,%r9d
+ jne L$notintel
+ orl $1073741824,%edx
+ andb $15,%ah
+ cmpb $15,%ah
+ jne L$notintel
+ orl $1048576,%edx
+L$notintel:
+ btl $28,%edx
+ jnc L$generic
+ andl $4026531839,%edx
+ cmpl $0,%r10d
+ je L$generic
+
+ orl $268435456,%edx
+ shrl $16,%ebx
+ cmpb $1,%bl
+ ja L$generic
+ andl $4026531839,%edx
+L$generic:
+ andl $2048,%r9d
+ andl $4294965247,%ecx
+ orl %ecx,%r9d
+
+ movl %edx,%r10d
+ btl $27,%r9d
+ jnc L$clear_avx
+ xorl %ecx,%ecx
+.byte 0x0f,0x01,0xd0
+ andl $6,%eax
+ cmpl $6,%eax
+ je L$done
+L$clear_avx:
+ movl $4026525695,%eax
+ andl %eax,%r9d
+ andl $4294967263,8(%rdi)
+L$done:
+ movl %r9d,4(%rdi)
+ movl %r10d,0(%rdi)
+ movq %r8,%rbx
+ .byte 0xf3,0xc3
+
+
+#endif
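
OPENSSL_ia32_cpuid above probes the vendor string, cache topology and feature leaves, and only leaves the AVX capability visible after confirming via XGETBV (the raw `.byte 0x0f,0x01,0xd0`) that the OS saves the XMM/YMM state. A small C sketch of that OSXSAVE/XGETBV check, assuming GCC or Clang and <cpuid.h>; the function name is illustrative, not BoringSSL's API:

#include <cpuid.h>
#include <stdint.h>

static int avx_usable(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    int osxsave = (ecx >> 27) & 1;               /* OS uses XSAVE/XRSTOR */
    int avx     = (ecx >> 28) & 1;               /* CPU advertises AVX   */
    if (!osxsave || !avx)
        return 0;
    uint32_t xcr0_lo, xcr0_hi;
    __asm__ volatile(".byte 0x0f,0x01,0xd0"      /* xgetbv, as in the assembly */
                     : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
    return (xcr0_lo & 6) == 6;                   /* XMM and YMM state enabled */
}
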
diff --git a/mac-x86_64/crypto/md5/md5-x86_64.S b/mac-x86_64/crypto/md5/md5-x86_64.S
new file mode 100644
index 0000000..1e61479
--- /dev/null
+++ b/mac-x86_64/crypto/md5/md5-x86_64.S
@@ -0,0 +1,671 @@
+#if defined(__x86_64__)
+.text
+.p2align 4
+
+.globl _md5_block_asm_data_order
+.private_extern _md5_block_asm_data_order
+
+_md5_block_asm_data_order:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r14
+ pushq %r15
+L$prologue:
+
+
+
+
+ movq %rdi,%rbp
+ shlq $6,%rdx
+ leaq (%rsi,%rdx,1),%rdi
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+
+
+
+
+
+
+ cmpq %rdi,%rsi
+ je L$end
+
+
+L$loop:
+ movl %eax,%r8d
+ movl %ebx,%r9d
+ movl %ecx,%r14d
+ movl %edx,%r15d
+ movl 0(%rsi),%r10d
+ movl %edx,%r11d
+ xorl %ecx,%r11d
+ leal -680876936(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 4(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -389564586(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal 606105819(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1044525330(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 16(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal -176418897(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 20(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal 1200080426(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1473231341(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -45705983(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 32(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1770035416(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 36(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -1958414417(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -42063(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1990404162(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 48(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1804603682(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 52(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -40341101(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1502002290(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal 1236535329(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 0(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ movl 4(%rsi),%r10d
+ movl %edx,%r11d
+ movl %edx,%r12d
+ notl %r11d
+ leal -165796510(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1069501632(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 643717713(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -373897302(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 20(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -701558691(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal 38016083(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -660478335(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 16(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -405537848(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 36(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal 568446438(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1019803690(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -187363961(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 32(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal 1163531501(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 52(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -1444681467(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -51403784(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 1735328473(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 48(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -1926607734(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ movl 20(%rsi),%r10d
+ movl %ecx,%r11d
+ leal -378558(%rax,%r10,1),%eax
+ movl 32(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -2022574463(%rdx,%r10,1),%edx
+ movl 44(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 1839030562(%rcx,%r10,1),%ecx
+ movl 56(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -35309556(%rbx,%r10,1),%ebx
+ movl 4(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -1530992060(%rax,%r10,1),%eax
+ movl 16(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal 1272893353(%rdx,%r10,1),%edx
+ movl 28(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -155497632(%rcx,%r10,1),%ecx
+ movl 40(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -1094730640(%rbx,%r10,1),%ebx
+ movl 52(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal 681279174(%rax,%r10,1),%eax
+ movl 0(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -358537222(%rdx,%r10,1),%edx
+ movl 12(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -722521979(%rcx,%r10,1),%ecx
+ movl 24(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal 76029189(%rbx,%r10,1),%ebx
+ movl 36(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -640364487(%rax,%r10,1),%eax
+ movl 48(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -421815835(%rdx,%r10,1),%edx
+ movl 60(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 530742520(%rcx,%r10,1),%ecx
+ movl 8(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -995338651(%rbx,%r10,1),%ebx
+ movl 0(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ movl 0(%rsi),%r10d
+ movl $4294967295,%r11d
+ xorl %edx,%r11d
+ leal -198630844(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 28(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal 1126891415(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 56(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1416354905(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 20(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -57434055(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 48(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1700485571(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 12(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1894986606(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 40(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1051523(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 4(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -2054922799(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 32(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1873313359(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 60(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -30611744(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 24(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1560198380(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 52(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal 1309151649(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 16(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal -145523070(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 44(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1120210379(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 8(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal 718787259(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 36(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -343485551(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 0(%rsi),%r10d
+ movl $4294967295,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+
+ addl %r8d,%eax
+ addl %r9d,%ebx
+ addl %r14d,%ecx
+ addl %r15d,%edx
+
+
+ addq $64,%rsi
+ cmpq %rdi,%rsi
+ jb L$loop
+
+
+L$end:
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ movq (%rsp),%r15
+ movq 8(%rsp),%r14
+ movq 16(%rsp),%r12
+ movq 24(%rsp),%rbx
+ movq 32(%rsp),%rbp
+ addq $40,%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+#endif
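
For orientation: the scalar loop that ends above is the standard MD5 compression function with all four rounds fully unrolled. The signed immediates in the leal instructions are the 32-bit MD5 T constants, the roll counts are the per-round rotation amounts, and the notl/andl/orl and xorl sequences are the round functions G, H and I expanded inline. A minimal C sketch of a single step follows; the names are illustrative only and do not appear in this file.

#include <stdint.h>

/* MD5 auxiliary functions; the assembly above inlines one of these per step. */
static uint32_t F(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (~x & z); }
static uint32_t G(uint32_t x, uint32_t y, uint32_t z) { return (x & z) | (y & ~z); }
static uint32_t H(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
static uint32_t I(uint32_t x, uint32_t y, uint32_t z) { return y ^ (x | ~z); }

static uint32_t rotl32(uint32_t v, int s) { return (v << s) | (v >> (32 - s)); }

/* One step: a = b + rotl(a + f(b,c,d) + X[k] + T[i], s).  Each
 * leal/addl/roll/addl group in the assembly is this expression unrolled. */
static uint32_t md5_step(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                         uint32_t xk, uint32_t t, int s,
                         uint32_t (*f)(uint32_t, uint32_t, uint32_t)) {
  return b + rotl32(a + f(b, c, d) + xk + t, s);
}
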
diff --git a/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S b/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S
new file mode 100644
index 0000000..21d5ad6
--- /dev/null
+++ b/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S
@@ -0,0 +1,19 @@
+#if defined(__x86_64__)
+.text
+
+.globl _aesni_gcm_encrypt
+.private_extern _aesni_gcm_encrypt
+
+_aesni_gcm_encrypt:
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+
+
+.globl _aesni_gcm_decrypt
+.private_extern _aesni_gcm_decrypt
+
+_aesni_gcm_decrypt:
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+
+#endif
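
The two globals in this file are deliberate stubs: each clears %eax and returns (the 0xf3,0xc3 bytes encode rep ret), so the fused AES-NI/GCM routine reports that it processed nothing and callers can fall back to the separate AES-NI and GHASH code. A rough C equivalent is sketched below; the parameter list is illustrative only, not taken from this tree.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the stub behaviour: touch nothing, return 0 so the caller
 * treats zero bytes as handled and uses the non-fused path instead. */
size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                         const void *key, uint8_t ivec[16], uint64_t *Xi) {
  (void)in; (void)out; (void)len; (void)key; (void)ivec; (void)Xi;
  return 0;
}
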
diff --git a/mac-x86_64/crypto/modes/ghash-x86_64.S b/mac-x86_64/crypto/modes/ghash-x86_64.S
new file mode 100644
index 0000000..305a91c
--- /dev/null
+++ b/mac-x86_64/crypto/modes/ghash-x86_64.S
@@ -0,0 +1,1328 @@
+#if defined(__x86_64__)
+.text
+
+
+.globl _gcm_gmult_4bit
+.private_extern _gcm_gmult_4bit
+
+.p2align 4
+_gcm_gmult_4bit:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+L$gmult_prologue:
+
+ movzbq 15(%rdi),%r8
+ leaq L$rem_4bit(%rip),%r11
+ xorq %rax,%rax
+ xorq %rbx,%rbx
+ movb %r8b,%al
+ movb %r8b,%bl
+ shlb $4,%al
+ movq $14,%rcx
+ movq 8(%rsi,%rax,1),%r8
+ movq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ movq %r8,%rdx
+ jmp L$oop1
+
+.p2align 4
+L$oop1:
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ movb (%rdi,%rcx,1),%al
+ shrq $4,%r9
+ xorq 8(%rsi,%rbx,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rbx,1),%r9
+ movb %al,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ shlb $4,%al
+ xorq %r10,%r8
+ decq %rcx
+ js L$break1
+
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rax,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ xorq %r10,%r8
+ jmp L$oop1
+
+.p2align 4
+L$break1:
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rax,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rax,1),%r9
+ andb $240,%bl
+ xorq (%r11,%rdx,8),%r9
+ movq %r8,%rdx
+ xorq %r10,%r8
+
+ shrq $4,%r8
+ andq $15,%rdx
+ movq %r9,%r10
+ shrq $4,%r9
+ xorq 8(%rsi,%rbx,1),%r8
+ shlq $60,%r10
+ xorq (%rsi,%rbx,1),%r9
+ xorq %r10,%r8
+ xorq (%r11,%rdx,8),%r9
+
+ bswapq %r8
+ bswapq %r9
+ movq %r8,8(%rdi)
+ movq %r9,(%rdi)
+
+ movq 16(%rsp),%rbx
+ leaq 24(%rsp),%rsp
+L$gmult_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _gcm_ghash_4bit
+.private_extern _gcm_ghash_4bit
+
+.p2align 4
+_gcm_ghash_4bit:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $280,%rsp
+L$ghash_prologue:
+ movq %rdx,%r14
+ movq %rcx,%r15
+ subq $-128,%rsi
+ leaq 16+128(%rsp),%rbp
+ xorl %edx,%edx
+ movq 0+0-128(%rsi),%r8
+ movq 0+8-128(%rsi),%rax
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq 16+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq 16+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,0(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,0(%rbp)
+ movq 32+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,0-128(%rbp)
+ movq 32+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,1(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,8(%rbp)
+ movq 48+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,8-128(%rbp)
+ movq 48+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,2(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,16(%rbp)
+ movq 64+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,16-128(%rbp)
+ movq 64+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,3(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,24(%rbp)
+ movq 80+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,24-128(%rbp)
+ movq 80+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,4(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,32(%rbp)
+ movq 96+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,32-128(%rbp)
+ movq 96+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,5(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,40(%rbp)
+ movq 112+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,40-128(%rbp)
+ movq 112+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,6(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,48(%rbp)
+ movq 128+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,48-128(%rbp)
+ movq 128+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,7(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,56(%rbp)
+ movq 144+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,56-128(%rbp)
+ movq 144+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,8(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,64(%rbp)
+ movq 160+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,64-128(%rbp)
+ movq 160+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,9(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,72(%rbp)
+ movq 176+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,72-128(%rbp)
+ movq 176+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,10(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,80(%rbp)
+ movq 192+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,80-128(%rbp)
+ movq 192+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,11(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,88(%rbp)
+ movq 208+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,88-128(%rbp)
+ movq 208+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,12(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,96(%rbp)
+ movq 224+0-128(%rsi),%r8
+ shlb $4,%dl
+ movq %rax,96-128(%rbp)
+ movq 224+8-128(%rsi),%rax
+ shlq $60,%r10
+ movb %dl,13(%rsp)
+ orq %r10,%rbx
+ movb %al,%dl
+ shrq $4,%rax
+ movq %r8,%r10
+ shrq $4,%r8
+ movq %r9,104(%rbp)
+ movq 240+0-128(%rsi),%r9
+ shlb $4,%dl
+ movq %rbx,104-128(%rbp)
+ movq 240+8-128(%rsi),%rbx
+ shlq $60,%r10
+ movb %dl,14(%rsp)
+ orq %r10,%rax
+ movb %bl,%dl
+ shrq $4,%rbx
+ movq %r9,%r10
+ shrq $4,%r9
+ movq %r8,112(%rbp)
+ shlb $4,%dl
+ movq %rax,112-128(%rbp)
+ shlq $60,%r10
+ movb %dl,15(%rsp)
+ orq %r10,%rbx
+ movq %r9,120(%rbp)
+ movq %rbx,120-128(%rbp)
+ addq $-128,%rsi
+ movq 8(%rdi),%r8
+ movq 0(%rdi),%r9
+ addq %r14,%r15
+ leaq L$rem_8bit(%rip),%r11
+ jmp L$outer_loop
+.p2align 4
+L$outer_loop:
+ xorq (%r14),%r9
+ movq 8(%r14),%rdx
+ leaq 16(%r14),%r14
+ xorq %r8,%rdx
+ movq %r9,(%rdi)
+ movq %rdx,8(%rdi)
+ shrq $32,%rdx
+ xorq %rax,%rax
+ roll $8,%edx
+ movb %dl,%al
+ movzbl %dl,%ebx
+ shlb $4,%al
+ shrl $4,%ebx
+ roll $8,%edx
+ movq 8(%rsi,%rax,1),%r8
+ movq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ xorq %r8,%r12
+ movq %r9,%r10
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 8(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 4(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl 0(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ shrl $4,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r12,2),%r12
+ movzbl %dl,%ebx
+ shlb $4,%al
+ movzbq (%rsp,%rcx,1),%r13
+ shrl $4,%ebx
+ shlq $48,%r12
+ xorq %r8,%r13
+ movq %r9,%r10
+ xorq %r12,%r9
+ shrq $8,%r8
+ movzbq %r13b,%r13
+ shrq $8,%r9
+ xorq -128(%rbp,%rcx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rcx,8),%r9
+ roll $8,%edx
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ movb %dl,%al
+ xorq %r10,%r8
+ movzwq (%r11,%r13,2),%r13
+ movzbl %dl,%ecx
+ shlb $4,%al
+ movzbq (%rsp,%rbx,1),%r12
+ andl $240,%ecx
+ shlq $48,%r13
+ xorq %r8,%r12
+ movq %r9,%r10
+ xorq %r13,%r9
+ shrq $8,%r8
+ movzbq %r12b,%r12
+ movl -4(%rdi),%edx
+ shrq $8,%r9
+ xorq -128(%rbp,%rbx,8),%r8
+ shlq $56,%r10
+ xorq (%rbp,%rbx,8),%r9
+ movzwq (%r11,%r12,2),%r12
+ xorq 8(%rsi,%rax,1),%r8
+ xorq (%rsi,%rax,1),%r9
+ shlq $48,%r12
+ xorq %r10,%r8
+ xorq %r12,%r9
+ movzbq %r8b,%r13
+ shrq $4,%r8
+ movq %r9,%r10
+ shlb $4,%r13b
+ shrq $4,%r9
+ xorq 8(%rsi,%rcx,1),%r8
+ movzwq (%r11,%r13,2),%r13
+ shlq $60,%r10
+ xorq (%rsi,%rcx,1),%r9
+ xorq %r10,%r8
+ shlq $48,%r13
+ bswapq %r8
+ xorq %r13,%r9
+ bswapq %r9
+ cmpq %r15,%r14
+ jb L$outer_loop
+ movq %r8,8(%rdi)
+ movq %r9,(%rdi)
+
+ leaq 280(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$ghash_epilogue:
+ .byte 0xf3,0xc3
+
+.globl _gcm_init_clmul
+.private_extern _gcm_init_clmul
+
+.p2align 4
+_gcm_init_clmul:
+L$_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+
+
+ pand L$0x1c2_polynomial(%rip),%xmm5
+ pxor %xmm5,%xmm2
+
+
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
+ .byte 0xf3,0xc3
+
+.globl _gcm_gmult_clmul
+.private_extern _gcm_gmult_clmul
+
+.p2align 4
+_gcm_gmult_clmul:
+L$_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa L$bswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+
+.globl _gcm_ghash_clmul
+.private_extern _gcm_ghash_clmul
+
+.p2align 5
+_gcm_ghash_clmul:
+L$_ghash_clmul:
+ movdqa L$bswap_mask(%rip),%xmm10
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
+
+ subq $16,%rcx
+ jz L$odd_tail
+
+ movdqu 16(%rsi),%xmm6
+ movl _OPENSSL_ia32cap_P+4(%rip),%eax
+ cmpq $48,%rcx
+ jb L$skip4x
+
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je L$skip4x
+
+ subq $48,%rcx
+ movq $11547335547999543296,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
+
+
+
+
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jc L$tail4x
+
+ jmp L$mod4_loop
+.p2align 5
+L$mod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
+.byte 102,68,15,58,68,218,0
+ pshufd $78,%xmm3,%xmm4
+
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
+.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa L$7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $64,%rcx
+ jnc L$mod4_loop
+
+L$tail4x:
+.byte 102,65,15,58,68,199,0
+.byte 102,65,15,58,68,207,17
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $64,%rcx
+ jz L$done
+ movdqu 32(%rsi),%xmm7
+ subq $16,%rcx
+ jz L$odd_tail
+L$skip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ leaq 32(%rdx),%rdx
+ nop
+ subq $32,%rcx
+ jbe L$even_tail
+ nop
+ jmp L$mod_loop
+
+.p2align 5
+L$mod_loop:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm8
+ pslldq $8,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
+ psrlq $1,%xmm0
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+
+ subq $32,%rcx
+ ja L$mod_loop
+
+L$even_tail:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testq %rcx,%rcx
+ jnz L$done
+
+L$odd_tail:
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,223,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+L$done:
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+
+.globl _gcm_init_avx
+.private_extern _gcm_init_avx
+
+.p2align 5
+_gcm_init_avx:
+ jmp L$_init_clmul
+
+.globl _gcm_gmult_avx
+.private_extern _gcm_gmult_avx
+
+.p2align 5
+_gcm_gmult_avx:
+ jmp L$_gmult_clmul
+
+.globl _gcm_ghash_avx
+.private_extern _gcm_ghash_avx
+
+.p2align 5
+_gcm_ghash_avx:
+ jmp L$_ghash_clmul
+
+.p2align 6
+L$bswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+L$0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+L$7_mask:
+.long 7,0,7,0
+L$7_mask_poly:
+.long 7,0,450,0
+.p2align 6
+
+L$rem_4bit:
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+
+L$rem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+#endif
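
Everything in this file computes the same primitive: multiplication of the 128-bit hash value by H in GF(2^128) with the GCM polynomial x^128 + x^7 + x^2 + x + 1. gcm_gmult_4bit and gcm_ghash_4bit use precomputed multiple tables (L$rem_4bit and L$rem_8bit supply the reduction terms), the _clmul routines use PCLMULQDQ (the .byte 102,15,58,68 sequences) with a Karatsuba split and the L$0x1c2_polynomial constant for reduction, and the _avx entry points simply jump to the CLMUL code. For reference only, the bit-at-a-time multiplication from NIST SP 800-38D can be written in C as below; it is far slower than either path above and the helper names are illustrative.

#include <stdint.h>

/* Big-endian 64-bit load/store helpers for 16-byte GHASH blocks. */
static uint64_t load_be64(const uint8_t *p) {
  uint64_t v = 0;
  for (int i = 0; i < 8; i++) v = (v << 8) | p[i];
  return v;
}
static void store_be64(uint8_t *p, uint64_t v) {
  for (int i = 7; i >= 0; i--) { p[i] = (uint8_t)v; v >>= 8; }
}

/* out = X * H in GF(2^128), GCM bit ordering (bit 0 = MSB of byte 0). */
static void gf128_mul(uint8_t out[16], const uint8_t X[16], const uint8_t H[16]) {
  uint64_t vh = load_be64(H), vl = load_be64(H + 8);
  uint64_t zh = 0, zl = 0;
  for (int i = 0; i < 128; i++) {
    if ((X[i >> 3] >> (7 - (i & 7))) & 1) {  /* bit i of X set? */
      zh ^= vh;
      zl ^= vl;
    }
    int carry = (int)(vl & 1);               /* bit about to fall off V */
    vl = (vl >> 1) | (vh << 63);             /* V >>= 1 */
    vh >>= 1;
    if (carry) {
      vh ^= 0xe100000000000000ULL;           /* reduce by the GCM polynomial */
    }
  }
  store_be64(out, zh);
  store_be64(out + 8, zl);
}
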
diff --git a/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S b/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S
new file mode 100644
index 0000000..31ee7d2
--- /dev/null
+++ b/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S
@@ -0,0 +1,1262 @@
+#if defined(__x86_64__)
+.text
+.p2align 4
+
+.globl _rc4_md5_enc
+.private_extern _rc4_md5_enc
+
+_rc4_md5_enc:
+ cmpq $0,%r9
+ je L$abort
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $40,%rsp
+L$body:
+ movq %rcx,%r11
+ movq %r9,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ movq %r8,%r15
+ xorq %rbp,%rbp
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%bpl
+ movb -4(%rdi),%cl
+
+ incb %bpl
+ subq %r13,%r14
+ movl (%rdi,%rbp,4),%eax
+ addb %al,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ shlq $6,%r12
+ addq %r15,%r12
+ movq %r12,16(%rsp)
+
+ movq %r11,24(%rsp)
+ movl 0(%r11),%r8d
+ movl 4(%r11),%r9d
+ movl 8(%r11),%r10d
+ movl 12(%r11),%r11d
+ jmp L$oop
+
+.p2align 4
+L$oop:
+ movl %r8d,0(%rsp)
+ movl %r9d,4(%rsp)
+ movl %r10d,8(%rsp)
+ movl %r11d,%r12d
+ movl %r11d,12(%rsp)
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $3614090360,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 4(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $3905402710,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $606105819,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 12(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $3250441966,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $4118548399,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 20(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1200080426,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $2821735955,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 28(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $4249261313,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $1770035416,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 36(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $2336552879,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $4294925233,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 44(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $2304563134,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $1804603682,%r8d
+ xorl %r11d,%r12d
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $7,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 52(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $4254626195,%r11d
+ xorl %r10d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $12,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $2792965006,%r10d
+ xorl %r9d,%r12d
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $17,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu (%r13),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 60(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $1236535329,%r9d
+ xorl %r8d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $22,%r9d
+ movl %r10d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4129170786,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 24(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $3225465664,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $643717713,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 0(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $3921069994,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $3593408605,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 40(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $38016083,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $3634488961,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 16(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $3889429448,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $568446438,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 56(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $3275163606,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $4107603335,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 32(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1163531501,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r10d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r11d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $2850285829,%r8d
+ xorl %r10d,%r12d
+ movzbl %al,%eax
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $5,%r8d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r10d,%r12d
+ addl 8(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $4243563512,%r11d
+ xorl %r9d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $9,%r11d
+ movl %r8d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ andl %r9d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $1735328473,%r10d
+ xorl %r8d,%r12d
+ movzbl %al,%eax
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $14,%r10d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 16(%r13),%xmm3
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ andl %r8d,%r12d
+ addl 48(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $2368359562,%r9d
+ xorl %r11d,%r12d
+ movzbl %bl,%ebx
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $20,%r9d
+ movl %r11d,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 20(%r15),%r8d
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ addl $4294588738,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 32(%r15),%r11d
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ addl $2272392833,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 44(%r15),%r10d
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ addl $1839030562,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 56(%r15),%r9d
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ addl $4259657740,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 4(%r15),%r8d
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ addl $2763975236,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 16(%r15),%r11d
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ addl $1272893353,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 28(%r15),%r10d
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ addl $4139469664,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 40(%r15),%r9d
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ addl $3200236656,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 52(%r15),%r8d
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ addl $681279174,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 0(%r15),%r11d
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ addl $3936430074,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 12(%r15),%r10d
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ addl $3572445317,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 24(%r15),%r9d
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ addl $76029189,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl %r11d,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r9d,%r12d
+ addl 36(%r15),%r8d
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ addl $3654602809,%r8d
+ movzbl %al,%eax
+ addl %r12d,%r8d
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ roll $4,%r8d
+ movl %r10d,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r8d,%r12d
+ addl 48(%r15),%r11d
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ addl $3873151461,%r11d
+ movzbl %bl,%ebx
+ addl %r12d,%r11d
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ roll $11,%r11d
+ movl %r9d,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ xorl %r11d,%r12d
+ addl 60(%r15),%r10d
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ addl $530742520,%r10d
+ movzbl %al,%eax
+ addl %r12d,%r10d
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ roll $16,%r10d
+ movl %r8d,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 32(%r13),%xmm4
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ xorl %r10d,%r12d
+ addl 8(%r15),%r9d
+ addb %dl,%bl
+ movl 64(%rsi),%eax
+ addl $3299628645,%r9d
+ movzbl %bl,%ebx
+ addl %r12d,%r9d
+ movl %edx,60(%rsi)
+ addb %al,%cl
+ roll $23,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm4
+ pxor %xmm0,%xmm0
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 0(%r15),%r8d
+ addb %dl,%al
+ movl 68(%rsi),%ebx
+ addl $4096336452,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,64(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ movd (%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ pxor %xmm1,%xmm1
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 28(%r15),%r11d
+ addb %dl,%bl
+ movl 72(%rsi),%eax
+ addl $1126891415,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,68(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ movd (%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 56(%r15),%r10d
+ addb %dl,%al
+ movl 76(%rsi),%ebx
+ addl $2878612391,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,72(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 20(%r15),%r9d
+ addb %dl,%bl
+ movl 80(%rsi),%eax
+ addl $4237533241,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,76(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 48(%r15),%r8d
+ addb %dl,%al
+ movl 84(%rsi),%ebx
+ addl $1700485571,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,80(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 12(%r15),%r11d
+ addb %dl,%bl
+ movl 88(%rsi),%eax
+ addl $2399980690,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,84(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 40(%r15),%r10d
+ addb %dl,%al
+ movl 92(%rsi),%ebx
+ addl $4293915773,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,88(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 4(%r15),%r9d
+ addb %dl,%bl
+ movl 96(%rsi),%eax
+ addl $2240044497,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,92(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 32(%r15),%r8d
+ addb %dl,%al
+ movl 100(%rsi),%ebx
+ addl $1873313359,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,96(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 60(%r15),%r11d
+ addb %dl,%bl
+ movl 104(%rsi),%eax
+ addl $4264355552,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,100(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 24(%r15),%r10d
+ addb %dl,%al
+ movl 108(%rsi),%ebx
+ addl $2734768916,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,104(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 52(%r15),%r9d
+ addb %dl,%bl
+ movl 112(%rsi),%eax
+ addl $1309151649,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,108(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r11d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r9d,%r12d
+ addl 16(%r15),%r8d
+ addb %dl,%al
+ movl 116(%rsi),%ebx
+ addl $4149444226,%r8d
+ movzbl %al,%eax
+ xorl %r10d,%r12d
+ movl %edx,112(%rsi)
+ addl %r12d,%r8d
+ addb %bl,%cl
+ roll $6,%r8d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+
+ addl %r9d,%r8d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r10d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r8d,%r12d
+ addl 44(%r15),%r11d
+ addb %dl,%bl
+ movl 120(%rsi),%eax
+ addl $3174756917,%r11d
+ movzbl %bl,%ebx
+ xorl %r9d,%r12d
+ movl %edx,116(%rsi)
+ addl %r12d,%r11d
+ addb %al,%cl
+ roll $10,%r11d
+ movl $-1,%r12d
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+
+ addl %r8d,%r11d
+ movl (%rdi,%rcx,4),%edx
+ xorl %r9d,%r12d
+ movl %eax,(%rdi,%rcx,4)
+ orl %r11d,%r12d
+ addl 8(%r15),%r10d
+ addb %dl,%al
+ movl 124(%rsi),%ebx
+ addl $718787259,%r10d
+ movzbl %al,%eax
+ xorl %r8d,%r12d
+ movl %edx,120(%rsi)
+ addl %r12d,%r10d
+ addb %bl,%cl
+ roll $15,%r10d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+
+ addl %r11d,%r10d
+ movdqu 48(%r13),%xmm5
+ addb $32,%bpl
+ movl (%rdi,%rcx,4),%edx
+ xorl %r8d,%r12d
+ movl %ebx,(%rdi,%rcx,4)
+ orl %r10d,%r12d
+ addl 36(%r15),%r9d
+ addb %dl,%bl
+ movl 0(%rdi,%rbp,4),%eax
+ addl $3951481745,%r9d
+ movzbl %bl,%ebx
+ xorl %r11d,%r12d
+ movl %edx,124(%rsi)
+ addl %r12d,%r9d
+ addb %al,%cl
+ roll $21,%r9d
+ movl $-1,%r12d
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+
+ addl %r10d,%r9d
+ movq %rbp,%rsi
+ xorq %rbp,%rbp
+ movb %sil,%bpl
+ movq %rcx,%rsi
+ xorq %rcx,%rcx
+ movb %sil,%cl
+ leaq (%rdi,%rbp,4),%rsi
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm5
+ pxor %xmm1,%xmm5
+ addl 0(%rsp),%r8d
+ addl 4(%rsp),%r9d
+ addl 8(%rsp),%r10d
+ addl 12(%rsp),%r11d
+
+ movdqu %xmm2,(%r14,%r13,1)
+ movdqu %xmm3,16(%r14,%r13,1)
+ movdqu %xmm4,32(%r14,%r13,1)
+ movdqu %xmm5,48(%r14,%r13,1)
+ leaq 64(%r15),%r15
+ leaq 64(%r13),%r13
+ cmpq 16(%rsp),%r15
+ jb L$oop
+
+ movq 24(%rsp),%r12
+ subb %al,%cl
+ movl %r8d,0(%r12)
+ movl %r9d,4(%r12)
+ movl %r10d,8(%r12)
+ movl %r11d,12(%r12)
+ subb $1,%bpl
+ movl %ebp,-8(%rdi)
+ movl %ecx,-4(%rdi)
+
+ movq 40(%rsp),%r15
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r13
+ movq 64(%rsp),%r12
+ movq 72(%rsp),%rbp
+ movq 80(%rsp),%rbx
+ leaq 88(%rsp),%rsp
+L$epilogue:
+L$abort:
+ .byte 0xf3,0xc3
+
+#endif
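
rc4_md5_enc, which ends above, is a stitched implementation: each 64-byte block is pushed through one MD5 compression in the integer registers while the matching 64 RC4 keystream bytes are gathered into %xmm0/%xmm1 via pinsrw and XORed over the input (the movdqu/pxor/movdqu groups), so the two ciphers share one loop and hide each other's latencies. Note that the RC4 state here keeps every S[] entry in its own 32-bit slot, hence the movl (%rdi,%rcx,4) accesses. For orientation only, the plain byte-oriented RC4 keystream/XOR that those gathers correspond to looks like this in C; the names are illustrative and not from this file.

#include <stddef.h>
#include <stdint.h>

/* Plain RC4 PRGA: update (i, j), swap S[i] and S[j], and XOR the keystream
 * byte S[S[i] + S[j]] over the input.  The assembly above performs the same
 * swaps but widens S[] to 32-bit slots and batches the output through SSE. */
static void rc4_xor(uint8_t S[256], uint8_t *ip, uint8_t *jp,
                    const uint8_t *in, uint8_t *out, size_t len) {
  uint8_t i = *ip, j = *jp;
  for (size_t n = 0; n < len; n++) {
    i = (uint8_t)(i + 1);
    j = (uint8_t)(j + S[i]);
    uint8_t t = S[i];
    S[i] = S[j];
    S[j] = t;
    out[n] = in[n] ^ S[(uint8_t)(S[i] + S[j])];
  }
  *ip = i;
  *jp = j;
}
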
diff --git a/mac-x86_64/crypto/rc4/rc4-x86_64.S b/mac-x86_64/crypto/rc4/rc4-x86_64.S
new file mode 100644
index 0000000..44147ff
--- /dev/null
+++ b/mac-x86_64/crypto/rc4/rc4-x86_64.S
@@ -0,0 +1,622 @@
+#if defined(__x86_64__)
+.text
+
+
+.globl _asm_RC4
+.private_extern _asm_RC4
+
+.p2align 4
+_asm_RC4:
+ orq %rsi,%rsi
+ jne L$entry
+ .byte 0xf3,0xc3
+L$entry:
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+L$prologue:
+ movq %rsi,%r11
+ movq %rdx,%r12
+ movq %rcx,%r13
+ xorq %r10,%r10
+ xorq %rcx,%rcx
+
+ leaq 8(%rdi),%rdi
+ movb -8(%rdi),%r10b
+ movb -4(%rdi),%cl
+ cmpl $-1,256(%rdi)
+ je L$RC4_CHAR
+ movl _OPENSSL_ia32cap_P(%rip),%r8d
+ xorq %rbx,%rbx
+ incb %r10b
+ subq %r10,%rbx
+ subq %r12,%r13
+ movl (%rdi,%r10,4),%eax
+ testq $-16,%r11
+ jz L$loop1
+ btl $30,%r8d
+ jc L$intel
+ andq $7,%rbx
+ leaq 1(%r10),%rsi
+ jz L$oop8
+ subq %rbx,%r11
+L$oop8_warmup:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r12,%r13,1)
+ leaq 1(%r12),%r12
+ decq %rbx
+ jnz L$oop8_warmup
+
+ leaq 1(%r10),%rsi
+ jmp L$oop8
+.p2align 4
+L$oop8:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 0(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,0(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 4(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,4(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 8(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,8(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 12(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,12(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 16(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,16(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl 20(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,20(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl 24(%rdi,%rsi,4),%ebx
+ rorq $8,%r8
+ movl %edx,24(%rdi,%r10,4)
+ addb %al,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb $8,%sil
+ addb %bl,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ movl -4(%rdi,%rsi,4),%eax
+ rorq $8,%r8
+ movl %edx,28(%rdi,%r10,4)
+ addb %bl,%dl
+ movb (%rdi,%rdx,4),%r8b
+ addb $8,%r10b
+ rorq $8,%r8
+ subq $8,%r11
+
+ xorq (%r12),%r8
+ movq %r8,(%r12,%r13,1)
+ leaq 8(%r12),%r12
+
+ testq $-8,%r11
+ jnz L$oop8
+ cmpq $0,%r11
+ jne L$loop1
+ jmp L$exit
+
+.p2align 4
+L$intel:
+ testq $-32,%r11
+ jz L$loop1
+ andq $15,%rbx
+ jz L$oop16_is_hot
+ subq %rbx,%r11
+L$oop16_warmup:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r12,%r13,1)
+ leaq 1(%r12),%r12
+ decq %rbx
+ jnz L$oop16_warmup
+
+ movq %rcx,%rbx
+ xorq %rcx,%rcx
+ movb %bl,%cl
+
+L$oop16_is_hot:
+ leaq (%rdi,%r10,4),%rsi
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm0,%xmm0
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ addb %bl,%cl
+ pinsrw $0,(%rdi,%rax,4),%xmm0
+ jmp L$oop16_enter
+.p2align 4
+L$oop16:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm0,%xmm2
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm0
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 4(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,0(%rsi)
+ pxor %xmm1,%xmm2
+ addb %bl,%cl
+ pinsrw $0,(%rdi,%rax,4),%xmm0
+ movdqu %xmm2,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+L$oop16_enter:
+ movl (%rdi,%rcx,4),%edx
+ pxor %xmm1,%xmm1
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 8(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,4(%rsi)
+ addb %al,%cl
+ pinsrw $0,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 12(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,8(%rsi)
+ addb %bl,%cl
+ pinsrw $1,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 16(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,12(%rsi)
+ addb %al,%cl
+ pinsrw $1,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 20(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,16(%rsi)
+ addb %bl,%cl
+ pinsrw $2,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 24(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,20(%rsi)
+ addb %al,%cl
+ pinsrw $2,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 28(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,24(%rsi)
+ addb %bl,%cl
+ pinsrw $3,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 32(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,28(%rsi)
+ addb %al,%cl
+ pinsrw $3,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 36(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,32(%rsi)
+ addb %bl,%cl
+ pinsrw $4,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 40(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,36(%rsi)
+ addb %al,%cl
+ pinsrw $4,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 44(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,40(%rsi)
+ addb %bl,%cl
+ pinsrw $5,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 48(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,44(%rsi)
+ addb %al,%cl
+ pinsrw $5,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 52(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,48(%rsi)
+ addb %bl,%cl
+ pinsrw $6,(%rdi,%rax,4),%xmm0
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movl 56(%rsi),%eax
+ movzbl %bl,%ebx
+ movl %edx,52(%rsi)
+ addb %al,%cl
+ pinsrw $6,(%rdi,%rbx,4),%xmm1
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ addb %dl,%al
+ movl 60(%rsi),%ebx
+ movzbl %al,%eax
+ movl %edx,56(%rsi)
+ addb %bl,%cl
+ pinsrw $7,(%rdi,%rax,4),%xmm0
+ addb $16,%r10b
+ movdqu (%r12),%xmm2
+ movl (%rdi,%rcx,4),%edx
+ movl %ebx,(%rdi,%rcx,4)
+ addb %dl,%bl
+ movzbl %bl,%ebx
+ movl %edx,60(%rsi)
+ leaq (%rdi,%r10,4),%rsi
+ pinsrw $7,(%rdi,%rbx,4),%xmm1
+ movl (%rsi),%eax
+ movq %rcx,%rbx
+ xorq %rcx,%rcx
+ subq $16,%r11
+ movb %bl,%cl
+ testq $-16,%r11
+ jnz L$oop16
+
+ psllq $8,%xmm1
+ pxor %xmm0,%xmm2
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+
+ cmpq $0,%r11
+ jne L$loop1
+ jmp L$exit
+
+.p2align 4
+L$loop1:
+ addb %al,%cl
+ movl (%rdi,%rcx,4),%edx
+ movl %eax,(%rdi,%rcx,4)
+ movl %edx,(%rdi,%r10,4)
+ addb %dl,%al
+ incb %r10b
+ movl (%rdi,%rax,4),%edx
+ movl (%rdi,%r10,4),%eax
+ xorb (%r12),%dl
+ movb %dl,(%r12,%r13,1)
+ leaq 1(%r12),%r12
+ decq %r11
+ jnz L$loop1
+ jmp L$exit
+
+.p2align 4
+L$RC4_CHAR:
+ addb $1,%r10b
+ movzbl (%rdi,%r10,1),%eax
+ testq $-8,%r11
+ jz L$cloop1
+ jmp L$cloop8
+.p2align 4
+L$cloop8:
+ movl (%r12),%r8d
+ movl 4(%r12),%r9d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne L$cmov0
+ movq %rax,%rbx
+L$cmov0:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne L$cmov1
+ movq %rbx,%rax
+L$cmov1:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne L$cmov2
+ movq %rax,%rbx
+L$cmov2:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne L$cmov3
+ movq %rbx,%rax
+L$cmov3:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r8b
+ rorl $8,%r8d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne L$cmov4
+ movq %rax,%rbx
+L$cmov4:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne L$cmov5
+ movq %rbx,%rax
+L$cmov5:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %al,%cl
+ leaq 1(%r10),%rsi
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %sil,%esi
+ movzbl (%rdi,%rsi,1),%ebx
+ movb %al,(%rdi,%rcx,1)
+ cmpq %rsi,%rcx
+ movb %dl,(%rdi,%r10,1)
+ jne L$cmov6
+ movq %rax,%rbx
+L$cmov6:
+ addb %al,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ addb %bl,%cl
+ leaq 1(%rsi),%r10
+ movzbl (%rdi,%rcx,1),%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%r10,1),%eax
+ movb %bl,(%rdi,%rcx,1)
+ cmpq %r10,%rcx
+ movb %dl,(%rdi,%rsi,1)
+ jne L$cmov7
+ movq %rbx,%rax
+L$cmov7:
+ addb %bl,%dl
+ xorb (%rdi,%rdx,1),%r9b
+ rorl $8,%r9d
+ leaq -8(%r11),%r11
+ movl %r8d,(%r13)
+ leaq 8(%r12),%r12
+ movl %r9d,4(%r13)
+ leaq 8(%r13),%r13
+
+ testq $-8,%r11
+ jnz L$cloop8
+ cmpq $0,%r11
+ jne L$cloop1
+ jmp L$exit
+.p2align 4
+L$cloop1:
+ addb %al,%cl
+ movzbl %cl,%ecx
+ movzbl (%rdi,%rcx,1),%edx
+ movb %al,(%rdi,%rcx,1)
+ movb %dl,(%rdi,%r10,1)
+ addb %al,%dl
+ addb $1,%r10b
+ movzbl %dl,%edx
+ movzbl %r10b,%r10d
+ movzbl (%rdi,%rdx,1),%edx
+ movzbl (%rdi,%r10,1),%eax
+ xorb (%r12),%dl
+ leaq 1(%r12),%r12
+ movb %dl,(%r13)
+ leaq 1(%r13),%r13
+ subq $1,%r11
+ jnz L$cloop1
+ jmp L$exit
+
+.p2align 4
+L$exit:
+ subb $1,%r10b
+ movl %r10d,-8(%rdi)
+ movl %ecx,-4(%rdi)
+
+ movq (%rsp),%r13
+ movq 8(%rsp),%r12
+ movq 16(%rsp),%rbx
+ addq $24,%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+.globl _asm_RC4_set_key
+.private_extern _asm_RC4_set_key
+
+.p2align 4
+_asm_RC4_set_key:
+ leaq 8(%rdi),%rdi
+ leaq (%rdx,%rsi,1),%rdx
+ negq %rsi
+ movq %rsi,%rcx
+ xorl %eax,%eax
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+
+ movl _OPENSSL_ia32cap_P(%rip),%r8d
+ btl $20,%r8d
+ jc L$c1stloop
+ jmp L$w1stloop
+
+.p2align 4
+L$w1stloop:
+ movl %eax,(%rdi,%rax,4)
+ addb $1,%al
+ jnc L$w1stloop
+
+ xorq %r9,%r9
+ xorq %r8,%r8
+.p2align 4
+L$w2ndloop:
+ movl (%rdi,%r9,4),%r10d
+ addb (%rdx,%rsi,1),%r8b
+ addb %r10b,%r8b
+ addq $1,%rsi
+ movl (%rdi,%r8,4),%r11d
+ cmovzq %rcx,%rsi
+ movl %r10d,(%rdi,%r8,4)
+ movl %r11d,(%rdi,%r9,4)
+ addb $1,%r9b
+ jnc L$w2ndloop
+ jmp L$exit_key
+
+.p2align 4
+L$c1stloop:
+ movb %al,(%rdi,%rax,1)
+ addb $1,%al
+ jnc L$c1stloop
+
+ xorq %r9,%r9
+ xorq %r8,%r8
+.p2align 4
+L$c2ndloop:
+ movb (%rdi,%r9,1),%r10b
+ addb (%rdx,%rsi,1),%r8b
+ addb %r10b,%r8b
+ addq $1,%rsi
+ movb (%rdi,%r8,1),%r11b
+ jnz L$cnowrap
+ movq %rcx,%rsi
+L$cnowrap:
+ movb %r10b,(%rdi,%r8,1)
+ movb %r11b,(%rdi,%r9,1)
+ addb $1,%r9b
+ jnc L$c2ndloop
+ movl $-1,256(%rdi)
+
+.p2align 4
+L$exit_key:
+ xorl %eax,%eax
+ movl %eax,-8(%rdi)
+ movl %eax,-4(%rdi)
+ .byte 0xf3,0xc3
+
+
+.globl _RC4_options
+.private_extern _RC4_options
+
+.p2align 4
+_RC4_options:
+ leaq L$opts(%rip),%rax
+ movq _OPENSSL_ia32cap_P(%rip),%rdx
+ movl (%rdx),%edx
+ btl $20,%edx
+ jc L$8xchar
+ btl $30,%edx
+ jnc L$done
+ addq $25,%rax
+ .byte 0xf3,0xc3
+L$8xchar:
+ addq $12,%rax
+L$done:
+ .byte 0xf3,0xc3
+.p2align 6
+L$opts:
+.byte 114,99,52,40,56,120,44,105,110,116,41,0
+.byte 114,99,52,40,56,120,44,99,104,97,114,41,0
+.byte 114,99,52,40,49,54,120,44,105,110,116,41,0
+.byte 82,67,52,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+
+#endif
diff --git a/mac-x86_64/crypto/sha/sha1-x86_64.S b/mac-x86_64/crypto/sha/sha1-x86_64.S
new file mode 100644
index 0000000..044dc5b
--- /dev/null
+++ b/mac-x86_64/crypto/sha/sha1-x86_64.S
@@ -0,0 +1,2425 @@
+#if defined(__x86_64__)
+.text
+
+
+.globl _sha1_block_data_order
+.private_extern _sha1_block_data_order
+
+.p2align 4
+_sha1_block_data_order:
+ movl _OPENSSL_ia32cap_P+0(%rip),%r9d
+ movl _OPENSSL_ia32cap_P+4(%rip),%r8d
+ movl _OPENSSL_ia32cap_P+8(%rip),%r10d
+ testl $512,%r8d
+ jz L$ialu
+ jmp _ssse3_shortcut
+
+.p2align 4
+L$ialu:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %rax,64(%rsp)
+L$prologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp L$loop
+
+.p2align 4
+L$loop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ecx
+ bswapl %ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ecx
+ bswapl %r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ecx
+ bswapl %edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ecx
+ bswapl %ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ecx
+ bswapl %r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ecx
+ bswapl %edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%r14,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ecx
+ bswapl %ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ecx
+ bswapl %r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ecx
+ bswapl %edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%r14,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ecx
+ bswapl %ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rdx,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ecx
+ bswapl %r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ecx
+ bswapl %edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%r14,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %r12d,%ecx
+ bswapl %ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rdx,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r11d,%ecx
+ bswapl %r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rbp,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %edi,%ecx
+ bswapl %edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%r14,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %esi,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ roll $1,%ebp
+ addl %eax,%r13d
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %r13d,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ roll $1,%r14d
+ addl %eax,%r12d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ roll $1,%edx
+ addl %eax,%r11d
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ roll $30,%r12d
+ xorl %esi,%eax
+ addl %ecx,%edi
+ roll $1,%ebp
+ addl %eax,%edi
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %edi,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ roll $30,%r11d
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $1,%r14d
+ addl %eax,%esi
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
+ movl %esi,%ecx
+ xorl 28(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
+ movl %r13d,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
+ movl %r12d,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,32(%rsp)
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,36(%rsp)
+ movl %esi,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
+ movl %r13d,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
+ movl %r12d,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
+ movl %r11d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,56(%rsp)
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,60(%rsp)
+ movl %r13d,%ecx
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%r14d
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
+ movl %r12d,%ecx
+ xorl 12(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
+ movl %r11d,%ecx
+ xorl 16(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
+ movl %edi,%ecx
+ xorl 20(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,16(%rsp)
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,20(%rsp)
+ movl %r12d,%ecx
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
+ movl %r11d,%ecx
+ xorl 36(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
+ movl %edi,%ecx
+ xorl 40(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,40(%rsp)
+ movl %edi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,44(%rsp)
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%ebp
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%r14d
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
+ xorl 8(%rsp),%edx
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%edx
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ebx
+ xorl 12(%rsp),%ebp
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%ebp
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%r14d
+ leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%r14d
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%edx
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%ebp
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%r14d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
+ xorl 32(%rsp),%edx
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%edx
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%ebp
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r13d,%ecx
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %r12d,%ecx
+ xorl 4(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %r11d,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %edi,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %r12d,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
+ movl %r11d,%ecx
+ xorl 28(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
+ movl %edi,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
+ movl %esi,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
+ movl %edi,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
+ movl %esi,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
+ movl %r13d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r11d,%eax
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz L$loop
+
+ movq 64(%rsp),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+
+.p2align 4
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ leaq -64(%rsp),%rsp
+ movq %rax,%r14
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r11
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ movdqa 64(%r11),%xmm6
+ movdqa -64(%r11),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ addq $64,%r9
+ paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp L$oop_ssse3
+.p2align 4
+L$oop_ssse3:
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
+ movdqa %xmm3,%xmm8
+ paddd %xmm3,%xmm9
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ psrldq $4,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm2,%xmm8
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ pxor %xmm8,%xmm4
+ xorl %ebx,%eax
+ roll $5,%ebp
+ movdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ psrld $31,%xmm8
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm8,%xmm4
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ xorl %ebp,%edx
+ movdqa -64(%r11),%xmm10
+ roll $5,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
+ movdqa %xmm4,%xmm9
+ paddd %xmm4,%xmm10
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm9
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ pxor %xmm9,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm10,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ psrld $31,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
+ psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
+ por %xmm9,%xmm5
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ xorl %eax,%ebp
+ movdqa -32(%r11),%xmm8
+ roll $5,%edx
+ addl %edi,%ecx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
+ movdqa %xmm5,%xmm10
+ paddd %xmm5,%xmm8
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm10
+ andl %edx,%edi
+ xorl %ebp,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm10
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ pxor %xmm10,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm8,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ psrld $31,%xmm10
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
+ por %xmm10,%xmm6
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ xorl %ebx,%eax
+ movdqa -32(%r11),%xmm9
+ roll $5,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
+ movdqa %xmm6,%xmm8
+ paddd %xmm6,%xmm9
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm8
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm8
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ pxor %xmm8,%xmm7
+ xorl %ebp,%edx
+ roll $5,%ecx
+ movdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ psrld $31,%xmm8
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm8,%xmm7
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ xorl %ecx,%ebx
+ movdqa -32(%r11),%xmm10
+ roll $5,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%edi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
+ paddd %xmm7,%xmm10
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ movdqa %xmm0,%xmm9
+ xorl %eax,%ebp
+ roll $5,%edx
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ pslld $2,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ psrld $30,%xmm9
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ por %xmm9,%xmm0
+ xorl %ebp,%edx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebx
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm8,0(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 24(%rsp),%ecx
+ pslld $2,%xmm1
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm10
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm10,%xmm1
+ addl %edx,%ecx
+ addl 28(%rsp),%ebx
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r11),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ movdqa %xmm9,16(%rsp)
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 40(%rsp),%edx
+ pslld $2,%xmm2
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ psrld $30,%xmm8
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ por %xmm8,%xmm2
+ addl %ebp,%edx
+ addl 44(%rsp),%ecx
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ movdqa %xmm10,%xmm8
+ rorl $7,%edx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
+ movdqa %xmm10,32(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 56(%rsp),%ebp
+ pslld $2,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%edi
+ psrld $30,%xmm9
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ por %xmm9,%xmm3
+ addl %eax,%ebp
+ addl 60(%rsp),%edx
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebp
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ movdqa %xmm8,48(%rsp)
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 8(%rsp),%eax
+ pslld $2,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%edi
+ psrld $30,%xmm10
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ por %xmm10,%xmm4
+ addl %ebx,%eax
+ addl 12(%rsp),%ebp
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%eax
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
+ movdqa %xmm9,0(%rsp)
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 24(%rsp),%ebx
+ pslld $2,%xmm5
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ psrld $30,%xmm8
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ por %xmm8,%xmm5
+ addl %ecx,%ebx
+ addl 28(%rsp),%eax
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ punpcklqdq %xmm5,%xmm9
+ movl %eax,%edi
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
+ xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movdqa %xmm6,%xmm9
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ pslld $2,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ psrld $30,%xmm9
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
+ rorl $7,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ pxor %xmm3,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ punpcklqdq %xmm6,%xmm10
+ movl %ebx,%edi
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa 32(%r11),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
+ xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movdqa %xmm7,%xmm10
+ movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ pslld $2,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ psrld $30,%xmm10
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
+ rorl $7,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ pxor %xmm4,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ punpcklqdq %xmm7,%xmm8
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
+ xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movdqa %xmm0,%xmm8
+ movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ pslld $2,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ psrld $30,%xmm8
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
+ rorl $7,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ punpcklqdq %xmm0,%xmm9
+ movl %edx,%edi
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
+ xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movdqa %xmm1,%xmm9
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ pslld $2,%xmm1
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ psrld $30,%xmm9
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
+ rorl $7,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ punpcklqdq %xmm1,%xmm10
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
+ xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movdqa %xmm2,%xmm10
+ movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ pslld $2,%xmm2
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ psrld $30,%xmm10
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
+ rorl $7,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%ebx
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm9,32(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 56(%rsp),%ecx
+ pslld $2,%xmm3
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm8
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm8,%xmm3
+ addl %edx,%ecx
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa %xmm10,48(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je L$done_ssse3
+ movdqa 64(%r11),%xmm6
+ movdqa -64(%r11),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+.byte 102,15,56,0,206
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ paddd %xmm9,%xmm0
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+.byte 102,15,56,0,214
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ paddd %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+.byte 102,15,56,0,222
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ paddd %xmm9,%xmm2
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp L$oop_ssse3
+
+.p2align 4
+L$done_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ leaq (%r14),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$epilogue_ssse3:
+ .byte 0xf3,0xc3
+
+.p2align 6
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.p2align 6
+#endif
diff --git a/mac-x86_64/crypto/sha/sha256-x86_64.S b/mac-x86_64/crypto/sha/sha256-x86_64.S
new file mode 100644
index 0000000..da02d4c
--- /dev/null
+++ b/mac-x86_64/crypto/sha/sha256-x86_64.S
@@ -0,0 +1,2843 @@
+#if defined(__x86_64__)
+.text
+
+
+.globl _sha256_block_data_order
+.private_extern _sha256_block_data_order
+
+.p2align 4
+_sha256_block_data_order:
+ leaq _OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $512,%r10d
+ jnz L$ssse3_shortcut
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+L$prologue:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp L$loop
+
+.p2align 4
+L$loop:
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ jmp L$rounds_16_xx
+.p2align 4
+L$rounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz L$rounds_16_xx
+
+ movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop
+
+ movq 64+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+.p2align 6
+
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+
+.p2align 6
+sha256_block_data_order_ssse3:
+L$ssse3_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+L$prologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp L$loop_ssse3
+.p2align 4
+L$loop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+ movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp L$ssse3_00_47
+
+.p2align 4
+L$ssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne L$ssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop_ssse3
+
+ movq 64+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue_ssse3:
+ .byte 0xf3,0xc3
+
+#endif
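
sha256-x86_64.S follows the same pattern: reading the prologue above, _sha256_block_data_order takes the eight-word state in %rdi, the input in %rsi, and a count of 64-byte blocks in %rdx (shifted and scaled into an end pointer), and the K256 table carries the 64 round constants (each row duplicated) plus shuffle masks for the SSSE3 path. A minimal portable C sketch of the same compression, with the constants copied once each from the table above; names here are illustrative only.

#include <stdint.h>

static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

/* The 64 SHA-256 round constants, as listed (once each) in K256 above. */
static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

/* One SHA-256 compression: state is eight 32-bit words, block is 64 big-endian bytes. */
static void sha256_block(uint32_t state[8], const uint8_t block[64]) {
    uint32_t w[64];
    for (int i = 0; i < 16; i++)
        w[i] = (uint32_t)block[4 * i] << 24 | (uint32_t)block[4 * i + 1] << 16 |
               (uint32_t)block[4 * i + 2] << 8 | (uint32_t)block[4 * i + 3];
    for (int i = 16; i < 64; i++) {
        uint32_t s0 = ror32(w[i - 15], 7) ^ ror32(w[i - 15], 18) ^ (w[i - 15] >> 3);
        uint32_t s1 = ror32(w[i - 2], 17) ^ ror32(w[i - 2], 19) ^ (w[i - 2] >> 10);
        w[i] = w[i - 16] + s0 + w[i - 7] + s1;
    }
    uint32_t a = state[0], b = state[1], c = state[2], d = state[3];
    uint32_t e = state[4], f = state[5], g = state[6], h = state[7];
    for (int i = 0; i < 64; i++) {
        uint32_t S1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t ch = (e & f) ^ (~e & g);
        uint32_t t1 = h + S1 + ch + K[i] + w[i];
        uint32_t S0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
        h = g; g = f; f = e; e = d + t1;
        d = c; c = b; b = a; a = t1 + S0 + maj;
    }
    state[0] += a; state[1] += b; state[2] += c; state[3] += d;
    state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}

The sha512-x86_64.S diff that follows is the same construction widened to 64-bit words, with 80 rounds and its own K512 constant table, which is why its round bodies mirror the ones above with rorq/addq/xorq instead of the 32-bit forms.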
diff --git a/mac-x86_64/crypto/sha/sha512-x86_64.S b/mac-x86_64/crypto/sha/sha512-x86_64.S
new file mode 100644
index 0000000..2f5d912
--- /dev/null
+++ b/mac-x86_64/crypto/sha/sha512-x86_64.S
@@ -0,0 +1,1786 @@
+#if defined(__x86_64__)
+.text
+
+
+.globl _sha512_block_data_order
+.private_extern _sha512_block_data_order
+
+.p2align 4
+_sha512_block_data_order:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+L$prologue:
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp L$loop
+
+.p2align 4
+L$loop:
+ movq %rbx,%rdi
+ leaq K512(%rip),%rbp
+ xorq %rcx,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rax
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ jmp L$rounds_16_xx
+.p2align 4
+L$rounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 72(%rsp),%r12
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 80(%rsp),%r12
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 88(%rsp),%r12
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 96(%rsp),%r12
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 104(%rsp),%r12
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 112(%rsp),%r12
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 120(%rsp),%r12
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 0(%rsp),%r12
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 8(%rsp),%r12
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 16(%rsp),%r12
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 24(%rsp),%r12
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 32(%rsp),%r12
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 40(%rsp),%r12
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 48(%rsp),%r12
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 56(%rsp),%r12
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 64(%rsp),%r12
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
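+# %rbp advances 256 bytes through K512 every 16 rounds; byte 7 of the
+# next table entry is non-zero for every round constant and becomes
+# zero only at the byte-order mask that follows the table, so the
+# check below falls through after round 79.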
+ cmpb $0,7(%rbp)
+ jnz L$rounds_16_xx
+
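+# End of block: reload the state pointer saved at 128(%rsp), fold the
+# working variables back into the eight hash words, advance %rsi by one
+# 128-byte block, and loop while below the input limit at 128+16(%rsp);
+# otherwise reload the saved stack pointer from 128+24(%rsp), restore
+# the callee-saved registers through it, and return (.byte 0xf3,0xc3
+# is `repz ret`).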
+ movq 128+0(%rsp),%rdi
+ addq %r14,%rax
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb L$loop
+
+ movq 128+24(%rsp),%rsi
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue:
+ .byte 0xf3,0xc3
+
+.p2align 6
+
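+# K512: the 80 SHA-512 round constants. Each pair of constants is
+# emitted twice, giving the table a 32-byte stride (a layout kept from
+# the upstream perlasm, where the duplicated copies appear to feed the
+# SIMD code paths); the scalar rounds above skip the duplicates by
+# stepping %rbp by 8 and 24 bytes on alternate rounds.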
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
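+# Big-endian byte-shuffle mask used by the SIMD variants of this code;
+# in this scalar build it mainly serves as the zero-high-byte sentinel
+# that ends the round loop above.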
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
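+# ASCII credit string: "SHA512 block transform for x86_64, CRYPTOGAMS
+# by <appro@openssl.org>" (NUL-terminated).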
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif