summaryrefslogtreecommitdiffstats
path: root/linux-x86
diff options
context:
space:
mode:
authorAdam Langley <agl@google.com>2015-05-11 17:20:37 -0700
committerKenny Root <kroot@google.com>2015-05-12 23:06:14 +0000
commite9ada863a7b3e81f5d2b1e3bdd2305da902a87f5 (patch)
tree6e43e34595ecf887c26c32b86d8ab097fe8cac64 /linux-x86
parentb3106a0cc1493bbe0505c0ec0ce3da4ca90a29ae (diff)
downloadexternal_boringssl-e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5.zip
external_boringssl-e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5.tar.gz
external_boringssl-e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5.tar.bz2
external/boringssl: bump revision.
This change bumps the BoringSSL revision to the current tip-of-tree. Change-Id: I91d5bf467e16e8d86cb19a4de873985f524e5faa
Diffstat (limited to 'linux-x86')
-rw-r--r--linux-x86/crypto/aes/aesni-x86.S789
-rw-r--r--linux-x86/crypto/bn/bn-586.S437
-rw-r--r--linux-x86/crypto/bn/x86-mont.S172
-rw-r--r--linux-x86/crypto/cpu-x86-asm.S28
-rw-r--r--linux-x86/crypto/sha/sha1-586.S1422
-rw-r--r--linux-x86/crypto/sha/sha256-586.S1643
-rw-r--r--linux-x86/crypto/sha/sha512-586.S2271
7 files changed, 6097 insertions, 665 deletions
diff --git a/linux-x86/crypto/aes/aesni-x86.S b/linux-x86/crypto/aes/aesni-x86.S
index 5aee116..aec110d 100644
--- a/linux-x86/crypto/aes/aesni-x86.S
+++ b/linux-x86/crypto/aes/aesni-x86.S
@@ -23,7 +23,10 @@ aesni_encrypt:
leal 16(%edx),%edx
jnz .L000enc1_loop_1
.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_encrypt,.-.L_aesni_encrypt_begin
.globl aesni_decrypt
@@ -48,7 +51,10 @@ aesni_decrypt:
leal 16(%edx),%edx
jnz .L001dec1_loop_2
.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
ret
.size aesni_decrypt,.-.L_aesni_decrypt_begin
.hidden _aesni_encrypt2
@@ -269,17 +275,15 @@ _aesni_encrypt6:
negl %ecx
.byte 102,15,56,220,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,220,233
-.byte 102,15,56,220,241
-.byte 102,15,56,220,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp .L_aesni_encrypt6_enter
+ jmp .L008_aesni_encrypt6_inner
.align 16
-.L008enc6_loop:
+.L009enc6_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
+.L008_aesni_encrypt6_inner:
.byte 102,15,56,220,233
.byte 102,15,56,220,241
.byte 102,15,56,220,249
@@ -293,7 +297,7 @@ _aesni_encrypt6:
.byte 102,15,56,220,240
.byte 102,15,56,220,248
movups -16(%edx,%ecx,1),%xmm0
- jnz .L008enc6_loop
+ jnz .L009enc6_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
.byte 102,15,56,220,225
@@ -326,17 +330,15 @@ _aesni_decrypt6:
negl %ecx
.byte 102,15,56,222,225
pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
addl $16,%ecx
-.byte 102,15,56,222,233
-.byte 102,15,56,222,241
-.byte 102,15,56,222,249
- movups -16(%edx,%ecx,1),%xmm0
- jmp .L_aesni_decrypt6_enter
+ jmp .L010_aesni_decrypt6_inner
.align 16
-.L009dec6_loop:
+.L011dec6_loop:
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
+.L010_aesni_decrypt6_inner:
.byte 102,15,56,222,233
.byte 102,15,56,222,241
.byte 102,15,56,222,249
@@ -350,7 +352,7 @@ _aesni_decrypt6:
.byte 102,15,56,222,240
.byte 102,15,56,222,248
movups -16(%edx,%ecx,1),%xmm0
- jnz .L009dec6_loop
+ jnz .L011dec6_loop
.byte 102,15,56,222,209
.byte 102,15,56,222,217
.byte 102,15,56,222,225
@@ -381,14 +383,14 @@ aesni_ecb_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebx
andl $-16,%eax
- jz .L010ecb_ret
+ jz .L012ecb_ret
movl 240(%edx),%ecx
testl %ebx,%ebx
- jz .L011ecb_decrypt
+ jz .L013ecb_decrypt
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L012ecb_enc_tail
+ jb .L014ecb_enc_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -397,9 +399,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L013ecb_enc_loop6_enter
+ jmp .L015ecb_enc_loop6_enter
.align 16
-.L014ecb_enc_loop6:
+.L016ecb_enc_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -414,12 +416,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L013ecb_enc_loop6_enter:
+.L015ecb_enc_loop6_enter:
call _aesni_encrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L014ecb_enc_loop6
+ jnc .L016ecb_enc_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -428,18 +430,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L010ecb_ret
-.L012ecb_enc_tail:
+ jz .L012ecb_ret
+.L014ecb_enc_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L015ecb_enc_one
+ jb .L017ecb_enc_one
movups 16(%esi),%xmm3
- je .L016ecb_enc_two
+ je .L018ecb_enc_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L017ecb_enc_three
+ jb .L019ecb_enc_three
movups 48(%esi),%xmm5
- je .L018ecb_enc_four
+ je .L020ecb_enc_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_encrypt6
@@ -448,49 +450,49 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L015ecb_enc_one:
+.L017ecb_enc_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L019enc1_loop_3:
+.L021enc1_loop_3:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L019enc1_loop_3
+ jnz .L021enc1_loop_3
.byte 102,15,56,221,209
movups %xmm2,(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L016ecb_enc_two:
+.L018ecb_enc_two:
call _aesni_encrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L017ecb_enc_three:
+.L019ecb_enc_three:
call _aesni_encrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L018ecb_enc_four:
+.L020ecb_enc_four:
call _aesni_encrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L011ecb_decrypt:
+.L013ecb_decrypt:
movl %edx,%ebp
movl %ecx,%ebx
cmpl $96,%eax
- jb .L020ecb_dec_tail
+ jb .L022ecb_dec_tail
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -499,9 +501,9 @@ aesni_ecb_encrypt:
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
subl $96,%eax
- jmp .L021ecb_dec_loop6_enter
+ jmp .L023ecb_dec_loop6_enter
.align 16
-.L022ecb_dec_loop6:
+.L024ecb_dec_loop6:
movups %xmm2,(%edi)
movdqu (%esi),%xmm2
movups %xmm3,16(%edi)
@@ -516,12 +518,12 @@ aesni_ecb_encrypt:
leal 96(%edi),%edi
movdqu 80(%esi),%xmm7
leal 96(%esi),%esi
-.L021ecb_dec_loop6_enter:
+.L023ecb_dec_loop6_enter:
call _aesni_decrypt6
movl %ebp,%edx
movl %ebx,%ecx
subl $96,%eax
- jnc .L022ecb_dec_loop6
+ jnc .L024ecb_dec_loop6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
@@ -530,18 +532,18 @@ aesni_ecb_encrypt:
movups %xmm7,80(%edi)
leal 96(%edi),%edi
addl $96,%eax
- jz .L010ecb_ret
-.L020ecb_dec_tail:
+ jz .L012ecb_ret
+.L022ecb_dec_tail:
movups (%esi),%xmm2
cmpl $32,%eax
- jb .L023ecb_dec_one
+ jb .L025ecb_dec_one
movups 16(%esi),%xmm3
- je .L024ecb_dec_two
+ je .L026ecb_dec_two
movups 32(%esi),%xmm4
cmpl $64,%eax
- jb .L025ecb_dec_three
+ jb .L027ecb_dec_three
movups 48(%esi),%xmm5
- je .L026ecb_dec_four
+ je .L028ecb_dec_four
movups 64(%esi),%xmm6
xorps %xmm7,%xmm7
call _aesni_decrypt6
@@ -550,43 +552,51 @@ aesni_ecb_encrypt:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L023ecb_dec_one:
+.L025ecb_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L027dec1_loop_4:
+.L029dec1_loop_4:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L027dec1_loop_4
+ jnz .L029dec1_loop_4
.byte 102,15,56,223,209
movups %xmm2,(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L024ecb_dec_two:
+.L026ecb_dec_two:
call _aesni_decrypt2
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L025ecb_dec_three:
+.L027ecb_dec_three:
call _aesni_decrypt3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L010ecb_ret
+ jmp .L012ecb_ret
.align 16
-.L026ecb_dec_four:
+.L028ecb_dec_four:
call _aesni_decrypt4
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L010ecb_ret:
+.L012ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -634,7 +644,7 @@ aesni_ccm64_encrypt_blocks:
leal 32(%edx,%ecx,1),%edx
subl %ecx,%ebx
.byte 102,15,56,0,253
-.L028ccm64_enc_outer:
+.L030ccm64_enc_outer:
movups (%ebp),%xmm0
movl %ebx,%ecx
movups (%esi),%xmm6
@@ -643,7 +653,7 @@ aesni_ccm64_encrypt_blocks:
xorps %xmm6,%xmm0
xorps %xmm0,%xmm3
movups 32(%ebp),%xmm0
-.L029ccm64_enc2_loop:
+.L031ccm64_enc2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -651,7 +661,7 @@ aesni_ccm64_encrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz .L029ccm64_enc2_loop
+ jnz .L031ccm64_enc2_loop
.byte 102,15,56,220,209
.byte 102,15,56,220,217
paddq 16(%esp),%xmm7
@@ -664,10 +674,18 @@ aesni_ccm64_encrypt_blocks:
movups %xmm6,(%edi)
.byte 102,15,56,0,213
leal 16(%edi),%edi
- jnz .L028ccm64_enc_outer
+ jnz .L030ccm64_enc_outer
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -716,12 +734,12 @@ aesni_ccm64_decrypt_blocks:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L030enc1_loop_5:
+.L032enc1_loop_5:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L030enc1_loop_5
+ jnz .L032enc1_loop_5
.byte 102,15,56,221,209
shll $4,%ebx
movl $16,%ecx
@@ -731,16 +749,16 @@ aesni_ccm64_decrypt_blocks:
subl %ebx,%ecx
leal 32(%ebp,%ebx,1),%edx
movl %ecx,%ebx
- jmp .L031ccm64_dec_outer
+ jmp .L033ccm64_dec_outer
.align 16
-.L031ccm64_dec_outer:
+.L033ccm64_dec_outer:
xorps %xmm2,%xmm6
movdqa %xmm7,%xmm2
movups %xmm6,(%edi)
leal 16(%edi),%edi
.byte 102,15,56,0,213
subl $1,%eax
- jz .L032ccm64_dec_break
+ jz .L034ccm64_dec_break
movups (%ebp),%xmm0
movl %ebx,%ecx
movups 16(%ebp),%xmm1
@@ -748,7 +766,7 @@ aesni_ccm64_decrypt_blocks:
xorps %xmm0,%xmm2
xorps %xmm6,%xmm3
movups 32(%ebp),%xmm0
-.L033ccm64_dec2_loop:
+.L035ccm64_dec2_loop:
.byte 102,15,56,220,209
.byte 102,15,56,220,217
movups (%edx,%ecx,1),%xmm1
@@ -756,7 +774,7 @@ aesni_ccm64_decrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
movups -16(%edx,%ecx,1),%xmm0
- jnz .L033ccm64_dec2_loop
+ jnz .L035ccm64_dec2_loop
movups (%esi),%xmm6
paddq 16(%esp),%xmm7
.byte 102,15,56,220,209
@@ -764,9 +782,9 @@ aesni_ccm64_decrypt_blocks:
.byte 102,15,56,221,208
.byte 102,15,56,221,216
leal 16(%esi),%esi
- jmp .L031ccm64_dec_outer
+ jmp .L033ccm64_dec_outer
.align 16
-.L032ccm64_dec_break:
+.L034ccm64_dec_break:
movl 240(%ebp),%ecx
movl %ebp,%edx
movups (%edx),%xmm0
@@ -774,16 +792,24 @@ aesni_ccm64_decrypt_blocks:
xorps %xmm0,%xmm6
leal 32(%edx),%edx
xorps %xmm6,%xmm3
-.L034enc1_loop_6:
+.L036enc1_loop_6:
.byte 102,15,56,220,217
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L034enc1_loop_6
+ jnz .L036enc1_loop_6
.byte 102,15,56,221,217
movl 48(%esp),%esp
movl 40(%esp),%edi
movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
popl %edi
popl %esi
popl %ebx
@@ -810,7 +836,7 @@ aesni_ctr32_encrypt_blocks:
andl $-16,%esp
movl %ebp,80(%esp)
cmpl $1,%eax
- je .L035ctr32_one_shortcut
+ je .L037ctr32_one_shortcut
movdqu (%ebx),%xmm7
movl $202182159,(%esp)
movl $134810123,4(%esp)
@@ -848,7 +874,7 @@ aesni_ctr32_encrypt_blocks:
pshufd $192,%xmm0,%xmm2
pshufd $128,%xmm0,%xmm3
cmpl $6,%eax
- jb .L036ctr32_tail
+ jb .L038ctr32_tail
pxor %xmm6,%xmm7
shll $4,%ecx
movl $16,%ebx
@@ -857,9 +883,9 @@ aesni_ctr32_encrypt_blocks:
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
subl $6,%eax
- jmp .L037ctr32_loop6
+ jmp .L039ctr32_loop6
.align 16
-.L037ctr32_loop6:
+.L039ctr32_loop6:
pshufd $64,%xmm0,%xmm4
movdqa 32(%esp),%xmm0
pshufd $192,%xmm1,%xmm5
@@ -913,27 +939,27 @@ aesni_ctr32_encrypt_blocks:
leal 96(%edi),%edi
pshufd $128,%xmm0,%xmm3
subl $6,%eax
- jnc .L037ctr32_loop6
+ jnc .L039ctr32_loop6
addl $6,%eax
- jz .L038ctr32_ret
+ jz .L040ctr32_ret
movdqu (%ebp),%xmm7
movl %ebp,%edx
pxor 32(%esp),%xmm7
movl 240(%ebp),%ecx
-.L036ctr32_tail:
+.L038ctr32_tail:
por %xmm7,%xmm2
cmpl $2,%eax
- jb .L039ctr32_one
+ jb .L041ctr32_one
pshufd $64,%xmm0,%xmm4
por %xmm7,%xmm3
- je .L040ctr32_two
+ je .L042ctr32_two
pshufd $192,%xmm1,%xmm5
por %xmm7,%xmm4
cmpl $4,%eax
- jb .L041ctr32_three
+ jb .L043ctr32_three
pshufd $128,%xmm1,%xmm6
por %xmm7,%xmm5
- je .L042ctr32_four
+ je .L044ctr32_four
por %xmm7,%xmm6
call _aesni_encrypt6
movups (%esi),%xmm1
@@ -951,29 +977,29 @@ aesni_ctr32_encrypt_blocks:
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L035ctr32_one_shortcut:
+.L037ctr32_one_shortcut:
movups (%ebx),%xmm2
movl 240(%edx),%ecx
-.L039ctr32_one:
+.L041ctr32_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L043enc1_loop_7:
+.L045enc1_loop_7:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L043enc1_loop_7
+ jnz .L045enc1_loop_7
.byte 102,15,56,221,209
movups (%esi),%xmm6
xorps %xmm2,%xmm6
movups %xmm6,(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L040ctr32_two:
+.L042ctr32_two:
call _aesni_encrypt2
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -981,9 +1007,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L041ctr32_three:
+.L043ctr32_three:
call _aesni_encrypt3
movups (%esi),%xmm5
movups 16(%esi),%xmm6
@@ -994,9 +1020,9 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm7,%xmm4
movups %xmm3,16(%edi)
movups %xmm4,32(%edi)
- jmp .L038ctr32_ret
+ jmp .L040ctr32_ret
.align 16
-.L042ctr32_four:
+.L044ctr32_four:
call _aesni_encrypt4
movups (%esi),%xmm6
movups 16(%esi),%xmm7
@@ -1010,7 +1036,18 @@ aesni_ctr32_encrypt_blocks:
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
movups %xmm5,48(%edi)
-.L038ctr32_ret:
+.L040ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
movl 80(%esp),%esp
popl %edi
popl %esi
@@ -1036,12 +1073,12 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L044enc1_loop_8:
+.L046enc1_loop_8:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L044enc1_loop_8
+ jnz .L046enc1_loop_8
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1065,14 +1102,14 @@ aesni_xts_encrypt:
movl %edx,%ebp
movl %ecx,%ebx
subl $96,%eax
- jc .L045xts_enc_short
+ jc .L047xts_enc_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp .L046xts_enc_loop6
+ jmp .L048xts_enc_loop6
.align 16
-.L046xts_enc_loop6:
+.L048xts_enc_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1161,23 +1198,23 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L046xts_enc_loop6
+ jnc .L048xts_enc_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L045xts_enc_short:
+.L047xts_enc_short:
addl $96,%eax
- jz .L047xts_enc_done6x
+ jz .L049xts_enc_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L048xts_enc_one
+ jb .L050xts_enc_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L049xts_enc_two
+ je .L051xts_enc_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1186,7 +1223,7 @@ aesni_xts_encrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L050xts_enc_three
+ jb .L052xts_enc_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1196,7 +1233,7 @@ aesni_xts_encrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L051xts_enc_four
+ je .L053xts_enc_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1228,9 +1265,9 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L048xts_enc_one:
+.L050xts_enc_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1238,20 +1275,20 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L053enc1_loop_9:
+.L055enc1_loop_9:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L053enc1_loop_9
+ jnz .L055enc1_loop_9
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L049xts_enc_two:
+.L051xts_enc_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1265,9 +1302,9 @@ aesni_xts_encrypt:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L050xts_enc_three:
+.L052xts_enc_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1285,9 +1322,9 @@ aesni_xts_encrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L051xts_enc_four:
+.L053xts_enc_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1309,28 +1346,28 @@ aesni_xts_encrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L052xts_enc_done
+ jmp .L054xts_enc_done
.align 16
-.L047xts_enc_done6x:
+.L049xts_enc_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L054xts_enc_ret
+ jz .L056xts_enc_ret
movdqa %xmm1,%xmm5
movl %eax,112(%esp)
- jmp .L055xts_enc_steal
+ jmp .L057xts_enc_steal
.align 16
-.L052xts_enc_done:
+.L054xts_enc_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L054xts_enc_ret
+ jz .L056xts_enc_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm5
paddq %xmm1,%xmm1
pand 96(%esp),%xmm5
pxor %xmm1,%xmm5
-.L055xts_enc_steal:
+.L057xts_enc_steal:
movzbl (%esi),%ecx
movzbl -16(%edi),%edx
leal 1(%esi),%esi
@@ -1338,7 +1375,7 @@ aesni_xts_encrypt:
movb %dl,(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L055xts_enc_steal
+ jnz .L057xts_enc_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1348,16 +1385,30 @@ aesni_xts_encrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L056enc1_loop_10:
+.L058enc1_loop_10:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L056enc1_loop_10
+ jnz .L058enc1_loop_10
.byte 102,15,56,221,209
xorps %xmm5,%xmm2
movups %xmm2,-16(%edi)
-.L054xts_enc_ret:
+.L056xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1383,12 +1434,12 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L057enc1_loop_11:
+.L059enc1_loop_11:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L057enc1_loop_11
+ jnz .L059enc1_loop_11
.byte 102,15,56,221,209
movl 20(%esp),%esi
movl 24(%esp),%edi
@@ -1417,14 +1468,14 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
andl $-16,%eax
subl $96,%eax
- jc .L058xts_dec_short
+ jc .L060xts_dec_short
shll $4,%ecx
movl $16,%ebx
subl %ecx,%ebx
leal 32(%edx,%ecx,1),%edx
- jmp .L059xts_dec_loop6
+ jmp .L061xts_dec_loop6
.align 16
-.L059xts_dec_loop6:
+.L061xts_dec_loop6:
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,(%esp)
@@ -1513,23 +1564,23 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
subl $96,%eax
- jnc .L059xts_dec_loop6
+ jnc .L061xts_dec_loop6
movl 240(%ebp),%ecx
movl %ebp,%edx
movl %ecx,%ebx
-.L058xts_dec_short:
+.L060xts_dec_short:
addl $96,%eax
- jz .L060xts_dec_done6x
+ jz .L062xts_dec_done6x
movdqa %xmm1,%xmm5
cmpl $32,%eax
- jb .L061xts_dec_one
+ jb .L063xts_dec_one
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
paddq %xmm1,%xmm1
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
- je .L062xts_dec_two
+ je .L064xts_dec_two
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm6
@@ -1538,7 +1589,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
cmpl $64,%eax
- jb .L063xts_dec_three
+ jb .L065xts_dec_three
pshufd $19,%xmm0,%xmm2
pxor %xmm0,%xmm0
movdqa %xmm1,%xmm7
@@ -1548,7 +1599,7 @@ aesni_xts_decrypt:
pxor %xmm2,%xmm1
movdqa %xmm5,(%esp)
movdqa %xmm6,16(%esp)
- je .L064xts_dec_four
+ je .L066xts_dec_four
movdqa %xmm7,32(%esp)
pshufd $19,%xmm0,%xmm7
movdqa %xmm1,48(%esp)
@@ -1580,9 +1631,9 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
movups %xmm6,64(%edi)
leal 80(%edi),%edi
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L061xts_dec_one:
+.L063xts_dec_one:
movups (%esi),%xmm2
leal 16(%esi),%esi
xorps %xmm5,%xmm2
@@ -1590,20 +1641,20 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L066dec1_loop_12:
+.L068dec1_loop_12:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L066dec1_loop_12
+ jnz .L068dec1_loop_12
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
leal 16(%edi),%edi
movdqa %xmm5,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L062xts_dec_two:
+.L064xts_dec_two:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1617,9 +1668,9 @@ aesni_xts_decrypt:
movups %xmm3,16(%edi)
leal 32(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L063xts_dec_three:
+.L065xts_dec_three:
movaps %xmm1,%xmm7
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1637,9 +1688,9 @@ aesni_xts_decrypt:
movups %xmm4,32(%edi)
leal 48(%edi),%edi
movdqa %xmm7,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L064xts_dec_four:
+.L066xts_dec_four:
movaps %xmm1,%xmm6
movups (%esi),%xmm2
movups 16(%esi),%xmm3
@@ -1661,20 +1712,20 @@ aesni_xts_decrypt:
movups %xmm5,48(%edi)
leal 64(%edi),%edi
movdqa %xmm6,%xmm1
- jmp .L065xts_dec_done
+ jmp .L067xts_dec_done
.align 16
-.L060xts_dec_done6x:
+.L062xts_dec_done6x:
movl 112(%esp),%eax
andl $15,%eax
- jz .L067xts_dec_ret
+ jz .L069xts_dec_ret
movl %eax,112(%esp)
- jmp .L068xts_dec_only_one_more
+ jmp .L070xts_dec_only_one_more
.align 16
-.L065xts_dec_done:
+.L067xts_dec_done:
movl 112(%esp),%eax
pxor %xmm0,%xmm0
andl $15,%eax
- jz .L067xts_dec_ret
+ jz .L069xts_dec_ret
pcmpgtd %xmm1,%xmm0
movl %eax,112(%esp)
pshufd $19,%xmm0,%xmm2
@@ -1684,7 +1735,7 @@ aesni_xts_decrypt:
pand %xmm3,%xmm2
pcmpgtd %xmm1,%xmm0
pxor %xmm2,%xmm1
-.L068xts_dec_only_one_more:
+.L070xts_dec_only_one_more:
pshufd $19,%xmm0,%xmm5
movdqa %xmm1,%xmm6
paddq %xmm1,%xmm1
@@ -1698,16 +1749,16 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L069dec1_loop_13:
+.L071dec1_loop_13:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L069dec1_loop_13
+ jnz .L071dec1_loop_13
.byte 102,15,56,223,209
xorps %xmm5,%xmm2
movups %xmm2,(%edi)
-.L070xts_dec_steal:
+.L072xts_dec_steal:
movzbl 16(%esi),%ecx
movzbl (%edi),%edx
leal 1(%esi),%esi
@@ -1715,7 +1766,7 @@ aesni_xts_decrypt:
movb %dl,16(%edi)
leal 1(%edi),%edi
subl $1,%eax
- jnz .L070xts_dec_steal
+ jnz .L072xts_dec_steal
subl 112(%esp),%edi
movl %ebp,%edx
movl %ebx,%ecx
@@ -1725,16 +1776,30 @@ aesni_xts_decrypt:
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L071dec1_loop_14:
+.L073dec1_loop_14:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L071dec1_loop_14
+ jnz .L073dec1_loop_14
.byte 102,15,56,223,209
xorps %xmm6,%xmm2
movups %xmm2,(%edi)
-.L067xts_dec_ret:
+.L069xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
movl 116(%esp),%esp
popl %edi
popl %esi
@@ -1761,7 +1826,7 @@ aesni_cbc_encrypt:
movl 32(%esp),%edx
movl 36(%esp),%ebp
testl %eax,%eax
- jz .L072cbc_abort
+ jz .L074cbc_abort
cmpl $0,40(%esp)
xchgl %esp,%ebx
movups (%ebp),%xmm7
@@ -1769,14 +1834,14 @@ aesni_cbc_encrypt:
movl %edx,%ebp
movl %ebx,16(%esp)
movl %ecx,%ebx
- je .L073cbc_decrypt
+ je .L075cbc_decrypt
movaps %xmm7,%xmm2
cmpl $16,%eax
- jb .L074cbc_enc_tail
+ jb .L076cbc_enc_tail
subl $16,%eax
- jmp .L075cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L075cbc_enc_loop:
+.L077cbc_enc_loop:
movups (%esi),%xmm7
leal 16(%esi),%esi
movups (%edx),%xmm0
@@ -1784,24 +1849,25 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm7
leal 32(%edx),%edx
xorps %xmm7,%xmm2
-.L076enc1_loop_15:
+.L078enc1_loop_15:
.byte 102,15,56,220,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L076enc1_loop_15
+ jnz .L078enc1_loop_15
.byte 102,15,56,221,209
movl %ebx,%ecx
movl %ebp,%edx
movups %xmm2,(%edi)
leal 16(%edi),%edi
subl $16,%eax
- jnc .L075cbc_enc_loop
+ jnc .L077cbc_enc_loop
addl $16,%eax
- jnz .L074cbc_enc_tail
+ jnz .L076cbc_enc_tail
movaps %xmm2,%xmm7
- jmp .L077cbc_ret
-.L074cbc_enc_tail:
+ pxor %xmm2,%xmm2
+ jmp .L079cbc_ret
+.L076cbc_enc_tail:
movl %eax,%ecx
.long 2767451785
movl $16,%ecx
@@ -1812,20 +1878,20 @@ aesni_cbc_encrypt:
movl %ebx,%ecx
movl %edi,%esi
movl %ebp,%edx
- jmp .L075cbc_enc_loop
+ jmp .L077cbc_enc_loop
.align 16
-.L073cbc_decrypt:
+.L075cbc_decrypt:
cmpl $80,%eax
- jbe .L078cbc_dec_tail
+ jbe .L080cbc_dec_tail
movaps %xmm7,(%esp)
subl $80,%eax
- jmp .L079cbc_dec_loop6_enter
+ jmp .L081cbc_dec_loop6_enter
.align 16
-.L080cbc_dec_loop6:
+.L082cbc_dec_loop6:
movaps %xmm0,(%esp)
movups %xmm7,(%edi)
leal 16(%edi),%edi
-.L079cbc_dec_loop6_enter:
+.L081cbc_dec_loop6_enter:
movdqu (%esi),%xmm2
movdqu 16(%esi),%xmm3
movdqu 32(%esi),%xmm4
@@ -1855,28 +1921,28 @@ aesni_cbc_encrypt:
movups %xmm6,64(%edi)
leal 80(%edi),%edi
subl $96,%eax
- ja .L080cbc_dec_loop6
+ ja .L082cbc_dec_loop6
movaps %xmm7,%xmm2
movaps %xmm0,%xmm7
addl $80,%eax
- jle .L081cbc_dec_tail_collected
+ jle .L083cbc_dec_clear_tail_collected
movups %xmm2,(%edi)
leal 16(%edi),%edi
-.L078cbc_dec_tail:
+.L080cbc_dec_tail:
movups (%esi),%xmm2
movaps %xmm2,%xmm6
cmpl $16,%eax
- jbe .L082cbc_dec_one
+ jbe .L084cbc_dec_one
movups 16(%esi),%xmm3
movaps %xmm3,%xmm5
cmpl $32,%eax
- jbe .L083cbc_dec_two
+ jbe .L085cbc_dec_two
movups 32(%esi),%xmm4
cmpl $48,%eax
- jbe .L084cbc_dec_three
+ jbe .L086cbc_dec_three
movups 48(%esi),%xmm5
cmpl $64,%eax
- jbe .L085cbc_dec_four
+ jbe .L087cbc_dec_four
movups 64(%esi),%xmm6
movaps %xmm7,(%esp)
movups (%esi),%xmm2
@@ -1894,55 +1960,62 @@ aesni_cbc_encrypt:
xorps %xmm0,%xmm6
movups %xmm2,(%edi)
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
leal 64(%edi),%edi
movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
subl $80,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L082cbc_dec_one:
+.L084cbc_dec_one:
movups (%edx),%xmm0
movups 16(%edx),%xmm1
leal 32(%edx),%edx
xorps %xmm0,%xmm2
-.L086dec1_loop_16:
+.L089dec1_loop_16:
.byte 102,15,56,222,209
decl %ecx
movups (%edx),%xmm1
leal 16(%edx),%edx
- jnz .L086dec1_loop_16
+ jnz .L089dec1_loop_16
.byte 102,15,56,223,209
xorps %xmm7,%xmm2
movaps %xmm6,%xmm7
subl $16,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L083cbc_dec_two:
+.L085cbc_dec_two:
call _aesni_decrypt2
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
movups %xmm2,(%edi)
movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
leal 16(%edi),%edi
movaps %xmm5,%xmm7
subl $32,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L084cbc_dec_three:
+.L086cbc_dec_three:
call _aesni_decrypt3
xorps %xmm7,%xmm2
xorps %xmm6,%xmm3
xorps %xmm5,%xmm4
movups %xmm2,(%edi)
movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
leal 32(%edi),%edi
movups 32(%esi),%xmm7
subl $48,%eax
- jmp .L081cbc_dec_tail_collected
+ jmp .L088cbc_dec_tail_collected
.align 16
-.L085cbc_dec_four:
+.L087cbc_dec_four:
call _aesni_decrypt4
movups 16(%esi),%xmm1
movups 32(%esi),%xmm0
@@ -1952,28 +2025,44 @@ aesni_cbc_encrypt:
movups %xmm2,(%edi)
xorps %xmm1,%xmm4
movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
xorps %xmm0,%xmm5
movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
leal 48(%edi),%edi
movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
subl $64,%eax
-.L081cbc_dec_tail_collected:
+ jmp .L088cbc_dec_tail_collected
+.align 16
+.L083cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L088cbc_dec_tail_collected:
andl $15,%eax
- jnz .L087cbc_dec_tail_partial
+ jnz .L090cbc_dec_tail_partial
movups %xmm2,(%edi)
- jmp .L077cbc_ret
+ pxor %xmm0,%xmm0
+ jmp .L079cbc_ret
.align 16
-.L087cbc_dec_tail_partial:
+.L090cbc_dec_tail_partial:
movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
movl $16,%ecx
movl %esp,%esi
subl %eax,%ecx
.long 2767451785
-.L077cbc_ret:
+ movdqa %xmm2,(%esp)
+.L079cbc_ret:
movl 16(%esp),%esp
movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
movups %xmm7,(%ebp)
-.L072cbc_abort:
+ pxor %xmm7,%xmm7
+.L074cbc_abort:
popl %edi
popl %esi
popl %ebx
@@ -1984,52 +2073,62 @@ aesni_cbc_encrypt:
.type _aesni_set_encrypt_key,@function
.align 16
_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
testl %eax,%eax
- jz .L088bad_pointer
+ jz .L091bad_pointer
testl %edx,%edx
- jz .L088bad_pointer
+ jz .L091bad_pointer
+ call .L092pic
+.L092pic:
+ popl %ebx
+ leal .Lkey_const-.L092pic(%ebx),%ebx
+ leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
movups (%eax),%xmm0
xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
leal 16(%edx),%edx
+ andl $268437504,%ebp
cmpl $256,%ecx
- je .L08914rounds
+ je .L09314rounds
cmpl $192,%ecx
- je .L09012rounds
+ je .L09412rounds
cmpl $128,%ecx
- jne .L091bad_keybits
+ jne .L095bad_keybits
.align 16
-.L09210rounds:
+.L09610rounds:
+ cmpl $268435456,%ebp
+ je .L09710rounds_alt
movl $9,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,200,1
- call .L093key_128_cold
+ call .L098key_128_cold
.byte 102,15,58,223,200,2
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,4
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,8
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,16
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,32
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,64
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,128
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,27
- call .L094key_128
+ call .L099key_128
.byte 102,15,58,223,200,54
- call .L094key_128
+ call .L099key_128
movups %xmm0,(%edx)
movl %ecx,80(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L094key_128:
+.L099key_128:
movups %xmm0,(%edx)
leal 16(%edx),%edx
-.L093key_128_cold:
+.L098key_128_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2038,38 +2137,91 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L09012rounds:
+.L09710rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L101loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L101loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L100good_key
+.align 16
+.L09412rounds:
movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10212rounds_alt
movl $11,%ecx
movups %xmm0,-16(%edx)
.byte 102,15,58,223,202,1
- call .L095key_192a_cold
+ call .L103key_192a_cold
.byte 102,15,58,223,202,2
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,4
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,8
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,16
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,32
- call .L096key_192b
+ call .L104key_192b
.byte 102,15,58,223,202,64
- call .L097key_192a
+ call .L105key_192a
.byte 102,15,58,223,202,128
- call .L096key_192b
+ call .L104key_192b
movups %xmm0,(%edx)
movl %ecx,48(%edx)
- xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L097key_192a:
+.L105key_192a:
movups %xmm0,(%edx)
leal 16(%edx),%edx
.align 16
-.L095key_192a_cold:
+.L103key_192a_cold:
movaps %xmm2,%xmm5
-.L098key_192b_warm:
+.L106key_192b_warm:
shufps $16,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
@@ -2083,56 +2235,90 @@ _aesni_set_encrypt_key:
pxor %xmm3,%xmm2
ret
.align 16
-.L096key_192b:
+.L104key_192b:
movaps %xmm0,%xmm3
shufps $68,%xmm0,%xmm5
movups %xmm5,(%edx)
shufps $78,%xmm2,%xmm3
movups %xmm3,16(%edx)
leal 32(%edx),%edx
- jmp .L098key_192b_warm
+ jmp .L106key_192b_warm
+.align 16
+.L10212rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L107loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L107loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L100good_key
.align 16
-.L08914rounds:
+.L09314rounds:
movups 16(%eax),%xmm2
- movl $13,%ecx
leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L10814rounds_alt
+ movl $13,%ecx
movups %xmm0,-32(%edx)
movups %xmm2,-16(%edx)
.byte 102,15,58,223,202,1
- call .L099key_256a_cold
+ call .L109key_256a_cold
.byte 102,15,58,223,200,1
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,2
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,2
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,4
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,4
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,8
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,8
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,16
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,16
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,32
- call .L101key_256a
+ call .L111key_256a
.byte 102,15,58,223,200,32
- call .L100key_256b
+ call .L110key_256b
.byte 102,15,58,223,202,64
- call .L101key_256a
+ call .L111key_256a
movups %xmm0,(%edx)
movl %ecx,16(%edx)
xorl %eax,%eax
- ret
+ jmp .L100good_key
.align 16
-.L101key_256a:
+.L111key_256a:
movups %xmm2,(%edx)
leal 16(%edx),%edx
-.L099key_256a_cold:
+.L109key_256a_cold:
shufps $16,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps $140,%xmm0,%xmm4
@@ -2141,7 +2327,7 @@ _aesni_set_encrypt_key:
xorps %xmm1,%xmm0
ret
.align 16
-.L100key_256b:
+.L110key_256b:
movups %xmm0,(%edx)
leal 16(%edx),%edx
shufps $16,%xmm2,%xmm4
@@ -2151,13 +2337,70 @@ _aesni_set_encrypt_key:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
ret
+.align 16
+.L10814rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L112loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L113done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L112loop_key256
+.L113done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L100good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
.align 4
-.L088bad_pointer:
+.L091bad_pointer:
movl $-1,%eax
+ popl %ebx
+ popl %ebp
ret
.align 4
-.L091bad_keybits:
+.L095bad_keybits:
+ pxor %xmm0,%xmm0
movl $-2,%eax
+ popl %ebx
+ popl %ebp
ret
.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
.globl aesni_set_encrypt_key
@@ -2185,7 +2428,7 @@ aesni_set_decrypt_key:
movl 12(%esp),%edx
shll $4,%ecx
testl %eax,%eax
- jnz .L102dec_key_ret
+ jnz .L114dec_key_ret
leal 16(%edx,%ecx,1),%eax
movups (%edx),%xmm0
movups (%eax),%xmm1
@@ -2193,7 +2436,7 @@ aesni_set_decrypt_key:
movups %xmm1,(%edx)
leal 16(%edx),%edx
leal -16(%eax),%eax
-.L103dec_key_inverse:
+.L115dec_key_inverse:
movups (%edx),%xmm0
movups (%eax),%xmm1
.byte 102,15,56,219,192
@@ -2203,14 +2446,22 @@ aesni_set_decrypt_key:
movups %xmm0,16(%eax)
movups %xmm1,-16(%edx)
cmpl %edx,%eax
- ja .L103dec_key_inverse
+ ja .L115dec_key_inverse
movups (%edx),%xmm0
.byte 102,15,56,219,192
movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
xorl %eax,%eax
-.L102dec_key_ret:
+.L114dec_key_ret:
ret
.size aesni_set_decrypt_key,.-.L_aesni_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
diff --git a/linux-x86/crypto/bn/bn-586.S b/linux-x86/crypto/bn/bn-586.S
index b953393..773beff 100644
--- a/linux-x86/crypto/bn/bn-586.S
+++ b/linux-x86/crypto/bn/bn-586.S
@@ -7,6 +7,102 @@
.align 16
bn_mul_add_words:
.L_bn_mul_add_words_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp .L002maw_sse2_entry
+.align 16
+.L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz .L004maw_sse2_exit
+.L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz .L003maw_sse2_unrolled
+.align 4
+.L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L005maw_sse2_loop
+.L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L001maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -19,9 +115,9 @@ bn_mul_add_words:
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
- jz .L000maw_finish
+ jz .L006maw_finish
.align 16
-.L001maw_loop:
+.L007maw_loop:
movl (%ebx),%eax
mull %ebp
@@ -98,13 +194,13 @@ bn_mul_add_words:
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
- jnz .L001maw_loop
-.L000maw_finish:
+ jnz .L007maw_loop
+.L006maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
- jnz .L002maw_finish2
- jmp .L003maw_end
-.L002maw_finish2:
+ jnz .L008maw_finish2
+ jmp .L009maw_end
+.L008maw_finish2:
movl (%ebx),%eax
mull %ebp
@@ -115,7 +211,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 4(%ebx),%eax
mull %ebp
@@ -126,7 +222,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 8(%ebx),%eax
mull %ebp
@@ -137,7 +233,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 12(%ebx),%eax
mull %ebp
@@ -148,7 +244,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 16(%ebx),%eax
mull %ebp
@@ -159,7 +255,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 20(%ebx),%eax
mull %ebp
@@ -170,7 +266,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
- jz .L003maw_end
+ jz .L009maw_end
movl 24(%ebx),%eax
mull %ebp
@@ -180,7 +276,7 @@ bn_mul_add_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L003maw_end:
+.L009maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@@ -195,6 +291,33 @@ bn_mul_add_words:
.align 16
bn_mul_words:
.L_bn_mul_words_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 16
+.L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L011mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -206,8 +329,8 @@ bn_mul_words:
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
- jz .L004mw_finish
-.L005mw_loop:
+ jz .L013mw_finish
+.L014mw_loop:
movl (%ebx),%eax
mull %ecx
@@ -268,14 +391,14 @@ bn_mul_words:
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
- jz .L004mw_finish
- jmp .L005mw_loop
-.L004mw_finish:
+ jz .L013mw_finish
+ jmp .L014mw_loop
+.L013mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
- jnz .L006mw_finish2
- jmp .L007mw_end
-.L006mw_finish2:
+ jnz .L015mw_finish2
+ jmp .L016mw_end
+.L015mw_finish2:
movl (%ebx),%eax
mull %ecx
@@ -284,7 +407,7 @@ bn_mul_words:
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 4(%ebx),%eax
mull %ecx
@@ -293,7 +416,7 @@ bn_mul_words:
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 8(%ebx),%eax
mull %ecx
@@ -302,7 +425,7 @@ bn_mul_words:
movl %eax,8(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 12(%ebx),%eax
mull %ecx
@@ -311,7 +434,7 @@ bn_mul_words:
movl %eax,12(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 16(%ebx),%eax
mull %ecx
@@ -320,7 +443,7 @@ bn_mul_words:
movl %eax,16(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 20(%ebx),%eax
mull %ecx
@@ -329,7 +452,7 @@ bn_mul_words:
movl %eax,20(%edi)
movl %edx,%esi
decl %ebp
- jz .L007mw_end
+ jz .L016mw_end
movl 24(%ebx),%eax
mull %ecx
@@ -337,7 +460,7 @@ bn_mul_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-.L007mw_end:
+.L016mw_end:
movl %esi,%eax
popl %edi
popl %esi
@@ -351,6 +474,28 @@ bn_mul_words:
.align 16
bn_sqr_words:
.L_bn_sqr_words_begin:
+ call .L017PIC_me_up
+.L017PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L018sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 16
+.L019sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz .L019sqr_sse2_loop
+ emms
+ ret
+.align 16
+.L018sqr_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -360,8 +505,8 @@ bn_sqr_words:
movl 24(%esp),%edi
movl 28(%esp),%ebx
andl $4294967288,%ebx
- jz .L008sw_finish
-.L009sw_loop:
+ jz .L020sw_finish
+.L021sw_loop:
movl (%edi),%eax
mull %eax
@@ -406,59 +551,59 @@ bn_sqr_words:
addl $32,%edi
addl $64,%esi
subl $8,%ebx
- jnz .L009sw_loop
-.L008sw_finish:
+ jnz .L021sw_loop
+.L020sw_finish:
movl 28(%esp),%ebx
andl $7,%ebx
- jz .L010sw_end
+ jz .L022sw_end
movl (%edi),%eax
mull %eax
movl %eax,(%esi)
decl %ebx
movl %edx,4(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 4(%edi),%eax
mull %eax
movl %eax,8(%esi)
decl %ebx
movl %edx,12(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 8(%edi),%eax
mull %eax
movl %eax,16(%esi)
decl %ebx
movl %edx,20(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 12(%edi),%eax
mull %eax
movl %eax,24(%esi)
decl %ebx
movl %edx,28(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 16(%edi),%eax
mull %eax
movl %eax,32(%esi)
decl %ebx
movl %edx,36(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 20(%edi),%eax
mull %eax
movl %eax,40(%esi)
decl %ebx
movl %edx,44(%esi)
- jz .L010sw_end
+ jz .L022sw_end
movl 24(%edi),%eax
mull %eax
movl %eax,48(%esi)
movl %edx,52(%esi)
-.L010sw_end:
+.L022sw_end:
popl %edi
popl %esi
popl %ebx
@@ -494,8 +639,8 @@ bn_add_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L011aw_finish
-.L012aw_loop:
+ jz .L023aw_finish
+.L024aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -573,11 +718,11 @@ bn_add_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L012aw_loop
-.L011aw_finish:
+ jnz .L024aw_loop
+.L023aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L013aw_end
+ jz .L025aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -588,7 +733,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -599,7 +744,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -610,7 +755,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -621,7 +766,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -632,7 +777,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -643,7 +788,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L013aw_end
+ jz .L025aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -653,7 +798,7 @@ bn_add_words:
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L013aw_end:
+.L025aw_end:
popl %edi
popl %esi
popl %ebx
@@ -677,8 +822,8 @@ bn_sub_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L014aw_finish
-.L015aw_loop:
+ jz .L026aw_finish
+.L027aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -756,11 +901,11 @@ bn_sub_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L015aw_loop
-.L014aw_finish:
+ jnz .L027aw_loop
+.L026aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L016aw_end
+ jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -771,7 +916,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -782,7 +927,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -793,7 +938,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -804,7 +949,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -815,7 +960,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -826,7 +971,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L016aw_end
+ jz .L028aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -836,7 +981,7 @@ bn_sub_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-.L016aw_end:
+.L028aw_end:
popl %edi
popl %esi
popl %ebx
@@ -860,8 +1005,8 @@ bn_sub_part_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz .L017aw_finish
-.L018aw_loop:
+ jz .L029aw_finish
+.L030aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@@ -939,11 +1084,11 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L018aw_loop
-.L017aw_finish:
+ jnz .L030aw_loop
+.L029aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -957,7 +1102,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -971,7 +1116,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -985,7 +1130,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -999,7 +1144,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1013,7 +1158,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1027,7 +1172,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz .L019aw_end
+ jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@@ -1040,20 +1185,20 @@ bn_sub_part_words:
addl $4,%esi
addl $4,%edi
addl $4,%ebx
-.L019aw_end:
+.L031aw_end:
cmpl $0,36(%esp)
- je .L020pw_end
+ je .L032pw_end
movl 36(%esp),%ebp
cmpl $0,%ebp
- je .L020pw_end
- jge .L021pw_pos
+ je .L032pw_end
+ jge .L033pw_pos
movl $0,%edx
subl %ebp,%edx
movl %edx,%ebp
andl $4294967288,%ebp
- jz .L022pw_neg_finish
-.L023pw_neg_loop:
+ jz .L034pw_neg_finish
+.L035pw_neg_loop:
movl $0,%ecx
movl (%edi),%edx
@@ -1130,13 +1275,13 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz .L023pw_neg_loop
-.L022pw_neg_finish:
+ jnz .L035pw_neg_loop
+.L034pw_neg_finish:
movl 36(%esp),%edx
movl $0,%ebp
subl %edx,%ebp
andl $7,%ebp
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl (%edi),%edx
@@ -1147,7 +1292,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 4(%edi),%edx
@@ -1158,7 +1303,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 8(%edi),%edx
@@ -1169,7 +1314,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 12(%edi),%edx
@@ -1180,7 +1325,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 16(%edi),%edx
@@ -1191,7 +1336,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 20(%edi),%edx
@@ -1202,7 +1347,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz .L020pw_end
+ jz .L032pw_end
movl $0,%ecx
movl 24(%edi),%edx
@@ -1212,178 +1357,178 @@ bn_sub_part_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
- jmp .L020pw_end
-.L021pw_pos:
+ jmp .L032pw_end
+.L033pw_pos:
andl $4294967288,%ebp
- jz .L024pw_pos_finish
-.L025pw_pos_loop:
+ jz .L036pw_pos_finish
+.L037pw_pos_loop:
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc .L026pw_nc0
+ jnc .L038pw_nc0
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc .L027pw_nc1
+ jnc .L039pw_nc1
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc .L028pw_nc2
+ jnc .L040pw_nc2
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc .L029pw_nc3
+ jnc .L041pw_nc3
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc .L030pw_nc4
+ jnc .L042pw_nc4
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc .L031pw_nc5
+ jnc .L043pw_nc5
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc .L032pw_nc6
+ jnc .L044pw_nc6
movl 28(%esi),%ecx
subl %eax,%ecx
movl %ecx,28(%ebx)
- jnc .L033pw_nc7
+ jnc .L045pw_nc7
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz .L025pw_pos_loop
-.L024pw_pos_finish:
+ jnz .L037pw_pos_loop
+.L036pw_pos_finish:
movl 36(%esp),%ebp
andl $7,%ebp
- jz .L020pw_end
+ jz .L032pw_end
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc .L034pw_tail_nc0
+ jnc .L046pw_tail_nc0
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc .L035pw_tail_nc1
+ jnc .L047pw_tail_nc1
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc .L036pw_tail_nc2
+ jnc .L048pw_tail_nc2
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc .L037pw_tail_nc3
+ jnc .L049pw_tail_nc3
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc .L038pw_tail_nc4
+ jnc .L050pw_tail_nc4
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc .L039pw_tail_nc5
+ jnc .L051pw_tail_nc5
decl %ebp
- jz .L020pw_end
+ jz .L032pw_end
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc .L040pw_tail_nc6
+ jnc .L052pw_tail_nc6
movl $1,%eax
- jmp .L020pw_end
-.L041pw_nc_loop:
+ jmp .L032pw_end
+.L053pw_nc_loop:
movl (%esi),%ecx
movl %ecx,(%ebx)
-.L026pw_nc0:
+.L038pw_nc0:
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-.L027pw_nc1:
+.L039pw_nc1:
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-.L028pw_nc2:
+.L040pw_nc2:
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-.L029pw_nc3:
+.L041pw_nc3:
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-.L030pw_nc4:
+.L042pw_nc4:
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-.L031pw_nc5:
+.L043pw_nc5:
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-.L032pw_nc6:
+.L044pw_nc6:
movl 28(%esi),%ecx
movl %ecx,28(%ebx)
-.L033pw_nc7:
+.L045pw_nc7:
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz .L041pw_nc_loop
+ jnz .L053pw_nc_loop
movl 36(%esp),%ebp
andl $7,%ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl (%esi),%ecx
movl %ecx,(%ebx)
-.L034pw_tail_nc0:
+.L046pw_tail_nc0:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-.L035pw_tail_nc1:
+.L047pw_tail_nc1:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-.L036pw_tail_nc2:
+.L048pw_tail_nc2:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-.L037pw_tail_nc3:
+.L049pw_tail_nc3:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-.L038pw_tail_nc4:
+.L050pw_tail_nc4:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-.L039pw_tail_nc5:
+.L051pw_tail_nc5:
decl %ebp
- jz .L042pw_nc_end
+ jz .L054pw_nc_end
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-.L040pw_tail_nc6:
-.L042pw_nc_end:
+.L052pw_tail_nc6:
+.L054pw_nc_end:
movl $0,%eax
-.L020pw_end:
+.L032pw_end:
popl %edi
popl %esi
popl %ebx
diff --git a/linux-x86/crypto/bn/x86-mont.S b/linux-x86/crypto/bn/x86-mont.S
index e2d23b3..1569b2c 100644
--- a/linux-x86/crypto/bn/x86-mont.S
+++ b/linux-x86/crypto/bn/x86-mont.S
@@ -44,6 +44,126 @@ bn_mul_mont:
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %ebp,24(%esp)
+ call .L001PIC_me_up
+.L001PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L002non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 16
+.L0031st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl .L0031st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+.L004outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+.L005inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz .L005inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle .L004outer
+ emms
+ jmp .L006common_tail
+.align 16
+.L002non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -54,12 +174,12 @@ bn_mul_mont:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz .L001bn_sqr_mont
+ jz .L007bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 16
-.L002mull:
+.L008mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -68,7 +188,7 @@ bn_mul_mont:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L002mull
+ jl .L008mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -86,9 +206,9 @@ bn_mul_mont:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp .L0032ndmadd
+ jmp .L0092ndmadd
.align 16
-.L0041stmadd:
+.L0101stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -99,7 +219,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L0041stmadd
+ jl .L0101stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -122,7 +242,7 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
.align 16
-.L0032ndmadd:
+.L0092ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -133,7 +253,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0032ndmadd
+ jl .L0092ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -149,16 +269,16 @@ bn_mul_mont:
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
- je .L005common_tail
+ je .L006common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp .L0041stmadd
+ jmp .L0101stmadd
.align 16
-.L001bn_sqr_mont:
+.L007bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -169,7 +289,7 @@ bn_mul_mont:
andl $1,%ebx
incl %ecx
.align 16
-.L006sqr:
+.L011sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -181,7 +301,7 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl .L006sqr
+ jl .L011sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -205,7 +325,7 @@ bn_mul_mont:
movl 4(%esi),%eax
movl $1,%ecx
.align 16
-.L0073rdmadd:
+.L0123rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -224,7 +344,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0073rdmadd
+ jl .L0123rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -240,7 +360,7 @@ bn_mul_mont:
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
- je .L005common_tail
+ je .L006common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
@@ -252,12 +372,12 @@ bn_mul_mont:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je .L008sqrlast
+ je .L013sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 16
-.L009sqradd:
+.L014sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -273,13 +393,13 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle .L009sqradd
+ jle .L014sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-.L008sqrlast:
+.L013sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -294,9 +414,9 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp .L0073rdmadd
+ jmp .L0123rdmadd
.align 16
-.L005common_tail:
+.L006common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -304,16 +424,16 @@ bn_mul_mont:
movl %ebx,%ecx
xorl %edx,%edx
.align 16
-.L010sub:
+.L015sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge .L010sub
+ jge .L015sub
sbbl $0,%eax
.align 16
-.L011copy:
+.L016copy:
movl (%esi,%ebx,4),%edx
movl (%edi,%ebx,4),%ebp
xorl %ebp,%edx
@@ -322,7 +442,7 @@ bn_mul_mont:
movl %ecx,(%esi,%ebx,4)
movl %edx,(%edi,%ebx,4)
decl %ebx
- jge .L011copy
+ jge .L016copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
diff --git a/linux-x86/crypto/cpu-x86-asm.S b/linux-x86/crypto/cpu-x86-asm.S
index b6f767b..24a8dd4 100644
--- a/linux-x86/crypto/cpu-x86-asm.S
+++ b/linux-x86/crypto/cpu-x86-asm.S
@@ -101,10 +101,6 @@ OPENSSL_ia32_cpuid:
cmpl $0,%ebp
jne .L005notintel
orl $1073741824,%edx
- andb $15,%ah
- cmpb $15,%ah
- jne .L005notintel
- orl $1048576,%edx
.L005notintel:
btl $28,%edx
jnc .L002generic
@@ -241,6 +237,18 @@ OPENSSL_wipe_cpu:
movl (%ecx),%ecx
btl $1,(%ecx)
jnc .L016no_x87
+ andl $83886080,%ecx
+ cmpl $83886080,%ecx
+ jne .L017no_sse2
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+.L017no_sse2:
.long 4007259865,4007259865,4007259865,4007259865,2430851995
.L016no_x87:
leal 4(%esp),%eax
@@ -257,11 +265,11 @@ OPENSSL_atomic_add:
pushl %ebx
nop
movl (%edx),%eax
-.L017spin:
+.L018spin:
leal (%eax,%ecx,1),%ebx
nop
.long 447811568
- jne .L017spin
+ jne .L018spin
movl %ebx,%eax
popl %ebx
ret
@@ -301,11 +309,11 @@ OPENSSL_indirect_call:
OPENSSL_ia32_rdrand:
.L_OPENSSL_ia32_rdrand_begin:
movl $8,%ecx
-.L018loop:
+.L019loop:
.byte 15,199,240
- jc .L019break
- loop .L018loop
-.L019break:
+ jc .L020break
+ loop .L019loop
+.L020break:
cmpl $0,%eax
cmovel %ecx,%eax
ret
diff --git a/linux-x86/crypto/sha/sha1-586.S b/linux-x86/crypto/sha/sha1-586.S
index 71ae5b3..808ccac 100644
--- a/linux-x86/crypto/sha/sha1-586.S
+++ b/linux-x86/crypto/sha/sha1-586.S
@@ -11,6 +11,23 @@ sha1_block_data_order:
pushl %ebx
pushl %esi
pushl %edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi
+ leal .LK_XX_XX-.L000pic_point(%ebp),%ebp
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ testl $512,%edx
+ jz .L001x86
+ movl 8(%esi),%ecx
+ testl $16777216,%eax
+ jz .L001x86
+ testl $536870912,%ecx
+ jnz .Lshaext_shortcut
+ jmp .Lssse3_shortcut
+.align 16
+.L001x86:
movl 20(%esp),%ebp
movl 24(%esp),%esi
movl 28(%esp),%eax
@@ -19,9 +36,9 @@ sha1_block_data_order:
addl %esi,%eax
movl %eax,104(%esp)
movl 16(%ebp),%edi
- jmp .L000loop
+ jmp .L002loop
.align 16
-.L000loop:
+.L002loop:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
@@ -1368,7 +1385,7 @@ sha1_block_data_order:
movl %ebx,12(%ebp)
movl %edx,%esi
movl %ecx,16(%ebp)
- jb .L000loop
+ jb .L002loop
addl $76,%esp
popl %edi
popl %esi
@@ -1376,6 +1393,1405 @@ sha1_block_data_order:
popl %ebp
ret
.size sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.hidden _sha1_block_data_order_shaext
+.type _sha1_block_data_order_shaext,@function
+.align 16
+_sha1_block_data_order_shaext:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lshaext_shortcut:
+ movl 20(%esp),%edi
+ movl %esp,%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ subl $32,%esp
+ movdqu (%edi),%xmm0
+ movd 16(%edi),%xmm1
+ andl $-32,%esp
+ movdqa 80(%ebp),%xmm3
+ movdqu (%esi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%esi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+.byte 102,15,56,0,251
+ jmp .L004loop_shaext
+.align 16
+.L004loop_shaext:
+ decl %ecx
+ leal 64(%esi),%eax
+ movdqa %xmm1,(%esp)
+ paddd %xmm4,%xmm1
+ cmovnel %eax,%esi
+ movdqa %xmm0,16(%esp)
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%esi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%esi),%xmm5
+.byte 102,15,56,0,227
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%esi),%xmm6
+.byte 102,15,56,0,235
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%esi),%xmm7
+.byte 102,15,56,0,243
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+ movdqa (%esp),%xmm2
+.byte 102,15,56,0,251
+.byte 15,56,200,202
+ paddd 16(%esp),%xmm0
+ jnz .L004loop_shaext
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%edi)
+ movd %xmm1,16(%edi)
+ movl %ebx,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext
+.hidden _sha1_block_data_order_ssse3
+.type _sha1_block_data_order_ssse3,@function
+.align 16
+_sha1_block_data_order_ssse3:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L005pic_point
+.L005pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L005pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp .L006loop
+.align 16
+.L006loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L007done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L006loop
+.align 16
+.L007done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.align 64
+.LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
diff --git a/linux-x86/crypto/sha/sha256-586.S b/linux-x86/crypto/sha/sha256-586.S
index fe41afc..08d9484 100644
--- a/linux-x86/crypto/sha/sha256-586.S
+++ b/linux-x86/crypto/sha/sha256-586.S
@@ -27,6 +27,27 @@ sha256_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K256(%ebp),%edx
+ movl (%edx),%ecx
+ movl 4(%edx),%ebx
+ testl $1048576,%ecx
+ jnz .L002loop
+ movl 8(%edx),%edx
+ testl $16777216,%ecx
+ jz .L003no_xmm
+ andl $1073741824,%ecx
+ andl $268435968,%ebx
+ testl $536870912,%edx
+ jnz .L004shaext
+ orl %ebx,%ecx
+ andl $1342177280,%ecx
+ cmpl $1342177280,%ecx
+ testl $512,%ebx
+ jnz .L005SSSE3
+.L003no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae .L006unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -98,7 +119,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00300_15:
+.L00700_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -136,11 +157,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00300_15
+ jne .L00700_15
movl 156(%esp),%ecx
- jmp .L00416_63
+ jmp .L00816_63
.align 16
-.L00416_63:
+.L00816_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -195,7 +216,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00416_63
+ jne .L00816_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -229,207 +250,6 @@ sha256_block_data_order:
popl %ebx
popl %ebp
ret
-.align 32
-.L005loop_shrd:
- movl (%edi),%eax
- movl 4(%edi),%ebx
- movl 8(%edi),%ecx
- bswap %eax
- movl 12(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 16(%edi),%eax
- movl 20(%edi),%ebx
- movl 24(%edi),%ecx
- bswap %eax
- movl 28(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 32(%edi),%eax
- movl 36(%edi),%ebx
- movl 40(%edi),%ecx
- bswap %eax
- movl 44(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- movl 48(%edi),%eax
- movl 52(%edi),%ebx
- movl 56(%edi),%ecx
- bswap %eax
- movl 60(%edi),%edx
- bswap %ebx
- pushl %eax
- bswap %ecx
- pushl %ebx
- bswap %edx
- pushl %ecx
- pushl %edx
- addl $64,%edi
- leal -36(%esp),%esp
- movl %edi,104(%esp)
- movl (%esi),%eax
- movl 4(%esi),%ebx
- movl 8(%esi),%ecx
- movl 12(%esi),%edi
- movl %ebx,8(%esp)
- xorl %ecx,%ebx
- movl %ecx,12(%esp)
- movl %edi,16(%esp)
- movl %ebx,(%esp)
- movl 16(%esi),%edx
- movl 20(%esi),%ebx
- movl 24(%esi),%ecx
- movl 28(%esi),%edi
- movl %ebx,24(%esp)
- movl %ecx,28(%esp)
- movl %edi,32(%esp)
-.align 16
-.L00600_15_shrd:
- movl %edx,%ecx
- movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl 96(%esp),%ebx
- shrdl $5,%ecx,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- shrdl $9,%ecx,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3248222580,%esi
- jne .L00600_15_shrd
- movl 156(%esp),%ecx
- jmp .L00716_63_shrd
-.align 16
-.L00716_63_shrd:
- movl %ecx,%ebx
- movl 104(%esp),%esi
- shrdl $11,%ecx,%ecx
- movl %esi,%edi
- shrdl $2,%esi,%esi
- xorl %ebx,%ecx
- shrl $3,%ebx
- shrdl $7,%ecx,%ecx
- xorl %edi,%esi
- xorl %ecx,%ebx
- shrdl $17,%esi,%esi
- addl 160(%esp),%ebx
- shrl $10,%edi
- addl 124(%esp),%ebx
- movl %edx,%ecx
- xorl %esi,%edi
- movl 24(%esp),%esi
- shrdl $14,%ecx,%ecx
- addl %edi,%ebx
- movl 28(%esp),%edi
- xorl %edx,%ecx
- xorl %edi,%esi
- movl %ebx,96(%esp)
- shrdl $5,%ecx,%ecx
- andl %edx,%esi
- movl %edx,20(%esp)
- xorl %ecx,%edx
- addl 32(%esp),%ebx
- xorl %edi,%esi
- shrdl $6,%edx,%edx
- movl %eax,%ecx
- addl %esi,%ebx
- shrdl $9,%ecx,%ecx
- addl %edx,%ebx
- movl 8(%esp),%edi
- xorl %eax,%ecx
- movl %eax,4(%esp)
- leal -4(%esp),%esp
- shrdl $11,%ecx,%ecx
- movl (%ebp),%esi
- xorl %eax,%ecx
- movl 20(%esp),%edx
- xorl %edi,%eax
- shrdl $2,%ecx,%ecx
- addl %esi,%ebx
- movl %eax,(%esp)
- addl %ebx,%edx
- andl 4(%esp),%eax
- addl %ecx,%ebx
- xorl %edi,%eax
- movl 156(%esp),%ecx
- addl $4,%ebp
- addl %ebx,%eax
- cmpl $3329325298,%esi
- jne .L00716_63_shrd
- movl 356(%esp),%esi
- movl 8(%esp),%ebx
- movl 16(%esp),%ecx
- addl (%esi),%eax
- addl 4(%esi),%ebx
- addl 8(%esi),%edi
- addl 12(%esi),%ecx
- movl %eax,(%esi)
- movl %ebx,4(%esi)
- movl %edi,8(%esi)
- movl %ecx,12(%esi)
- movl 24(%esp),%eax
- movl 28(%esp),%ebx
- movl 32(%esp),%ecx
- movl 360(%esp),%edi
- addl 16(%esi),%edx
- addl 20(%esi),%eax
- addl 24(%esi),%ebx
- addl 28(%esi),%ecx
- movl %edx,16(%esi)
- movl %eax,20(%esi)
- movl %ebx,24(%esi)
- movl %ecx,28(%esi)
- leal 356(%esp),%esp
- subl $256,%ebp
- cmpl 8(%esp),%edi
- jb .L005loop_shrd
- movl 12(%esp),%esp
- popl %edi
- popl %esi
- popl %ebx
- popl %ebp
- ret
.align 64
.L001K256:
.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
@@ -440,7 +260,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L008unrolled:
+.L006unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -3346,5 +3166,1414 @@ sha256_block_data_order:
popl %ebx
popl %ebp
ret
+.align 32
+.L004shaext:
+ subl $32,%esp
+ movdqu (%esi),%xmm1
+ leal 128(%ebp),%ebp
+ movdqu 16(%esi),%xmm2
+ movdqa 128(%ebp),%xmm7
+ pshufd $27,%xmm1,%xmm0
+ pshufd $177,%xmm1,%xmm1
+ pshufd $27,%xmm2,%xmm2
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .L010loop_shaext
+.align 16
+.L010loop_shaext:
+ movdqu (%edi),%xmm3
+ movdqu 16(%edi),%xmm4
+ movdqu 32(%edi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%edi),%xmm6
+ movdqa %xmm2,16(%esp)
+ movdqa -128(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,(%esp)
+.byte 15,56,203,202
+ movdqa -112(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ leal 64(%edi),%edi
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -80(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa -64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa -48(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa -32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa -16(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa (%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 16(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 32(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 48(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 64(%ebp),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 80(%ebp),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+ movdqa 96(%ebp),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa 128(%ebp),%xmm7
+.byte 15,56,203,202
+ movdqa 112(%ebp),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $14,%xmm0,%xmm0
+ cmpl %edi,%eax
+ nop
+.byte 15,56,203,202
+ paddd 16(%esp),%xmm2
+ paddd (%esp),%xmm1
+ jnz .L010loop_shaext
+ pshufd $177,%xmm2,%xmm2
+ pshufd $27,%xmm1,%xmm7
+ pshufd $177,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+ movl 44(%esp),%esp
+ movdqu %xmm1,(%esi)
+ movdqu %xmm2,16(%esi)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005SSSE3:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp .L011grand_ssse3
+.align 16
+.L011grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp .L012ssse3_00_47
+.align 16
+.L012ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L012ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L011grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
#endif
diff --git a/linux-x86/crypto/sha/sha512-586.S b/linux-x86/crypto/sha/sha512-586.S
index 17f7945..a928400 100644
--- a/linux-x86/crypto/sha/sha512-586.S
+++ b/linux-x86/crypto/sha/sha512-586.S
@@ -27,6 +27,2269 @@ sha512_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz .L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je .L003SSSE3
+ subl $80,%esp
+ jmp .L004loop_sse2
+.align 16
+.L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp .L00500_14_sse2
+.align 16
+.L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz .L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp .L00616_79_sse2
+.align 16
+.L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz .L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb .L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 32
+.L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp .L00800_47_ssse3
+.align 32
+.L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz .L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb .L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.align 16
.L002loop_x86:
movl (%edi),%eax
@@ -132,7 +2395,7 @@ sha512_block_data_order:
movl $16,%ecx
.long 2784229001
.align 16
-.L00300_15_x86:
+.L00900_15_x86:
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
@@ -239,9 +2502,9 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $148,%dl
- jne .L00300_15_x86
+ jne .L00900_15_x86
.align 16
-.L00416_79_x86:
+.L01016_79_x86:
movl 312(%esp),%ecx
movl 316(%esp),%edx
movl %ecx,%esi
@@ -414,7 +2677,7 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $23,%dl
- jne .L00416_79_x86
+ jne .L01016_79_x86
movl 840(%esp),%esi
movl 844(%esp),%edi
movl (%esi),%eax