summary | refs | log | tree | commit | diff | stats
path: root/mac-x86/crypto/bn
diff options
context:
space:
mode:
Diffstat (limited to 'mac-x86/crypto/bn')
-rw-r--r-- mac-x86/crypto/bn/bn-586.S 441
-rw-r--r-- mac-x86/crypto/bn/x86-mont.S 176
2 files changed, 445 insertions, 172 deletions
diff --git a/mac-x86/crypto/bn/bn-586.S b/mac-x86/crypto/bn/bn-586.S
index 34cf56f..0f0a94e 100644
--- a/mac-x86/crypto/bn/bn-586.S
+++ b/mac-x86/crypto/bn/bn-586.S
@@ -6,6 +6,102 @@
.align 4
# Entry point and SSE2 fast path of bn_mul_add_words; arguments are read from the stack.
# As coded below, the fast path performs, for each 32-bit word i,
#   r[i] = low32(r[i] + a[i]*w + carry), carry = upper part of that sum,
# and returns the final carry in %eax.
# Arguments as read below: 4(%esp) is used as the destination pointer r,
# 8(%esp) as the source pointer a, 12(%esp) as the word count, 16(%esp) as w.
# NOTE(review): a word count of 0 is not special-cased on this path. The test
# at L002maw_sse2_entry falls through to the single-word loop, which handles a
# word before decrementing the counter - confirm callers never pass 0.
_bn_mul_add_words:
L_bn_mul_add_words_begin:
+ call L000PIC_me_up                 # pushes the address of the next instruction (PIC base)
+L000PIC_me_up:
+ popl %eax                          # eax = runtime address of L000PIC_me_up
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax   # eax = address of OPENSSL_ia32cap_P, via the non-lazy pointer
+ btl $26,(%eax)                     # CF = bit 26 of the first capability word (the SSE2 flag, per the branch labels)
+ jnc L001maw_non_sse2               # bit clear: use the integer code path
+ movl 4(%esp),%eax                  # eax = r
+ movl 8(%esp),%edx                  # edx = a
+ movl 12(%esp),%ecx                 # ecx = remaining word count
+ movd 16(%esp),%mm0                 # mm0 = w
+ pxor %mm1,%mm1                     # mm1 = running carry, starts at 0
+ jmp L002maw_sse2_entry             # choose between the unrolled and the single-word loop
+.align 4,0x90
+L003maw_sse2_unrolled:
+ movd (%eax),%mm3                   # mm3 = r[0]
+ paddq %mm3,%mm1                    # mm1 = carry_in + r[0]
+ movd (%edx),%mm2                   # mm2 = a[0]
+ pmuludq %mm0,%mm2                  # mm2 = w*a[0] (64-bit product)
+ movd 4(%edx),%mm4                  # mm4 = a[1]
+ pmuludq %mm0,%mm4                  # mm4 = w*a[1]
+ movd 8(%edx),%mm6                  # mm6 = a[2]
+ pmuludq %mm0,%mm6                  # mm6 = w*a[2]
+ movd 12(%edx),%mm7                 # mm7 = a[3]
+ pmuludq %mm0,%mm7                  # mm7 = w*a[3]
+ paddq %mm2,%mm1                    # mm1 = carry_in + r[0] + w*a[0]
+ movd 4(%eax),%mm3                  # mm3 = r[1]
+ paddq %mm4,%mm3                    # mm3 = r[1] + w*a[1]
+ movd 8(%eax),%mm5                  # mm5 = r[2]
+ paddq %mm6,%mm5                    # mm5 = r[2] + w*a[2]
+ movd 12(%eax),%mm4                 # mm4 = r[3]
+ paddq %mm4,%mm7                    # mm7 = r[3] + w*a[3]
+ movd %mm1,(%eax)                   # r[0] = low 32 bits of the sum
+ movd 16(%edx),%mm2                 # mm2 = a[4]
+ pmuludq %mm0,%mm2                  # mm2 = w*a[4]
+ psrlq $32,%mm1                     # mm1 = carry out of word 0
+ movd 20(%edx),%mm4                 # mm4 = a[5]
+ pmuludq %mm0,%mm4                  # mm4 = w*a[5]
+ paddq %mm3,%mm1                    # mm1 = carry + r[1] + w*a[1]
+ movd 24(%edx),%mm6                 # mm6 = a[6]
+ pmuludq %mm0,%mm6                  # mm6 = w*a[6]
+ movd %mm1,4(%eax)                  # r[1] = low 32 bits
+ psrlq $32,%mm1                     # mm1 = carry out of word 1
+ movd 28(%edx),%mm3                 # mm3 = a[7]
+ addl $32,%edx                      # a += 8 words; all eight source words are loaded by now
+ pmuludq %mm0,%mm3                  # mm3 = w*a[7]
+ paddq %mm5,%mm1                    # mm1 = carry + r[2] + w*a[2]
+ movd 16(%eax),%mm5                 # mm5 = r[4]
+ paddq %mm5,%mm2                    # mm2 = r[4] + w*a[4]
+ movd %mm1,8(%eax)                  # r[2] = low 32 bits
+ psrlq $32,%mm1                     # mm1 = carry out of word 2
+ paddq %mm7,%mm1                    # mm1 = carry + r[3] + w*a[3]
+ movd 20(%eax),%mm5                 # mm5 = r[5]
+ paddq %mm5,%mm4                    # mm4 = r[5] + w*a[5]
+ movd %mm1,12(%eax)                 # r[3] = low 32 bits
+ psrlq $32,%mm1                     # mm1 = carry out of word 3
+ paddq %mm2,%mm1                    # mm1 = carry + r[4] + w*a[4]
+ movd 24(%eax),%mm5                 # mm5 = r[6]
+ paddq %mm5,%mm6                    # mm6 = r[6] + w*a[6]
+ movd %mm1,16(%eax)                 # r[4] = low 32 bits
+ psrlq $32,%mm1                     # mm1 = carry out of word 4
+ paddq %mm4,%mm1                    # mm1 = carry + r[5] + w*a[5]
+ movd 28(%eax),%mm5                 # mm5 = r[7]
+ paddq %mm5,%mm3                    # mm3 = r[7] + w*a[7]
+ movd %mm1,20(%eax)                 # r[5] = low 32 bits
+ psrlq $32,%mm1                     # mm1 = carry out of word 5
+ paddq %mm6,%mm1                    # mm1 = carry + r[6] + w*a[6]
+ movd %mm1,24(%eax)                 # r[6] = low 32 bits
+ psrlq $32,%mm1                     # mm1 = carry out of word 6
+ paddq %mm3,%mm1                    # mm1 = carry + r[7] + w*a[7]
+ movd %mm1,28(%eax)                 # r[7] = low 32 bits
+ leal 32(%eax),%eax                 # r += 8 words
+ psrlq $32,%mm1                     # mm1 = carry out of this group of eight
+ subl $8,%ecx                       # eight words consumed
+ jz L004maw_sse2_exit               # count reached zero: return the carry
+L002maw_sse2_entry:
+ testl $4294967288,%ecx             # 4294967288 = 0xfffffff8: are at least 8 words left?
+ jnz L003maw_sse2_unrolled          # yes: run another unrolled group
+.align 2,0x90
+L005maw_sse2_loop:
+ movd (%edx),%mm2                   # mm2 = a[i]
+ movd (%eax),%mm3                   # mm3 = r[i]
+ pmuludq %mm0,%mm2                  # mm2 = w*a[i]
+ leal 4(%edx),%edx                  # a++
+ paddq %mm3,%mm1                    # carry += r[i]
+ paddq %mm2,%mm1                    # carry += w*a[i]
+ movd %mm1,(%eax)                   # r[i] = low 32 bits
+ subl $1,%ecx                       # sets ZF for the jnz below; psrlq and leal leave EFLAGS untouched
+ psrlq $32,%mm1                     # carry = upper 32 bits
+ leal 4(%eax),%eax                  # r++
+ jnz L005maw_sse2_loop              # repeat until the count reaches zero
+L004maw_sse2_exit:
+ movd %mm1,%eax                     # return value = final carry
+ emms                               # clear MMX state so x87 code can run afterwards
+ ret
+.align 4,0x90
+L001maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -18,9 +114,9 @@ L_bn_mul_add_words_begin:
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
- jz L000maw_finish
+ jz L006maw_finish
.align 4,0x90
-L001maw_loop:
+L007maw_loop:
# Round 0
movl (%ebx),%eax
mull %ebp
@@ -97,13 +193,13 @@ L001maw_loop:
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
- jnz L001maw_loop
-L000maw_finish:
+ jnz L007maw_loop
+L006maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
- jnz L002maw_finish2
- jmp L003maw_end
-L002maw_finish2:
+ jnz L008maw_finish2
+ jmp L009maw_end
+L008maw_finish2:
# Tail Round 0
movl (%ebx),%eax
mull %ebp
@@ -114,7 +210,7 @@ L002maw_finish2:
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 1
movl 4(%ebx),%eax
mull %ebp
@@ -125,7 +221,7 @@ L002maw_finish2:
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 2
movl 8(%ebx),%eax
mull %ebp
@@ -136,7 +232,7 @@ L002maw_finish2:
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 3
movl 12(%ebx),%eax
mull %ebp
@@ -147,7 +243,7 @@ L002maw_finish2:
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 4
movl 16(%ebx),%eax
mull %ebp
@@ -158,7 +254,7 @@ L002maw_finish2:
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 5
movl 20(%ebx),%eax
mull %ebp
@@ -169,7 +265,7 @@ L002maw_finish2:
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
- jz L003maw_end
+ jz L009maw_end
# Tail Round 6
movl 24(%ebx),%eax
mull %ebp
@@ -179,7 +275,7 @@ L002maw_finish2:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-L003maw_end:
+L009maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@@ -192,6 +288,33 @@ L003maw_end:
.align 4
# Entry point and SSE2 fast path of bn_mul_words; arguments are read from the stack.
# As coded below, the fast path performs, for each 32-bit word i,
#   r[i] = low32(a[i]*w + carry), carry = upper 32 bits of that sum,
# and returns the final carry in %eax.
# Arguments as read below: 4(%esp) is used as the destination pointer r,
# 8(%esp) as the source pointer a, 12(%esp) as the word count, 16(%esp) as w.
# NOTE(review): the loop body runs before the counter is decremented, so a
# word count of 0 is not handled on this path - confirm callers never pass 0.
_bn_mul_words:
L_bn_mul_words_begin:
+ call L010PIC_me_up                 # pushes the address of the next instruction (PIC base)
+L010PIC_me_up:
+ popl %eax                          # eax = runtime address of L010PIC_me_up
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax   # eax = address of OPENSSL_ia32cap_P, via the non-lazy pointer
+ btl $26,(%eax)                     # CF = bit 26 of the first capability word (the SSE2 flag, per the branch labels)
+ jnc L011mw_non_sse2                # bit clear: use the integer code path
+ movl 4(%esp),%eax                  # eax = r
+ movl 8(%esp),%edx                  # edx = a
+ movl 12(%esp),%ecx                 # ecx = remaining word count
+ movd 16(%esp),%mm0                 # mm0 = w
+ pxor %mm1,%mm1                     # mm1 = running carry, starts at 0
+.align 4,0x90
+L012mw_sse2_loop:
+ movd (%edx),%mm2                   # mm2 = a[i]
+ pmuludq %mm0,%mm2                  # mm2 = w*a[i] (64-bit product)
+ leal 4(%edx),%edx                  # a++
+ paddq %mm2,%mm1                    # carry += w*a[i]
+ movd %mm1,(%eax)                   # r[i] = low 32 bits
+ subl $1,%ecx                       # sets ZF for the jnz below; psrlq and leal leave EFLAGS untouched
+ psrlq $32,%mm1                     # carry = upper 32 bits
+ leal 4(%eax),%eax                  # r++
+ jnz L012mw_sse2_loop               # repeat until the count reaches zero
+ movd %mm1,%eax                     # return value = final carry
+ emms                               # clear MMX state so x87 code can run afterwards
+ ret
+.align 4,0x90
+L011mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -203,8 +326,8 @@ L_bn_mul_words_begin:
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
- jz L004mw_finish
-L005mw_loop:
+ jz L013mw_finish
+L014mw_loop:
# Round 0
movl (%ebx),%eax
mull %ecx
@@ -265,14 +388,14 @@ L005mw_loop:
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
- jz L004mw_finish
- jmp L005mw_loop
-L004mw_finish:
+ jz L013mw_finish
+ jmp L014mw_loop
+L013mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
- jnz L006mw_finish2
- jmp L007mw_end
-L006mw_finish2:
+ jnz L015mw_finish2
+ jmp L016mw_end
+L015mw_finish2:
# Tail Round 0
movl (%ebx),%eax
mull %ecx
@@ -281,7 +404,7 @@ L006mw_finish2:
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 1
movl 4(%ebx),%eax
mull %ecx
@@ -290,7 +413,7 @@ L006mw_finish2:
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 2
movl 8(%ebx),%eax
mull %ecx
@@ -299,7 +422,7 @@ L006mw_finish2:
movl %eax,8(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 3
movl 12(%ebx),%eax
mull %ecx
@@ -308,7 +431,7 @@ L006mw_finish2:
movl %eax,12(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 4
movl 16(%ebx),%eax
mull %ecx
@@ -317,7 +440,7 @@ L006mw_finish2:
movl %eax,16(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 5
movl 20(%ebx),%eax
mull %ecx
@@ -326,7 +449,7 @@ L006mw_finish2:
movl %eax,20(%edi)
movl %edx,%esi
decl %ebp
- jz L007mw_end
+ jz L016mw_end
# Tail Round 6
movl 24(%ebx),%eax
mull %ecx
@@ -334,7 +457,7 @@ L006mw_finish2:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
-L007mw_end:
+L016mw_end:
movl %esi,%eax
popl %edi
popl %esi
@@ -346,6 +469,28 @@ L007mw_end:
.align 4
# Entry point and SSE2 fast path of bn_sqr_words; arguments are read from the stack.
# As coded below, the fast path stores the 64-bit square of each 32-bit source
# word into two consecutive destination words: (r[2i+1]:r[2i]) = a[i]*a[i].
# Arguments as read below: 4(%esp) is used as the destination pointer r,
# 8(%esp) as the source pointer a, 12(%esp) as the source word count.
# NOTE(review): the loop body runs before the counter is decremented, so a
# word count of 0 is not handled on this path - confirm callers never pass 0.
_bn_sqr_words:
L_bn_sqr_words_begin:
+ call L017PIC_me_up                 # pushes the address of the next instruction (PIC base)
+L017PIC_me_up:
+ popl %eax                          # eax = runtime address of L017PIC_me_up
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax   # eax = address of OPENSSL_ia32cap_P, via the non-lazy pointer
+ btl $26,(%eax)                     # CF = bit 26 of the first capability word (the SSE2 flag, per the branch labels)
+ jnc L018sqr_non_sse2               # bit clear: use the integer code path
+ movl 4(%esp),%eax                  # eax = r
+ movl 8(%esp),%edx                  # edx = a
+ movl 12(%esp),%ecx                 # ecx = remaining word count
+.align 4,0x90
+L019sqr_sse2_loop:
+ movd (%edx),%mm0                   # mm0 = a[i]
+ pmuludq %mm0,%mm0                  # mm0 = a[i]*a[i] (64-bit product)
+ leal 4(%edx),%edx                  # a++
+ movq %mm0,(%eax)                   # store both halves of the square at r
+ subl $1,%ecx                       # sets ZF for the jnz below; leal leaves EFLAGS untouched
+ leal 8(%eax),%eax                  # r += 2 words
+ jnz L019sqr_sse2_loop              # repeat until the count reaches zero
+ emms                               # clear MMX state so x87 code can run afterwards
+ ret
+.align 4,0x90
+L018sqr_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@@ -355,8 +500,8 @@ L_bn_sqr_words_begin:
movl 24(%esp),%edi
movl 28(%esp),%ebx
andl $4294967288,%ebx
- jz L008sw_finish
-L009sw_loop:
+ jz L020sw_finish
+L021sw_loop:
# Round 0
movl (%edi),%eax
mull %eax
@@ -401,59 +546,59 @@ L009sw_loop:
addl $32,%edi
addl $64,%esi
subl $8,%ebx
- jnz L009sw_loop
-L008sw_finish:
+ jnz L021sw_loop
+L020sw_finish:
movl 28(%esp),%ebx
andl $7,%ebx
- jz L010sw_end
+ jz L022sw_end
# Tail Round 0
movl (%edi),%eax
mull %eax
movl %eax,(%esi)
decl %ebx
movl %edx,4(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 1
movl 4(%edi),%eax
mull %eax
movl %eax,8(%esi)
decl %ebx
movl %edx,12(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 2
movl 8(%edi),%eax
mull %eax
movl %eax,16(%esi)
decl %ebx
movl %edx,20(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 3
movl 12(%edi),%eax
mull %eax
movl %eax,24(%esi)
decl %ebx
movl %edx,28(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 4
movl 16(%edi),%eax
mull %eax
movl %eax,32(%esi)
decl %ebx
movl %edx,36(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 5
movl 20(%edi),%eax
mull %eax
movl %eax,40(%esi)
decl %ebx
movl %edx,44(%esi)
- jz L010sw_end
+ jz L022sw_end
# Tail Round 6
movl 24(%edi),%eax
mull %eax
movl %eax,48(%esi)
movl %edx,52(%esi)
-L010sw_end:
+L022sw_end:
popl %edi
popl %esi
popl %ebx
@@ -485,8 +630,8 @@ L_bn_add_words_begin:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L011aw_finish
-L012aw_loop:
+ jz L023aw_finish
+L024aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -564,11 +709,11 @@ L012aw_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L012aw_loop
-L011aw_finish:
+ jnz L024aw_loop
+L023aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L013aw_end
+ jz L025aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -579,7 +724,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 1
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -590,7 +735,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 2
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -601,7 +746,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 3
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -612,7 +757,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 4
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -623,7 +768,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 5
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -634,7 +779,7 @@ L011aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L013aw_end
+ jz L025aw_end
# Tail Round 6
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -644,7 +789,7 @@ L011aw_finish:
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-L013aw_end:
+L025aw_end:
popl %edi
popl %esi
popl %ebx
@@ -666,8 +811,8 @@ L_bn_sub_words_begin:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L014aw_finish
-L015aw_loop:
+ jz L026aw_finish
+L027aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -745,11 +890,11 @@ L015aw_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L015aw_loop
-L014aw_finish:
+ jnz L027aw_loop
+L026aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L016aw_end
+ jz L028aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -760,7 +905,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 1
movl 4(%esi),%ecx
movl 4(%edi),%edx
@@ -771,7 +916,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 2
movl 8(%esi),%ecx
movl 8(%edi),%edx
@@ -782,7 +927,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 3
movl 12(%esi),%ecx
movl 12(%edi),%edx
@@ -793,7 +938,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 4
movl 16(%esi),%ecx
movl 16(%edi),%edx
@@ -804,7 +949,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 5
movl 20(%esi),%ecx
movl 20(%edi),%edx
@@ -815,7 +960,7 @@ L014aw_finish:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L016aw_end
+ jz L028aw_end
# Tail Round 6
movl 24(%esi),%ecx
movl 24(%edi),%edx
@@ -825,7 +970,7 @@ L014aw_finish:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
-L016aw_end:
+L028aw_end:
popl %edi
popl %esi
popl %ebx
@@ -847,8 +992,8 @@ L_bn_sub_part_words_begin:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
- jz L017aw_finish
-L018aw_loop:
+ jz L029aw_finish
+L030aw_loop:
# Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -926,11 +1071,11 @@ L018aw_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L018aw_loop
-L017aw_finish:
+ jnz L030aw_loop
+L029aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 0
movl (%esi),%ecx
movl (%edi),%edx
@@ -944,7 +1089,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 1
movl (%esi),%ecx
movl (%edi),%edx
@@ -958,7 +1103,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 2
movl (%esi),%ecx
movl (%edi),%edx
@@ -972,7 +1117,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 3
movl (%esi),%ecx
movl (%edi),%edx
@@ -986,7 +1131,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 4
movl (%esi),%ecx
movl (%edi),%edx
@@ -1000,7 +1145,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 5
movl (%esi),%ecx
movl (%edi),%edx
@@ -1014,7 +1159,7 @@ L017aw_finish:
addl $4,%edi
addl $4,%ebx
decl %ebp
- jz L019aw_end
+ jz L031aw_end
# Tail Round 6
movl (%esi),%ecx
movl (%edi),%edx
@@ -1027,20 +1172,20 @@ L017aw_finish:
addl $4,%esi
addl $4,%edi
addl $4,%ebx
-L019aw_end:
+L031aw_end:
cmpl $0,36(%esp)
- je L020pw_end
+ je L032pw_end
movl 36(%esp),%ebp
cmpl $0,%ebp
- je L020pw_end
- jge L021pw_pos
+ je L032pw_end
+ jge L033pw_pos
# pw_neg
movl $0,%edx
subl %ebp,%edx
movl %edx,%ebp
andl $4294967288,%ebp
- jz L022pw_neg_finish
-L023pw_neg_loop:
+ jz L034pw_neg_finish
+L035pw_neg_loop:
# dl<0 Round 0
movl $0,%ecx
movl (%edi),%edx
@@ -1117,13 +1262,13 @@ L023pw_neg_loop:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
- jnz L023pw_neg_loop
-L022pw_neg_finish:
+ jnz L035pw_neg_loop
+L034pw_neg_finish:
movl 36(%esp),%edx
movl $0,%ebp
subl %edx,%ebp
andl $7,%ebp
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 0
movl $0,%ecx
movl (%edi),%edx
@@ -1134,7 +1279,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 1
movl $0,%ecx
movl 4(%edi),%edx
@@ -1145,7 +1290,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 2
movl $0,%ecx
movl 8(%edi),%edx
@@ -1156,7 +1301,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 3
movl $0,%ecx
movl 12(%edi),%edx
@@ -1167,7 +1312,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 4
movl $0,%ecx
movl 16(%edi),%edx
@@ -1178,7 +1323,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 5
movl $0,%ecx
movl 20(%edi),%edx
@@ -1189,7 +1334,7 @@ L022pw_neg_finish:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
- jz L020pw_end
+ jz L032pw_end
# dl<0 Tail Round 6
movl $0,%ecx
movl 24(%edi),%edx
@@ -1199,181 +1344,185 @@ L022pw_neg_finish:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
- jmp L020pw_end
-L021pw_pos:
+ jmp L032pw_end
+L033pw_pos:
andl $4294967288,%ebp
- jz L024pw_pos_finish
-L025pw_pos_loop:
+ jz L036pw_pos_finish
+L037pw_pos_loop:
# dl>0 Round 0
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc L026pw_nc0
+ jnc L038pw_nc0
# dl>0 Round 1
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc L027pw_nc1
+ jnc L039pw_nc1
# dl>0 Round 2
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc L028pw_nc2
+ jnc L040pw_nc2
# dl>0 Round 3
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc L029pw_nc3
+ jnc L041pw_nc3
# dl>0 Round 4
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc L030pw_nc4
+ jnc L042pw_nc4
# dl>0 Round 5
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc L031pw_nc5
+ jnc L043pw_nc5
# dl>0 Round 6
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc L032pw_nc6
+ jnc L044pw_nc6
# dl>0 Round 7
movl 28(%esi),%ecx
subl %eax,%ecx
movl %ecx,28(%ebx)
- jnc L033pw_nc7
+ jnc L045pw_nc7
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz L025pw_pos_loop
-L024pw_pos_finish:
+ jnz L037pw_pos_loop
+L036pw_pos_finish:
movl 36(%esp),%ebp
andl $7,%ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 0
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
- jnc L034pw_tail_nc0
+ jnc L046pw_tail_nc0
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 1
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
- jnc L035pw_tail_nc1
+ jnc L047pw_tail_nc1
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 2
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
- jnc L036pw_tail_nc2
+ jnc L048pw_tail_nc2
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 3
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
- jnc L037pw_tail_nc3
+ jnc L049pw_tail_nc3
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 4
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
- jnc L038pw_tail_nc4
+ jnc L050pw_tail_nc4
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 5
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
- jnc L039pw_tail_nc5
+ jnc L051pw_tail_nc5
decl %ebp
- jz L020pw_end
+ jz L032pw_end
# dl>0 Tail Round 6
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
- jnc L040pw_tail_nc6
+ jnc L052pw_tail_nc6
movl $1,%eax
- jmp L020pw_end
-L041pw_nc_loop:
+ jmp L032pw_end
+L053pw_nc_loop:
movl (%esi),%ecx
movl %ecx,(%ebx)
-L026pw_nc0:
+L038pw_nc0:
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-L027pw_nc1:
+L039pw_nc1:
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-L028pw_nc2:
+L040pw_nc2:
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-L029pw_nc3:
+L041pw_nc3:
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-L030pw_nc4:
+L042pw_nc4:
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-L031pw_nc5:
+L043pw_nc5:
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-L032pw_nc6:
+L044pw_nc6:
movl 28(%esi),%ecx
movl %ecx,28(%ebx)
-L033pw_nc7:
+L045pw_nc7:
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
- jnz L041pw_nc_loop
+ jnz L053pw_nc_loop
movl 36(%esp),%ebp
andl $7,%ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl (%esi),%ecx
movl %ecx,(%ebx)
-L034pw_tail_nc0:
+L046pw_tail_nc0:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
-L035pw_tail_nc1:
+L047pw_tail_nc1:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
-L036pw_tail_nc2:
+L048pw_tail_nc2:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
-L037pw_tail_nc3:
+L049pw_tail_nc3:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
-L038pw_tail_nc4:
+L050pw_tail_nc4:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
-L039pw_tail_nc5:
+L051pw_tail_nc5:
decl %ebp
- jz L042pw_nc_end
+ jz L054pw_nc_end
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
-L040pw_tail_nc6:
-L042pw_nc_end:
+L052pw_tail_nc6:
+L054pw_nc_end:
movl $0,%eax
-L020pw_end:
+L032pw_end:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
+.section __IMPORT,__pointers,non_lazy_symbol_pointers   # Mach-O section holding non-lazy symbol pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:                       # pointer slot read by the PIC_me_up sequences in this file
+.indirect_symbol _OPENSSL_ia32cap_P                     # ties the slot to the external symbol _OPENSSL_ia32cap_P
+.long 0                                                 # 4-byte placeholder; presumably filled with the real address at load time
#endif
diff --git a/mac-x86/crypto/bn/x86-mont.S b/mac-x86/crypto/bn/x86-mont.S
index 1b79c5f..234034b 100644
--- a/mac-x86/crypto/bn/x86-mont.S
+++ b/mac-x86/crypto/bn/x86-mont.S
@@ -43,6 +43,126 @@ L_bn_mul_mont_begin:
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %ebp,24(%esp)
# SSE2 path inserted into bn_mul_mont. It is taken when bit 26 of the first
# OPENSSL_ia32cap_P word is set, and it ends by jumping to L006common_tail.
# NOTE(review): the code that fills the stack frame lies outside this hunk.
# The names ap, bp, np, n0, tp and num used in the comments below follow the
# usual Montgomery multiplication notation and are assumptions to confirm:
# ap, bp and np for the pointers at 8, 12 and 16(%esp), n0 for the dword at
# 20(%esp), tp for the scratch vector starting at 32(%esp), and ebx = num-1.
+ call L001PIC_me_up                 # pushes the address of the next instruction (PIC base)
+L001PIC_me_up:
+ popl %eax                          # eax = runtime address of L001PIC_me_up
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax   # eax = address of OPENSSL_ia32cap_P, via the non-lazy pointer
+ btl $26,(%eax)                     # CF = bit 26 of the first capability word (the SSE2 flag, per the branch label)
+ jnc L002non_sse2                   # bit clear: use the integer code path
+ movl $-1,%eax
+ movd %eax,%mm7                     # mm7 = 0x00000000ffffffff, mask for the low 32 bits
+ movl 8(%esp),%esi                  # esi = ap
+ movl 12(%esp),%edi                 # edi = bp
+ movl 16(%esp),%ebp                 # ebp = np
+ xorl %edx,%edx                     # edx = i = 0 (outer index)
+ xorl %ecx,%ecx                     # ecx = j = 0 (inner index)
+ movd (%edi),%mm4                   # mm4 = bp[0]
+ movd (%esi),%mm5                   # mm5 = ap[0]
+ movd (%ebp),%mm3                   # mm3 = np[0]
+ pmuludq %mm4,%mm5                  # mm5 = ap[0]*bp[0]
+ movq %mm5,%mm2                     # mm2 = first carry chain (ap*bp products)
+ movq %mm5,%mm0
+ pand %mm7,%mm0                     # mm0 = low 32 bits of ap[0]*bp[0]
+ pmuludq 20(%esp),%mm5              # mm5 = low32(ap[0]*bp[0]) * n0; its low 32 bits serve as the multiplier m below
+ pmuludq %mm5,%mm3                  # mm3 = np[0]*m, second carry chain
+ paddq %mm0,%mm3                    # mm3 += low32(ap[0]*bp[0])
+ movd 4(%ebp),%mm1                  # mm1 = np[1]
+ movd 4(%esi),%mm0                  # mm0 = ap[1]
+ psrlq $32,%mm2                     # keep only the carry of chain one
+ psrlq $32,%mm3                     # keep only the carry of chain two
+ incl %ecx                          # j = 1
+.align 4,0x90
+L0031st:
+ pmuludq %mm4,%mm0                  # mm0 = ap[j]*bp[0]
+ pmuludq %mm5,%mm1                  # mm1 = np[j]*m
+ paddq %mm0,%mm2                    # chain one += ap[j]*bp[0]
+ paddq %mm1,%mm3                    # chain two += np[j]*m
+ movq %mm2,%mm0
+ pand %mm7,%mm0                     # mm0 = low 32 bits of chain one
+ movd 4(%ebp,%ecx,4),%mm1           # preload np[j+1]
+ paddq %mm0,%mm3                    # chain two += low word of chain one
+ movd 4(%esi,%ecx,4),%mm0           # preload ap[j+1]
+ psrlq $32,%mm2                     # chain one keeps its carry
+ movd %mm3,28(%esp,%ecx,4)          # store the low word one slot below index j, i.e. tp[j-1]
+ psrlq $32,%mm3                     # chain two keeps its carry
+ leal 1(%ecx),%ecx                  # j++
+ cmpl %ebx,%ecx
+ jl L0031st                         # signed compare: loop while j < ebx
+ pmuludq %mm4,%mm0                  # last word: ap[j]*bp[0]
+ pmuludq %mm5,%mm1                  # last word: np[j]*m
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)          # store tp[j-1] for the last j
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3                    # combine the two final carries
+ movq %mm3,32(%esp,%ebx,4)          # store them as a 64-bit value at 32(%esp)+4*ebx (top two words of tp)
+ incl %edx                          # i = 1
+L004outer:
+ xorl %ecx,%ecx                     # j = 0
+ movd (%edi,%edx,4),%mm4            # mm4 = bp[i]
+ movd (%esi),%mm5                   # mm5 = ap[0]
+ movd 32(%esp),%mm6                 # mm6 = tp[0]
+ movd (%ebp),%mm3                   # mm3 = np[0]
+ pmuludq %mm4,%mm5                  # mm5 = ap[0]*bp[i]
+ paddq %mm6,%mm5                    # mm5 += tp[0]
+ movq %mm5,%mm0
+ movq %mm5,%mm2                     # mm2 = first carry chain
+ pand %mm7,%mm0                     # mm0 = low 32 bits of ap[0]*bp[i] + tp[0]
+ pmuludq 20(%esp),%mm5              # mm5 = that low word * n0; its low 32 bits are the new m
+ pmuludq %mm5,%mm3                  # mm3 = np[0]*m, second carry chain
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6                 # mm6 = tp[1]
+ movd 4(%ebp),%mm1                  # mm1 = np[1]
+ movd 4(%esi),%mm0                  # mm0 = ap[1]
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2                    # chain one += tp[1]
+ incl %ecx                          # j = 1
+ decl %ebx                          # ebx doubles as the inner loop down-counter
+L005inner:
+ pmuludq %mm4,%mm0                  # mm0 = ap[j]*bp[i]
+ pmuludq %mm5,%mm1                  # mm1 = np[j]*m
+ paddq %mm0,%mm2                    # chain one += ap[j]*bp[i]
+ paddq %mm1,%mm3                    # chain two += np[j]*m
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6          # preload tp[j+1]
+ pand %mm7,%mm0                     # mm0 = low 32 bits of chain one
+ movd 4(%ebp,%ecx,4),%mm1           # preload np[j+1]
+ paddq %mm0,%mm3                    # chain two += low word of chain one
+ movd 4(%esi,%ecx,4),%mm0           # preload ap[j+1]
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)          # store tp[j-1]
+ psrlq $32,%mm3
+ paddq %mm6,%mm2                    # chain one += tp[j+1]
+ decl %ebx                          # sets ZF for the jnz below; leal leaves EFLAGS untouched
+ leal 1(%ecx),%ecx                  # j++
+ jnz L005inner                      # loop until the down-counter reaches zero
+ movl %ecx,%ebx                     # restore ebx: j has counted up to the value ebx held at L004outer
+ pmuludq %mm4,%mm0                  # last word: ap[j]*bp[i]
+ pmuludq %mm5,%mm1                  # last word: np[j]*m
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)          # store tp[j-1] for the last j
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6          # mm6 = top word left by the previous outer pass
+ paddq %mm2,%mm3                    # combine the two final carries
+ paddq %mm6,%mm3                    # plus the previous top word
+ movq %mm3,32(%esp,%ebx,4)          # store the new top two words of tp
+ leal 1(%edx),%edx                  # i++
+ cmpl %ebx,%edx
+ jle L004outer                      # signed compare: loop while i <= ebx
+ emms                               # clear MMX state before continuing in integer code
+ jmp L006common_tail                # continue at the tail shared with the integer paths
+.align 4,0x90
+L002non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -53,12 +173,12 @@ L_bn_mul_mont_begin:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz L001bn_sqr_mont
+ jz L007bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 4,0x90
-L002mull:
+L008mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -67,7 +187,7 @@ L002mull:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl L002mull
+ jl L008mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -85,9 +205,9 @@ L002mull:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp L0032ndmadd
+ jmp L0092ndmadd
.align 4,0x90
-L0041stmadd:
+L0101stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -98,7 +218,7 @@ L0041stmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl L0041stmadd
+ jl L0101stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -121,7 +241,7 @@ L0041stmadd:
adcl $0,%edx
movl $1,%ecx
.align 4,0x90
-L0032ndmadd:
+L0092ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -132,7 +252,7 @@ L0032ndmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl L0032ndmadd
+ jl L0092ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -148,16 +268,16 @@ L0032ndmadd:
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
- je L005common_tail
+ je L006common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp L0041stmadd
+ jmp L0101stmadd
.align 4,0x90
-L001bn_sqr_mont:
+L007bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -168,7 +288,7 @@ L001bn_sqr_mont:
andl $1,%ebx
incl %ecx
.align 4,0x90
-L006sqr:
+L011sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -180,7 +300,7 @@ L006sqr:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl L006sqr
+ jl L011sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -204,7 +324,7 @@ L006sqr:
movl 4(%esi),%eax
movl $1,%ecx
.align 4,0x90
-L0073rdmadd:
+L0123rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -223,7 +343,7 @@ L0073rdmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl L0073rdmadd
+ jl L0123rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -239,7 +359,7 @@ L0073rdmadd:
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
- je L005common_tail
+ je L006common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
@@ -251,12 +371,12 @@ L0073rdmadd:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je L008sqrlast
+ je L013sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 4,0x90
-L009sqradd:
+L014sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -272,13 +392,13 @@ L009sqradd:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle L009sqradd
+ jle L014sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-L008sqrlast:
+L013sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -293,9 +413,9 @@ L008sqrlast:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp L0073rdmadd
+ jmp L0123rdmadd
.align 4,0x90
-L005common_tail:
+L006common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -303,16 +423,16 @@ L005common_tail:
movl %ebx,%ecx
xorl %edx,%edx
.align 4,0x90
-L010sub:
+L015sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge L010sub
+ jge L015sub
sbbl $0,%eax
.align 4,0x90
-L011copy:
+L016copy:
movl (%esi,%ebx,4),%edx
movl (%edi,%ebx,4),%ebp
xorl %ebp,%edx
@@ -321,7 +441,7 @@ L011copy:
movl %ecx,(%esi,%ebx,4)
movl %edx,(%edi,%ebx,4)
decl %ebx
- jge L011copy
+ jge L016copy
movl 24(%esp),%esp
movl $1,%eax
L000just_leave:
@@ -335,4 +455,8 @@ L000just_leave:
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
+.section __IMPORT,__pointers,non_lazy_symbol_pointers   # Mach-O section holding non-lazy symbol pointers
+L_OPENSSL_ia32cap_P$non_lazy_ptr:                       # pointer slot read by the PIC_me_up sequence in bn_mul_mont
+.indirect_symbol _OPENSSL_ia32cap_P                     # ties the slot to the external symbol _OPENSSL_ia32cap_P
+.long 0                                                 # 4-byte placeholder; presumably filled with the real address at load time
#endif