From e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5 Mon Sep 17 00:00:00 2001 From: Adam Langley Date: Mon, 11 May 2015 17:20:37 -0700 Subject: external/boringssl: bump revision. This change bumps the BoringSSL revision to the current tip-of-tree. Change-Id: I91d5bf467e16e8d86cb19a4de873985f524e5faa --- mac-x86/crypto/bn/bn-586.S | 441 +++++++++++++++++++++++++++++-------------- mac-x86/crypto/bn/x86-mont.S | 176 ++++++++++++++--- 2 files changed, 445 insertions(+), 172 deletions(-) (limited to 'mac-x86/crypto/bn') diff --git a/mac-x86/crypto/bn/bn-586.S b/mac-x86/crypto/bn/bn-586.S index 34cf56f..0f0a94e 100644 --- a/mac-x86/crypto/bn/bn-586.S +++ b/mac-x86/crypto/bn/bn-586.S @@ -6,6 +6,102 @@ .align 4 _bn_mul_add_words: L_bn_mul_add_words_begin: + call L000PIC_me_up +L000PIC_me_up: + popl %eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc L001maw_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp L002maw_sse2_entry +.align 4,0x90 +L003maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz L004maw_sse2_exit +L002maw_sse2_entry: + testl $4294967288,%ecx + jnz L003maw_sse2_unrolled +.align 2,0x90 +L005maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L005maw_sse2_loop +L004maw_sse2_exit: + movd %mm1,%eax + emms + ret +.align 4,0x90 +L001maw_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -18,9 +114,9 @@ L_bn_mul_add_words_begin: andl $4294967288,%ecx movl 32(%esp),%ebp pushl %ecx - jz L000maw_finish + jz L006maw_finish .align 4,0x90 -L001maw_loop: +L007maw_loop: # Round 0 movl (%ebx),%eax mull %ebp @@ -97,13 +193,13 @@ L001maw_loop: subl $8,%ecx leal 32(%ebx),%ebx leal 32(%edi),%edi - jnz L001maw_loop -L000maw_finish: + jnz L007maw_loop +L006maw_finish: movl 32(%esp),%ecx andl $7,%ecx - jnz L002maw_finish2 - jmp L003maw_end -L002maw_finish2: + jnz L008maw_finish2 + jmp L009maw_end +L008maw_finish2: # Tail Round 0 movl (%ebx),%eax mull %ebp @@ -114,7 +210,7 @@ L002maw_finish2: decl %ecx movl %eax,(%edi) movl %edx,%esi - jz L003maw_end + jz L009maw_end # Tail Round 1 movl 4(%ebx),%eax mull %ebp @@ -125,7 +221,7 @@ L002maw_finish2: decl %ecx movl %eax,4(%edi) movl %edx,%esi - jz L003maw_end + jz L009maw_end # Tail Round 2 movl 8(%ebx),%eax mull %ebp @@ -136,7 +232,7 @@ L002maw_finish2: decl %ecx movl %eax,8(%edi) movl %edx,%esi - jz L003maw_end + jz L009maw_end # Tail Round 3 movl 12(%ebx),%eax mull %ebp @@ -147,7 +243,7 @@ L002maw_finish2: decl %ecx movl %eax,12(%edi) movl %edx,%esi - jz L003maw_end + jz L009maw_end # Tail Round 4 movl 16(%ebx),%eax mull %ebp @@ -158,7 +254,7 @@ L002maw_finish2: decl %ecx movl %eax,16(%edi) movl %edx,%esi - jz L003maw_end + jz L009maw_end # Tail Round 5 movl 20(%ebx),%eax mull %ebp @@ -169,7 +265,7 @@ L002maw_finish2: decl %ecx movl %eax,20(%edi) movl %edx,%esi - jz L003maw_end + jz L009maw_end # Tail Round 6 movl 24(%ebx),%eax mull %ebp @@ -179,7 +275,7 @@ L002maw_finish2: adcl $0,%edx movl %eax,24(%edi) movl %edx,%esi -L003maw_end: +L009maw_end: movl %esi,%eax popl %ecx popl %edi @@ -192,6 +288,33 @@ L003maw_end: .align 4 _bn_mul_words: L_bn_mul_words_begin: + call L010PIC_me_up +L010PIC_me_up: + popl %eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc L011mw_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 4,0x90 +L012mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L012mw_sse2_loop + movd %mm1,%eax + emms + ret +.align 4,0x90 +L011mw_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -203,8 +326,8 @@ L_bn_mul_words_begin: movl 28(%esp),%ebp movl 32(%esp),%ecx andl $4294967288,%ebp - jz L004mw_finish -L005mw_loop: + jz L013mw_finish +L014mw_loop: # Round 0 movl (%ebx),%eax mull %ecx @@ -265,14 +388,14 @@ L005mw_loop: addl $32,%ebx addl $32,%edi subl $8,%ebp - jz L004mw_finish - jmp L005mw_loop -L004mw_finish: + jz L013mw_finish + jmp L014mw_loop +L013mw_finish: movl 28(%esp),%ebp andl $7,%ebp - jnz L006mw_finish2 - jmp L007mw_end -L006mw_finish2: + jnz L015mw_finish2 + jmp L016mw_end +L015mw_finish2: # Tail Round 0 movl (%ebx),%eax mull %ecx @@ -281,7 +404,7 @@ L006mw_finish2: movl %eax,(%edi) movl %edx,%esi decl %ebp - jz L007mw_end + jz L016mw_end # Tail Round 1 movl 4(%ebx),%eax mull %ecx @@ -290,7 +413,7 @@ L006mw_finish2: movl %eax,4(%edi) movl %edx,%esi decl %ebp - jz L007mw_end + jz L016mw_end # Tail Round 2 movl 8(%ebx),%eax mull %ecx @@ -299,7 +422,7 @@ L006mw_finish2: movl %eax,8(%edi) movl %edx,%esi decl %ebp - jz L007mw_end + jz L016mw_end # Tail Round 3 movl 12(%ebx),%eax mull %ecx @@ -308,7 +431,7 @@ L006mw_finish2: movl %eax,12(%edi) movl %edx,%esi decl %ebp - jz L007mw_end + jz L016mw_end # Tail Round 4 movl 16(%ebx),%eax mull %ecx @@ -317,7 +440,7 @@ L006mw_finish2: movl %eax,16(%edi) movl %edx,%esi decl %ebp - jz L007mw_end + jz L016mw_end # Tail Round 5 movl 20(%ebx),%eax mull %ecx @@ -326,7 +449,7 @@ L006mw_finish2: movl %eax,20(%edi) movl %edx,%esi decl %ebp - jz L007mw_end + jz L016mw_end # Tail Round 6 movl 24(%ebx),%eax mull %ecx @@ -334,7 +457,7 @@ L006mw_finish2: adcl $0,%edx movl %eax,24(%edi) movl %edx,%esi -L007mw_end: +L016mw_end: movl %esi,%eax popl %edi popl %esi @@ -346,6 +469,28 @@ L007mw_end: .align 4 _bn_sqr_words: L_bn_sqr_words_begin: + call L017PIC_me_up +L017PIC_me_up: + popl %eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc L018sqr_non_sse2 + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx +.align 4,0x90 +L019sqr_sse2_loop: + movd (%edx),%mm0 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + movq %mm0,(%eax) + subl $1,%ecx + leal 8(%eax),%eax + jnz L019sqr_sse2_loop + emms + ret +.align 4,0x90 +L018sqr_non_sse2: pushl %ebp pushl %ebx pushl %esi @@ -355,8 +500,8 @@ L_bn_sqr_words_begin: movl 24(%esp),%edi movl 28(%esp),%ebx andl $4294967288,%ebx - jz L008sw_finish -L009sw_loop: + jz L020sw_finish +L021sw_loop: # Round 0 movl (%edi),%eax mull %eax @@ -401,59 +546,59 @@ L009sw_loop: addl $32,%edi addl $64,%esi subl $8,%ebx - jnz L009sw_loop -L008sw_finish: + jnz L021sw_loop +L020sw_finish: movl 28(%esp),%ebx andl $7,%ebx - jz L010sw_end + jz L022sw_end # Tail Round 0 movl (%edi),%eax mull %eax movl %eax,(%esi) decl %ebx movl %edx,4(%esi) - jz L010sw_end + jz L022sw_end # Tail Round 1 movl 4(%edi),%eax mull %eax movl %eax,8(%esi) decl %ebx movl %edx,12(%esi) - jz L010sw_end + jz L022sw_end # Tail Round 2 movl 8(%edi),%eax mull %eax movl %eax,16(%esi) decl %ebx movl %edx,20(%esi) - jz L010sw_end + jz L022sw_end # Tail Round 3 movl 12(%edi),%eax mull %eax movl %eax,24(%esi) decl %ebx movl %edx,28(%esi) - jz L010sw_end + jz L022sw_end # Tail Round 4 movl 16(%edi),%eax mull %eax movl %eax,32(%esi) decl %ebx movl %edx,36(%esi) - jz L010sw_end + jz L022sw_end # Tail Round 5 movl 20(%edi),%eax mull %eax movl %eax,40(%esi) decl %ebx movl %edx,44(%esi) - jz L010sw_end + jz L022sw_end # Tail Round 6 movl 24(%edi),%eax mull %eax movl %eax,48(%esi) movl %edx,52(%esi) -L010sw_end: +L022sw_end: popl %edi popl %esi popl %ebx @@ -485,8 +630,8 @@ L_bn_add_words_begin: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz L011aw_finish -L012aw_loop: + jz L023aw_finish +L024aw_loop: # Round 0 movl (%esi),%ecx movl (%edi),%edx @@ -564,11 +709,11 @@ L012aw_loop: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz L012aw_loop -L011aw_finish: + jnz L024aw_loop +L023aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz L013aw_end + jz L025aw_end # Tail Round 0 movl (%esi),%ecx movl (%edi),%edx @@ -579,7 +724,7 @@ L011aw_finish: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz L013aw_end + jz L025aw_end # Tail Round 1 movl 4(%esi),%ecx movl 4(%edi),%edx @@ -590,7 +735,7 @@ L011aw_finish: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz L013aw_end + jz L025aw_end # Tail Round 2 movl 8(%esi),%ecx movl 8(%edi),%edx @@ -601,7 +746,7 @@ L011aw_finish: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz L013aw_end + jz L025aw_end # Tail Round 3 movl 12(%esi),%ecx movl 12(%edi),%edx @@ -612,7 +757,7 @@ L011aw_finish: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz L013aw_end + jz L025aw_end # Tail Round 4 movl 16(%esi),%ecx movl 16(%edi),%edx @@ -623,7 +768,7 @@ L011aw_finish: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz L013aw_end + jz L025aw_end # Tail Round 5 movl 20(%esi),%ecx movl 20(%edi),%edx @@ -634,7 +779,7 @@ L011aw_finish: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz L013aw_end + jz L025aw_end # Tail Round 6 movl 24(%esi),%ecx movl 24(%edi),%edx @@ -644,7 +789,7 @@ L011aw_finish: addl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -L013aw_end: +L025aw_end: popl %edi popl %esi popl %ebx @@ -666,8 +811,8 @@ L_bn_sub_words_begin: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz L014aw_finish -L015aw_loop: + jz L026aw_finish +L027aw_loop: # Round 0 movl (%esi),%ecx movl (%edi),%edx @@ -745,11 +890,11 @@ L015aw_loop: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz L015aw_loop -L014aw_finish: + jnz L027aw_loop +L026aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz L016aw_end + jz L028aw_end # Tail Round 0 movl (%esi),%ecx movl (%edi),%edx @@ -760,7 +905,7 @@ L014aw_finish: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz L016aw_end + jz L028aw_end # Tail Round 1 movl 4(%esi),%ecx movl 4(%edi),%edx @@ -771,7 +916,7 @@ L014aw_finish: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz L016aw_end + jz L028aw_end # Tail Round 2 movl 8(%esi),%ecx movl 8(%edi),%edx @@ -782,7 +927,7 @@ L014aw_finish: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz L016aw_end + jz L028aw_end # Tail Round 3 movl 12(%esi),%ecx movl 12(%edi),%edx @@ -793,7 +938,7 @@ L014aw_finish: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz L016aw_end + jz L028aw_end # Tail Round 4 movl 16(%esi),%ecx movl 16(%edi),%edx @@ -804,7 +949,7 @@ L014aw_finish: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz L016aw_end + jz L028aw_end # Tail Round 5 movl 20(%esi),%ecx movl 20(%edi),%edx @@ -815,7 +960,7 @@ L014aw_finish: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz L016aw_end + jz L028aw_end # Tail Round 6 movl 24(%esi),%ecx movl 24(%edi),%edx @@ -825,7 +970,7 @@ L014aw_finish: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) -L016aw_end: +L028aw_end: popl %edi popl %esi popl %ebx @@ -847,8 +992,8 @@ L_bn_sub_part_words_begin: movl 32(%esp),%ebp xorl %eax,%eax andl $4294967288,%ebp - jz L017aw_finish -L018aw_loop: + jz L029aw_finish +L030aw_loop: # Round 0 movl (%esi),%ecx movl (%edi),%edx @@ -926,11 +1071,11 @@ L018aw_loop: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz L018aw_loop -L017aw_finish: + jnz L030aw_loop +L029aw_finish: movl 32(%esp),%ebp andl $7,%ebp - jz L019aw_end + jz L031aw_end # Tail Round 0 movl (%esi),%ecx movl (%edi),%edx @@ -944,7 +1089,7 @@ L017aw_finish: addl $4,%edi addl $4,%ebx decl %ebp - jz L019aw_end + jz L031aw_end # Tail Round 1 movl (%esi),%ecx movl (%edi),%edx @@ -958,7 +1103,7 @@ L017aw_finish: addl $4,%edi addl $4,%ebx decl %ebp - jz L019aw_end + jz L031aw_end # Tail Round 2 movl (%esi),%ecx movl (%edi),%edx @@ -972,7 +1117,7 @@ L017aw_finish: addl $4,%edi addl $4,%ebx decl %ebp - jz L019aw_end + jz L031aw_end # Tail Round 3 movl (%esi),%ecx movl (%edi),%edx @@ -986,7 +1131,7 @@ L017aw_finish: addl $4,%edi addl $4,%ebx decl %ebp - jz L019aw_end + jz L031aw_end # Tail Round 4 movl (%esi),%ecx movl (%edi),%edx @@ -1000,7 +1145,7 @@ L017aw_finish: addl $4,%edi addl $4,%ebx decl %ebp - jz L019aw_end + jz L031aw_end # Tail Round 5 movl (%esi),%ecx movl (%edi),%edx @@ -1014,7 +1159,7 @@ L017aw_finish: addl $4,%edi addl $4,%ebx decl %ebp - jz L019aw_end + jz L031aw_end # Tail Round 6 movl (%esi),%ecx movl (%edi),%edx @@ -1027,20 +1172,20 @@ L017aw_finish: addl $4,%esi addl $4,%edi addl $4,%ebx -L019aw_end: +L031aw_end: cmpl $0,36(%esp) - je L020pw_end + je L032pw_end movl 36(%esp),%ebp cmpl $0,%ebp - je L020pw_end - jge L021pw_pos + je L032pw_end + jge L033pw_pos # pw_neg movl $0,%edx subl %ebp,%edx movl %edx,%ebp andl $4294967288,%ebp - jz L022pw_neg_finish -L023pw_neg_loop: + jz L034pw_neg_finish +L035pw_neg_loop: # dl<0 Round 0 movl $0,%ecx movl (%edi),%edx @@ -1117,13 +1262,13 @@ L023pw_neg_loop: addl $32,%edi addl $32,%ebx subl $8,%ebp - jnz L023pw_neg_loop -L022pw_neg_finish: + jnz L035pw_neg_loop +L034pw_neg_finish: movl 36(%esp),%edx movl $0,%ebp subl %edx,%ebp andl $7,%ebp - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 0 movl $0,%ecx movl (%edi),%edx @@ -1134,7 +1279,7 @@ L022pw_neg_finish: adcl $0,%eax decl %ebp movl %ecx,(%ebx) - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 1 movl $0,%ecx movl 4(%edi),%edx @@ -1145,7 +1290,7 @@ L022pw_neg_finish: adcl $0,%eax decl %ebp movl %ecx,4(%ebx) - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 2 movl $0,%ecx movl 8(%edi),%edx @@ -1156,7 +1301,7 @@ L022pw_neg_finish: adcl $0,%eax decl %ebp movl %ecx,8(%ebx) - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 3 movl $0,%ecx movl 12(%edi),%edx @@ -1167,7 +1312,7 @@ L022pw_neg_finish: adcl $0,%eax decl %ebp movl %ecx,12(%ebx) - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 4 movl $0,%ecx movl 16(%edi),%edx @@ -1178,7 +1323,7 @@ L022pw_neg_finish: adcl $0,%eax decl %ebp movl %ecx,16(%ebx) - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 5 movl $0,%ecx movl 20(%edi),%edx @@ -1189,7 +1334,7 @@ L022pw_neg_finish: adcl $0,%eax decl %ebp movl %ecx,20(%ebx) - jz L020pw_end + jz L032pw_end # dl<0 Tail Round 6 movl $0,%ecx movl 24(%edi),%edx @@ -1199,181 +1344,185 @@ L022pw_neg_finish: subl %edx,%ecx adcl $0,%eax movl %ecx,24(%ebx) - jmp L020pw_end -L021pw_pos: + jmp L032pw_end +L033pw_pos: andl $4294967288,%ebp - jz L024pw_pos_finish -L025pw_pos_loop: + jz L036pw_pos_finish +L037pw_pos_loop: # dl>0 Round 0 movl (%esi),%ecx subl %eax,%ecx movl %ecx,(%ebx) - jnc L026pw_nc0 + jnc L038pw_nc0 # dl>0 Round 1 movl 4(%esi),%ecx subl %eax,%ecx movl %ecx,4(%ebx) - jnc L027pw_nc1 + jnc L039pw_nc1 # dl>0 Round 2 movl 8(%esi),%ecx subl %eax,%ecx movl %ecx,8(%ebx) - jnc L028pw_nc2 + jnc L040pw_nc2 # dl>0 Round 3 movl 12(%esi),%ecx subl %eax,%ecx movl %ecx,12(%ebx) - jnc L029pw_nc3 + jnc L041pw_nc3 # dl>0 Round 4 movl 16(%esi),%ecx subl %eax,%ecx movl %ecx,16(%ebx) - jnc L030pw_nc4 + jnc L042pw_nc4 # dl>0 Round 5 movl 20(%esi),%ecx subl %eax,%ecx movl %ecx,20(%ebx) - jnc L031pw_nc5 + jnc L043pw_nc5 # dl>0 Round 6 movl 24(%esi),%ecx subl %eax,%ecx movl %ecx,24(%ebx) - jnc L032pw_nc6 + jnc L044pw_nc6 # dl>0 Round 7 movl 28(%esi),%ecx subl %eax,%ecx movl %ecx,28(%ebx) - jnc L033pw_nc7 + jnc L045pw_nc7 addl $32,%esi addl $32,%ebx subl $8,%ebp - jnz L025pw_pos_loop -L024pw_pos_finish: + jnz L037pw_pos_loop +L036pw_pos_finish: movl 36(%esp),%ebp andl $7,%ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 0 movl (%esi),%ecx subl %eax,%ecx movl %ecx,(%ebx) - jnc L034pw_tail_nc0 + jnc L046pw_tail_nc0 decl %ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 1 movl 4(%esi),%ecx subl %eax,%ecx movl %ecx,4(%ebx) - jnc L035pw_tail_nc1 + jnc L047pw_tail_nc1 decl %ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 2 movl 8(%esi),%ecx subl %eax,%ecx movl %ecx,8(%ebx) - jnc L036pw_tail_nc2 + jnc L048pw_tail_nc2 decl %ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 3 movl 12(%esi),%ecx subl %eax,%ecx movl %ecx,12(%ebx) - jnc L037pw_tail_nc3 + jnc L049pw_tail_nc3 decl %ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 4 movl 16(%esi),%ecx subl %eax,%ecx movl %ecx,16(%ebx) - jnc L038pw_tail_nc4 + jnc L050pw_tail_nc4 decl %ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 5 movl 20(%esi),%ecx subl %eax,%ecx movl %ecx,20(%ebx) - jnc L039pw_tail_nc5 + jnc L051pw_tail_nc5 decl %ebp - jz L020pw_end + jz L032pw_end # dl>0 Tail Round 6 movl 24(%esi),%ecx subl %eax,%ecx movl %ecx,24(%ebx) - jnc L040pw_tail_nc6 + jnc L052pw_tail_nc6 movl $1,%eax - jmp L020pw_end -L041pw_nc_loop: + jmp L032pw_end +L053pw_nc_loop: movl (%esi),%ecx movl %ecx,(%ebx) -L026pw_nc0: +L038pw_nc0: movl 4(%esi),%ecx movl %ecx,4(%ebx) -L027pw_nc1: +L039pw_nc1: movl 8(%esi),%ecx movl %ecx,8(%ebx) -L028pw_nc2: +L040pw_nc2: movl 12(%esi),%ecx movl %ecx,12(%ebx) -L029pw_nc3: +L041pw_nc3: movl 16(%esi),%ecx movl %ecx,16(%ebx) -L030pw_nc4: +L042pw_nc4: movl 20(%esi),%ecx movl %ecx,20(%ebx) -L031pw_nc5: +L043pw_nc5: movl 24(%esi),%ecx movl %ecx,24(%ebx) -L032pw_nc6: +L044pw_nc6: movl 28(%esi),%ecx movl %ecx,28(%ebx) -L033pw_nc7: +L045pw_nc7: addl $32,%esi addl $32,%ebx subl $8,%ebp - jnz L041pw_nc_loop + jnz L053pw_nc_loop movl 36(%esp),%ebp andl $7,%ebp - jz L042pw_nc_end + jz L054pw_nc_end movl (%esi),%ecx movl %ecx,(%ebx) -L034pw_tail_nc0: +L046pw_tail_nc0: decl %ebp - jz L042pw_nc_end + jz L054pw_nc_end movl 4(%esi),%ecx movl %ecx,4(%ebx) -L035pw_tail_nc1: +L047pw_tail_nc1: decl %ebp - jz L042pw_nc_end + jz L054pw_nc_end movl 8(%esi),%ecx movl %ecx,8(%ebx) -L036pw_tail_nc2: +L048pw_tail_nc2: decl %ebp - jz L042pw_nc_end + jz L054pw_nc_end movl 12(%esi),%ecx movl %ecx,12(%ebx) -L037pw_tail_nc3: +L049pw_tail_nc3: decl %ebp - jz L042pw_nc_end + jz L054pw_nc_end movl 16(%esi),%ecx movl %ecx,16(%ebx) -L038pw_tail_nc4: +L050pw_tail_nc4: decl %ebp - jz L042pw_nc_end + jz L054pw_nc_end movl 20(%esi),%ecx movl %ecx,20(%ebx) -L039pw_tail_nc5: +L051pw_tail_nc5: decl %ebp - jz L042pw_nc_end + jz L054pw_nc_end movl 24(%esi),%ecx movl %ecx,24(%ebx) -L040pw_tail_nc6: -L042pw_nc_end: +L052pw_tail_nc6: +L054pw_nc_end: movl $0,%eax -L020pw_end: +L032pw_end: popl %edi popl %esi popl %ebx popl %ebp ret +.section __IMPORT,__pointers,non_lazy_symbol_pointers +L_OPENSSL_ia32cap_P$non_lazy_ptr: +.indirect_symbol _OPENSSL_ia32cap_P +.long 0 #endif diff --git a/mac-x86/crypto/bn/x86-mont.S b/mac-x86/crypto/bn/x86-mont.S index 1b79c5f..234034b 100644 --- a/mac-x86/crypto/bn/x86-mont.S +++ b/mac-x86/crypto/bn/x86-mont.S @@ -43,6 +43,126 @@ L_bn_mul_mont_begin: movl %esi,20(%esp) leal -3(%edi),%ebx movl %ebp,24(%esp) + call L001PIC_me_up +L001PIC_me_up: + popl %eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax + btl $26,(%eax) + jnc L002non_sse2 + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 4,0x90 +L0031st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl L0031st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +L004outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +L005inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz L005inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle L004outer + emms + jmp L006common_tail +.align 4,0x90 +L002non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -53,12 +173,12 @@ L_bn_mul_mont_begin: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz L001bn_sqr_mont + jz L007bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 4,0x90 -L002mull: +L008mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -67,7 +187,7 @@ L002mull: movl (%esi,%ecx,4),%eax cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl L002mull + jl L008mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -85,9 +205,9 @@ L002mull: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp L0032ndmadd + jmp L0092ndmadd .align 4,0x90 -L0041stmadd: +L0101stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -98,7 +218,7 @@ L0041stmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl L0041stmadd + jl L0101stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -121,7 +241,7 @@ L0041stmadd: adcl $0,%edx movl $1,%ecx .align 4,0x90 -L0032ndmadd: +L0092ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -132,7 +252,7 @@ L0032ndmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl L0032ndmadd + jl L0092ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -148,16 +268,16 @@ L0032ndmadd: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je L005common_tail + je L006common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp L0041stmadd + jmp L0101stmadd .align 4,0x90 -L001bn_sqr_mont: +L007bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -168,7 +288,7 @@ L001bn_sqr_mont: andl $1,%ebx incl %ecx .align 4,0x90 -L006sqr: +L011sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -180,7 +300,7 @@ L006sqr: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl L006sqr + jl L011sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -204,7 +324,7 @@ L006sqr: movl 4(%esi),%eax movl $1,%ecx .align 4,0x90 -L0073rdmadd: +L0123rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -223,7 +343,7 @@ L0073rdmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl L0073rdmadd + jl L0123rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -239,7 +359,7 @@ L0073rdmadd: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je L005common_tail + je L006common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -251,12 +371,12 @@ L0073rdmadd: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je L008sqrlast + je L013sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 4,0x90 -L009sqradd: +L014sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -272,13 +392,13 @@ L009sqradd: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle L009sqradd + jle L014sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp -L008sqrlast: +L013sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -293,9 +413,9 @@ L008sqrlast: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp L0073rdmadd + jmp L0123rdmadd .align 4,0x90 -L005common_tail: +L006common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -303,16 +423,16 @@ L005common_tail: movl %ebx,%ecx xorl %edx,%edx .align 4,0x90 -L010sub: +L015sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge L010sub + jge L015sub sbbl $0,%eax .align 4,0x90 -L011copy: +L016copy: movl (%esi,%ebx,4),%edx movl (%edi,%ebx,4),%ebp xorl %ebp,%edx @@ -321,7 +441,7 @@ L011copy: movl %ecx,(%esi,%ebx,4) movl %edx,(%edi,%ebx,4) decl %ebx - jge L011copy + jge L016copy movl 24(%esp),%esp movl $1,%eax L000just_leave: @@ -335,4 +455,8 @@ L000just_leave: .byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 .byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 .byte 111,114,103,62,0 +.section __IMPORT,__pointers,non_lazy_symbol_pointers +L_OPENSSL_ia32cap_P$non_lazy_ptr: +.indirect_symbol _OPENSSL_ia32cap_P +.long 0 #endif -- cgit v1.1