diff options
author | Jingwei Zhang <jingwei.zhang@intel.com> | 2014-10-31 18:29:18 +0800 |
---|---|---|
committer | Christopher Ferris <cferris@google.com> | 2015-03-09 13:19:08 -0700 |
commit | 5d4f0e6a26b66f1dab8d20a65af4469c6dd7370d (patch) | |
tree | a110442bd4fa9b13f610fd29f5b6dd42fdbf32fc /libm/x86/e_hypot.S | |
parent | edc1d3e3c6266eadcd05a9aa7bcdec701ba060aa (diff) | |
download | bionic-5d4f0e6a26b66f1dab8d20a65af4469c6dd7370d.zip bionic-5d4f0e6a26b66f1dab8d20a65af4469c6dd7370d.tar.gz bionic-5d4f0e6a26b66f1dab8d20a65af4469c6dd7370d.tar.bz2 |
Add the optimized implementation of 18 math functions for x86 and x86_64 respectively
Change-Id: I31bf601448a9427f825517f3a0ff24de47f49bfa
Signed-off-by: Jingwei Zhang <jingwei.zhang@intel.com>
Signed-off-by: Mingwei Shi <mingwei.shi@intel.com>
Diffstat (limited to 'libm/x86/e_hypot.S')
-rw-r--r-- | libm/x86/e_hypot.S | 221 |
1 files changed, 221 insertions, 0 deletions
diff --git a/libm/x86/e_hypot.S b/libm/x86/e_hypot.S
new file mode 100644
index 0000000..aa6ab64
--- /dev/null
+++ b/libm/x86/e_hypot.S
@@ -0,0 +1,221 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/******************************************************************************/
+// ALGORITHM DESCRIPTION
+// ---------------------
+// Syntax: GNU as, AT&T operand order (source, destination), 32-bit x86.
+// X87 version (the arithmetic below uses this variant: fmul, faddp, fsqrt):
+//   Use 80-bit FPU precision fmul, fsqrt to compute square and sqrt.
+//
+// SSE version (described for reference; the code below does not use it):
+//   Swap x, y if |x|<|y|
+//   For x=2^k*x, get y=y*2^(-k)
+//   Get S ~ sqrt(x^2+y^2)  (leading 1 + leading 25 mantissa bits)
+//
+//   Get D = ( RN(x^2+y^2) - S^2 ) + ( x^2 - RN(x^2) ) +
+//           + ( y^2 - ((RN(x^2+y^2)-RN(x^2)) )
+//
+//   Result is 2^k*(S + Se), where Se = S*e
+//   S*e is approximated as (D/2S)*( 1 - (D/2S)^2*1.0/S )
+//
+//   Return 2^k*(S+Se)
+//
+//   For |y/x|<2^(-64), return x
+//
+//   For cases where maximum biased exponent is either greater than 7fdh or
+//   below 32, take a special path to check for special cases (0, NaN, Inf),
+//   possible overflow, and more accurate computation for denormal results
+//
+// Special cases:
+//   hypot(x,y), hypot(y,x), and hypot(x,-y) are equivalent
+//   hypot(x,+-0) is equivalent to fabs(x)
+//   hypot(x,y) = y if (x==NaN or x==INF) and y==INF
+//   hypot(x,y) = x if (x==NaN or x==INF) and y!=INF (even if y==NaN!)
+//   hypot(x,y) = y if (x!=NaN and x!=INF) and (y==NaN or y==INF)
+// Result: every path below reaches the epilogue with the value in x87 st(0).
+/******************************************************************************/
+
+#include <private/bionic_asm.h>
+# -- Begin static_func: position-independent helper, returns the address of static_const_table in %eax
+        .text
+        .align __bionic_asm_align
+        .type static_func, @function
+static_func:
+..B1.1:
+        call ..L2                                            # pushes the address of ..L2
+..L2:
+        popl %eax                                            # eax = run-time address of ..L2
+        lea _GLOBAL_OFFSET_TABLE_+[. - ..L2](%eax), %eax     # eax = address of the global offset table
+        lea static_const_table@GOTOFF(%eax), %eax            # eax = address of static_const_table
+        ret
+        .size static_func,.-static_func
+# -- End static_func
+
+# -- Begin hypot: two double arguments on the stack, result returned in st(0)
+ENTRY(hypot)
+# parameter 1: 8 + %ebp   (same slot as 160(%esp) after the 152-byte frame is reserved)
+# parameter 2: 16 + %ebp  (same slot as 168(%esp) after the 152-byte frame is reserved)
+..B2.1:
+..B2.2:
+        pushl %ebp
+        movl %esp, %ebp
+        subl $152, %esp                     # local frame; 8(%ebp) is now 160(%esp)
+        movl %ebx, 96(%esp)                 # save ebx, restored in the epilogue
+        call static_func
+        movl %eax, %ebx                     # ebx = address of static_const_table
+        movapd (%ebx), %xmm3                # xmm3 = sign-clearing mask (first 16 bytes of the table)
+        movsd 160(%esp), %xmm0              # xmm0 = x
+        movsd 168(%esp), %xmm1              # xmm1 = y
+        andpd %xmm3, %xmm0                  # xmm0 = |x|
+        andpd %xmm3, %xmm1                  # xmm1 = |y|
+        pextrw $3, %xmm0, %eax              # eax = bits 48..63 of |x|: exponent plus top 4 mantissa bits
+        pextrw $3, %xmm1, %edx              # edx = bits 48..63 of |y|
+        cmpl $24528, %eax                   # 24528 = 0x5FD0
+        ja .L_2TAG_PACKET_0.0.2             # |x| top word above the threshold: checked path
+        cmpl $24528, %edx
+        ja .L_2TAG_PACKET_0.0.2             # |y| top word above the threshold: checked path
+.L_2TAG_PACKET_1.0.2:                       # plain path: sqrt(x*x + y*y) in x87 registers
+        fldl 160(%esp)                      # st0 = x
+        fldl 168(%esp)                      # st0 = y, st1 = x
+        fxch %st(1)                         # st0 = x, st1 = y
+        fmul %st(0), %st                    # st0 = x*x
+        fxch %st(1)                         # st0 = y, st1 = x*x
+        nop
+        fmul %st(0), %st                    # st0 = y*y
+        faddp %st, %st(1)                   # st0 = x*x + y*y (stack popped)
+        fsqrt                               # st0 = sqrt(x*x + y*y)
+        jmp .L_2TAG_PACKET_2.0.2            # return st0 as is
+.L_2TAG_PACKET_0.0.2:                       # checked path: at least one top word is above 0x5FD0
+        cmpl $32752, %eax                   # 32752 = 0x7FF0: exponent field all ones (Inf or NaN)
+        movl %eax, %ecx                     # ecx = top word of |x|; mov leaves the flags intact
+        jae .L_2TAG_PACKET_3.0.2            # x is Inf or NaN
+        subl %edx, %ecx                     # ecx = top(|x|) - top(|y|)
+        cmpl $32752, %edx
+        jae .L_2TAG_PACKET_3.0.2            # y is Inf or NaN
+        addl $928, %ecx                     # 928 = 58 << 4; the exponent starts at bit 4 of the top word
+        addl %edx, %eax                     # eax = top(|x|) + top(|y|)
+        cmpl $1856, %ecx                    # unsigned test of difference + 928 against 2 * 928
+        ja .L_2TAG_PACKET_4.0.2             # difference outside [-928, 928]: use |x| + |y|
+        cmpl $49056, %eax                   # 49056 = 2 * 24528
+        jb .L_2TAG_PACKET_1.0.2             # sum of top words below that: plain path
+        fldl 160(%esp)                      # same x87 sequence as the plain path
+        fldl 168(%esp)
+        fxch %st(1)
+        fmul %st(0), %st
+        fxch %st(1)
+        nop
+        fmul %st(0), %st
+        faddp %st, %st(1)
+        fsqrt                               # st0 = sqrt(x*x + y*y), then checked below
+.L_2TAG_PACKET_5.0.2:                       # range check of the result in st0
+        fstl (%esp)                         # store st0 as a double at (%esp), no pop
+        fstpt 16(%esp)                      # store st0 as 80-bit extended at 16(%esp) and pop
+        xorl %eax, %eax
+        movw 24(%esp), %ax                  # ax = sign and exponent word of the extended value
+        cmpl $17407, %eax                   # 17407 = 0x43FF = extended bias 0x3FFF + 1024
+        jae .L_2TAG_PACKET_6.0.2            # magnitude at least 2^1024: outside the double range
+        fldl (%esp)                         # st0 = result rounded to double
+        jmp .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_4.0.2:                       # exponents far apart
+        movsd %xmm0, 32(%esp)               # spill |x|
+        movsd %xmm1, 40(%esp)               # spill |y|
+        fldl 32(%esp)
+        faddl 40(%esp)                      # st0 = |x| + |y|
+        jmp .L_2TAG_PACKET_5.0.2            # then the same range check
+.L_2TAG_PACKET_6.0.2:                       # result too large for a double
+        movl $46, %edx                      # edx = 46; not read again before the return in this file
+.L_2TAG_PACKET_8.0.2:                       # no jump in this file targets this label
+        movsd 160(%esp), %xmm0              # reload x; xmm0 is not read again before the return
+        movsd 168(%esp), %xmm1              # reload y; xmm1 is not read again before the return
+        fldl (%esp)                         # st0 = the double stored by fstl above
+        jmp .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_3.0.2:                       # at least one argument is Inf or NaN
+        shufpd $0, %xmm1, %xmm0             # xmm0 = { low: |x|, high: |y| }
+        movdqa %xmm0, %xmm2
+        movdqa 16(%ebx), %xmm3              # xmm3 = { Inf, Inf } (second 16 bytes of the table)
+        movsd %xmm0, 32(%esp)               # spill |x|
+        movsd %xmm1, 40(%esp)               # spill |y|
+        cmppd $3, %xmm0, %xmm2              # predicate 3 = unordered: lane set where the value is NaN
+        cmppd $0, %xmm0, %xmm3              # predicate 0 = equal: lane set where the value is Inf
+        movmskpd %xmm2, %edx                # edx bit 0: x is NaN, bit 1: y is NaN
+        movmskpd %xmm3, %eax                # eax bit 0: x is Inf, bit 1: y is Inf
+        testl %edx, %edx
+        je .L_2TAG_PACKET_9.0.2             # no NaN present
+        fldl 32(%esp)
+        fmull 40(%esp)                      # st0 = |x| * |y| with a NaN operand
+        testl $1, %eax
+        jne .L_2TAG_PACKET_10.0.2           # x is Inf: return |x|
+        testl $2, %eax
+        jne .L_2TAG_PACKET_11.0.2           # y is Inf: return |y|
+        jmp .L_2TAG_PACKET_2.0.2            # neither is Inf: return the product
+.L_2TAG_PACKET_9.0.2:                       # Inf without NaN
+        fldl 32(%esp)
+        faddl 40(%esp)                      # st0 = |x| + |y|
+        jmp .L_2TAG_PACKET_2.0.2
+.L_2TAG_PACKET_10.0.2:
+        fstpl 40(%esp)                      # pop the product off the x87 stack
+        fldl 32(%esp)                       # st0 = |x|
+        jmp .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_11.0.2:
+        fstpl 32(%esp)                      # pop the product off the x87 stack
+        fldl 40(%esp)                       # st0 = |y|
+        jmp .L_2TAG_PACKET_7.0.2
+.L_2TAG_PACKET_2.0.2:                       # common epilogue, result already in st0
+.L_2TAG_PACKET_7.0.2:
+        movl 96(%esp), %ebx                 # restore ebx
+        movl %ebp, %esp                     # release the local frame
+        popl %ebp
+        ret
+..B2.3:
+END(hypot)
+# -- End hypot
+
+# Start file scope ASM: hypotl is defined as a weak symbol with the same value as hypot
+.weak hypotl
+.equ hypotl, hypot
+# End file scope ASM
+        .section .rodata, "a"
+        .align 16
+        .align 16
+static_const_table:                         # read with movapd/movdqa, so 16-byte alignment is required
+        .long 4294967295                    # offset 0: 0x7FFFFFFFFFFFFFFF, low half
+        .long 2147483647                    #           high half (sign bit clear)
+        .long 4294967295                    # offset 8: same mask for the upper lane
+        .long 2147483647
+        .long 0                             # offset 16: 0x7FF0000000000000, bit pattern of +Inf
+        .long 2146435072                    #            2146435072 = 0x7FF00000
+        .long 0                             # offset 24: +Inf for the upper lane
+        .long 2146435072
+        .type static_const_table,@object
+        .size static_const_table,32
+        .data
+        .section .note.GNU-stack, ""
+# End |