51 files changed, 3244 insertions(+), 215 deletions(-)
diff --git a/libc/Android.mk b/libc/Android.mk index 6a77deb..c04a0fc 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -279,7 +279,6 @@ libc_common_src_files := \ bionic/libc_init_common.c \ bionic/logd_write.c \ bionic/md5.c \ - bionic/memmove_words.c \ bionic/pututline.c \ bionic/realpath.c \ bionic/sched_getaffinity.c \ @@ -384,14 +383,46 @@ libc_common_src_files += \ arch-arm/bionic/memset.S \ arch-arm/bionic/setjmp.S \ arch-arm/bionic/sigsetjmp.S \ - arch-arm/bionic/strlen.c.arm \ arch-arm/bionic/strcpy.S \ arch-arm/bionic/strcmp.S \ arch-arm/bionic/syscall.S \ - string/memmove.c.arm \ - string/bcopy.c \ string/strncmp.c \ unistd/socketcalls.c +ifeq ($(ARCH_ARM_HAVE_ARMV7A),true) +libc_common_src_files += arch-arm/bionic/strlen-armv7.S +else +libc_common_src_files += arch-arm/bionic/strlen.c.arm +endif + +# Check if we want a neonized version of memmove instead of the +# current ARM version +ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true) +libc_common_src_files += \ + arch-arm/bionic/memmove.S \ + bionic/memmove_words.c +else +ifneq (, $(filter true,$(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION) $(TARGET_USE_SPARROW_BIONIC_OPTIMIZATION))) + libc_common_src_files += \ + arch-arm/bionic/memmove.S + else # Other ARM + libc_common_src_files += \ + string/bcopy.c \ + string/memmove.c.arm \ + bionic/memmove_words.c + endif # !TARGET_USE_KRAIT_BIONIC_OPTIMIZATION +endif # !TARGET_USE_SCORPION_BIONIC_OPTIMIZATION + +# If the kernel supports kernel user helpers for gettimeofday, use +# that instead. +ifeq ($(KERNEL_HAS_GETTIMEOFDAY_HELPER),true) + libc_common_src_files := $(filter-out arch-arm/syscalls/gettimeofday.S,$(libc_common_src_files)) + libc_common_src_files := $(filter-out arch-arm/syscalls/clock_gettime.S,$(libc_common_src_files)) + libc_common_src_files += \ + arch-arm/bionic/gettimeofday.c \ + arch-arm/bionic/gettimeofday_syscall.S \ + arch-arm/bionic/clock_gettime.c \ + arch-arm/bionic/clock_gettime_syscall.S +endif # KERNEL_HAS_GETTIMEOFDAY_HELPER # These files need to be arm so that gdbserver # can set breakpoints in them without messing @@ -436,6 +467,7 @@ libc_common_src_files += \ arch-x86/string/strcmp_wrapper.S \ arch-x86/string/strncmp_wrapper.S \ arch-x86/string/strlen_wrapper.S \ + bionic/memmove_words.c \ string/strcpy.c \ bionic/pthread-atfork.c \ bionic/pthread-rwlocks.c \ @@ -476,6 +508,9 @@ libc_common_src_files += \ arch-mips/string/mips_strlen.c libc_common_src_files += \ + bionic/memmove_words.c + +libc_common_src_files += \ string/bcopy.c \ string/memcmp.c \ string/strcmp.c \ @@ -555,6 +590,44 @@ ifeq ($(TARGET_ARCH),arm) ifeq ($(ARCH_ARM_USE_NON_NEON_MEMCPY),true) libc_common_cflags += -DARCH_ARM_USE_NON_NEON_MEMCPY endif + + # Add in defines to activate SCORPION_NEON_OPTIMIZATION + ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true) + libc_common_cflags += -DSCORPION_NEON_OPTIMIZATION + ifeq ($(TARGET_USE_SCORPION_PLD_SET),true) + libc_common_cflags += -DPLDOFFS=$(TARGET_SCORPION_BIONIC_PLDOFFS) + libc_common_cflags += -DPLDSIZE=$(TARGET_SCORPION_BIONIC_PLDSIZE) + endif + endif + ifeq ($(TARGET_HAVE_TEGRA_ERRATA_657451),true) + libc_common_cflags += -DHAVE_TEGRA_ERRATA_657451 + endif + # Add in defines to activate KRAIT_NEON_OPTIMIZATION + ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true) + libc_common_cflags += -DKRAIT_NEON_OPTIMIZATION + ifeq ($(TARGET_USE_KRAIT_PLD_SET),true) + libc_common_cflags += -DPLDOFFS=$(TARGET_KRAIT_BIONIC_PLDOFFS) + libc_common_cflags += -DPLDTHRESH=$(TARGET_KRAIT_BIONIC_PLDTHRESH) + libc_common_cflags += 
-DPLDSIZE=$(TARGET_KRAIT_BIONIC_PLDSIZE) + libc_common_cflags += -DBBTHRESH=$(TARGET_KRAIT_BIONIC_BBTHRESH) + endif + endif + ifeq ($(TARGET_USE_SPARROW_BIONIC_OPTIMIZATION),true) + libc_common_cflags += -DSPARROW_NEON_OPTIMIZATION + endif + ifeq ($(TARGET_CORTEX_CACHE_LINE_32),true) + libc_common_cflags += -DCORTEX_CACHE_LINE_32 + endif +else # !arm + ifeq ($(TARGET_ARCH),x86) + libc_crt_target_cflags := + ifeq ($(ARCH_X86_HAVE_SSE2),true) + libc_crt_target_cflags += -DUSE_SSE2=1 + endif + ifeq ($(ARCH_X86_HAVE_SSSE3),true) + libc_crt_target_cflags += -DUSE_SSSE3=1 + endif + endif # x86 endif # !arm ifeq ($(TARGET_ARCH),x86) diff --git a/libc/arch-arm/bionic/clock_gettime.c b/libc/arch-arm/bionic/clock_gettime.c new file mode 100644 index 0000000..c2917b0 --- /dev/null +++ b/libc/arch-arm/bionic/clock_gettime.c @@ -0,0 +1,94 @@ +/* Copyright (c) 2012, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <string.h> +#include <stdint.h> +#include <time.h> +#include <sys/time.h> +#include <machine/cpu-features.h> +#include <machine/kernel_user_helper.h> + +int clock_gettime(int clk_id, struct timespec *tp) +{ + unsigned prelock, postlock; + + /* + * Check if the offset in the kernel user helper page has + * the flag set appropriately to show that this feature is + * enabled in the kernel. If not, default to the original + * clock_gettime system call. + * + * Also, if this is anything other than CLOCK_MONOTONIC, route + * to the original system call as well. 
+ */ + if ((__kuser_gtod_feature != __kuser_gtod_feature_flag) || + (clk_id != CLOCK_MONOTONIC)) + return clock_gettime_syscall(clk_id, tp); + + if (tp) { + struct gtod_t dgtod; + uint32_t nscount, cycleoffset; + uint32_t mono_sec, mono_nsec; + uint64_t cycle_delta; + + do { + prelock = __kuser_gtod_seqnum; + + dgtod.cycle_last = __kuser_gtod_cycle_last; + dgtod.mask = __kuser_gtod_mask; + dgtod.mult = __kuser_gtod_mult; + dgtod.shift = __kuser_gtod_shift; + dgtod.tv_sec = __kuser_gtod_tv_sec; + dgtod.tv_nsec = __kuser_gtod_tv_nsec; + + mono_sec = __kuser_gtod_wtm_tv_sec; + mono_nsec = __kuser_gtod_wtm_tv_nsec; + + cycleoffset = __kuser_gtod_offset; + cycleoffset += __kuser_gtod_cycle_base; + nscount = *(uint32_t *)cycleoffset; + + postlock = __kuser_gtod_seqnum; + } while (prelock != postlock); + + cycle_delta = (nscount - dgtod.cycle_last) & dgtod.mask; + dgtod.tv_nsec += (cycle_delta * dgtod.mult) >> dgtod.shift; + dgtod.tv_sec += mono_sec; + dgtod.tv_nsec += mono_nsec; + while (dgtod.tv_nsec >= NSEC_PER_SEC) { + dgtod.tv_sec += 1; + dgtod.tv_nsec -= NSEC_PER_SEC; + } + + tp->tv_sec = dgtod.tv_sec; + tp->tv_nsec = dgtod.tv_nsec; + } + + return 0; +} diff --git a/libc/arch-arm/bionic/clock_gettime_syscall.S b/libc/arch-arm/bionic/clock_gettime_syscall.S new file mode 100644 index 0000000..0b3078a --- /dev/null +++ b/libc/arch-arm/bionic/clock_gettime_syscall.S @@ -0,0 +1,42 @@ +/* Copyright (c) 2012, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <machine/asm.h> +#include <sys/linux-syscalls.h> + +ENTRY(clock_gettime_syscall) + .save {r4, r7} + stmfd sp!, {r4, r7} + ldr r7, =__NR_clock_gettime + swi #0 + movs r0, r0 + ldmfd sp!, {r4, r7} + bmi __set_syscall_errno + bx lr +END(clock_gettime_syscall) diff --git a/libc/arch-arm/bionic/gettimeofday.c b/libc/arch-arm/bionic/gettimeofday.c new file mode 100644 index 0000000..780d6e8 --- /dev/null +++ b/libc/arch-arm/bionic/gettimeofday.c @@ -0,0 +1,99 @@ +/* Copyright (c) 2012, The Linux Foundation. All rights reserved. 
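The retry loop in clock_gettime() above is a user-side seqlock read: snapshot the kernel's sequence number, copy the time data out of the helper page, and retry if the sequence changed mid-copy. A minimal sketch of the pattern in C, with an illustrative struct layout rather than the real fixed-address kuser page:

    #include <stdint.h>

    /* Illustrative layout only -- the real data lives at fixed kuser helper
     * addresses (see kernel_user_helper.h later in this diff). */
    struct time_page {
        volatile uint32_t seq;     /* bumped by the kernel on every update */
        volatile uint32_t sec;
        volatile uint32_t nsec;
    };

    static void read_time(const struct time_page *tp, uint32_t *sec, uint32_t *nsec)
    {
        uint32_t pre, post;
        do {
            pre   = tp->seq;   /* sequence number before the copy */
            *sec  = tp->sec;   /* copy out the protected fields */
            *nsec = tp->nsec;
            post  = tp->seq;   /* sequence number after the copy */
        } while (pre != post); /* a change means the kernel updated mid-read */
    }

The bionic code above does exactly this with __kuser_gtod_seqnum as the sequence word; matching before/after values guarantee a consistent snapshot without taking any lock.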
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <string.h> +#include <stdint.h> +#include <sys/time.h> +#include <machine/cpu-features.h> +#include <machine/kernel_user_helper.h> + +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + unsigned prelock, postlock; + + /* + * Check if the offset in the kernel user helper page has + * the flag set appropriately to show that this feature is + * enabled in the kernel. If not, default to the original + * gettimeofday system call. 
+ */ + if (__kuser_gtod_feature != __kuser_gtod_feature_flag) + return gettimeofday_syscall(tv, tz); + + if (tv) { + struct gtod_t dgtod; + uint32_t nscount, cycleoffset; + uint64_t cycle_delta; + uint32_t tmp = 0; + + do { + prelock = __kuser_gtod_seqnum; + + dgtod.cycle_last = __kuser_gtod_cycle_last; + dgtod.mask = __kuser_gtod_mask; + dgtod.mult = __kuser_gtod_mult; + dgtod.shift = __kuser_gtod_shift; + dgtod.tv_sec = __kuser_gtod_tv_sec; + dgtod.tv_nsec = __kuser_gtod_tv_nsec; + + cycleoffset = __kuser_gtod_offset; + cycleoffset += __kuser_gtod_cycle_base; + nscount = *(uint32_t *)cycleoffset; + + postlock = __kuser_gtod_seqnum; + } while (prelock != postlock); + + cycle_delta = (nscount - dgtod.cycle_last) & dgtod.mask; + dgtod.tv_nsec += (cycle_delta * dgtod.mult) >> dgtod.shift; + while (dgtod.tv_nsec >= NSEC_PER_SEC) { + dgtod.tv_sec += 1; + dgtod.tv_nsec -= NSEC_PER_SEC; + } + + tv->tv_sec = dgtod.tv_sec; + asm(" movw %[tmp], #0x4dd3\n\t" + " movt %[tmp], #0x1062\n\t" + " umull %[tmp], %[x], %[y], %[tmp]\n\t" + " lsr %[x], %[x], #6\n\t" : + [x] "=r" (tv->tv_usec) : + [y] "r" (dgtod.tv_nsec), [tmp] "r" (tmp) + : ); + } + + if (tz) { + do { + prelock = __kuser_gtod_seqnum; + tz->tz_minuteswest = __kuser_gtod_tz_minw; + tz->tz_dsttime = __kuser_gtod_tz_dst; + postlock = __kuser_gtod_seqnum; + } while (prelock != postlock); + } + + return 0; +} diff --git a/libc/arch-arm/bionic/gettimeofday_syscall.S b/libc/arch-arm/bionic/gettimeofday_syscall.S new file mode 100644 index 0000000..3a945e2 --- /dev/null +++ b/libc/arch-arm/bionic/gettimeofday_syscall.S @@ -0,0 +1,42 @@ +/* Copyright (c) 2012, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
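The inline asm at the end of gettimeofday() converts nanoseconds to microseconds without a divide: movw/movt materialize the magic constant 0x10624DD3 (which is ceil(2^38 / 1000)), umull keeps the high 32 bits of the product, and the final lsr #6 completes a 38-bit shift, so tv_usec = (nsec * 0x10624DD3) >> 38 = nsec / 1000. The same trick in plain C (a sketch; optimizing compilers emit this form for nsec / 1000 on their own):

    #include <stdint.h>

    /* Divide by 1000 via reciprocal multiplication: 0x10624DD3 == ceil(2^38/1000),
     * and (n * 0x10624DD3) >> 38 == n / 1000 exactly for every 32-bit n. */
    static uint32_t nsec_to_usec(uint32_t nsec)
    {
        return (uint32_t)(((uint64_t)nsec * 0x10624DD3u) >> 38);
    }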
+ * + */ + +#include <machine/asm.h> +#include <sys/linux-syscalls.h> + +ENTRY(gettimeofday_syscall) + .save {r4, r7} + stmfd sp!, {r4, r7} + ldr r7, =__NR_gettimeofday + swi #0 + movs r0, r0 + ldmfd sp!, {r4, r7} + bmi __set_syscall_errno + bx lr +END(gettimeofday_syscall) diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S index c872a51..781c4f8 100644 --- a/libc/arch-arm/bionic/memcmp.S +++ b/libc/arch-arm/bionic/memcmp.S @@ -1,5 +1,6 @@ /* - * Copyright (C) 2008 The Android Open Source Project + * Copyright (C) 2008, 2011 The Android Open Source Project + * Copyright (C) 2010 ST-Ericsson SA * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,43 +31,71 @@ #include <machine/asm.h> /* - * Optimized memcmp() for ARM9. - * This would not be optimal on XScale or ARM11, where more prefetching - * and use of PLD will be needed. - * The 2 major optimzations here are - * (1) The main loop compares 16 bytes at a time - * (2) The loads are scheduled in a way they won't stall + * Optimized memcmp() for ARM9 and Cortex-A9 */ +#if __ARM_ARCH__ >= 7 +#define __ARM_CORTEX + +#if defined(CORTEX_CACHE_LINE_32) +#define CACHE_LINE_SIZE 32 +#else +#define CACHE_LINE_SIZE 64 +#endif + +#endif /* __ARM_ARCH__ */ + ENTRY(memcmp) +#if defined(__ARM_CORTEX) + pld [r0, #(CACHE_LINE_SIZE * 0)] + pld [r0, #(CACHE_LINE_SIZE * 1)] +#else PLD (r0, #0) PLD (r1, #0) +#endif /* take of the case where length is 0 or the buffers are the same */ cmp r0, r1 +#if !defined(__ARM_CORTEX) cmpne r2, #0 +#endif moveq r0, #0 bxeq lr +#if defined(__ARM_CORTEX) + pld [r1, #(CACHE_LINE_SIZE * 0)] + pld [r1, #(CACHE_LINE_SIZE * 1)] + + /* make sure we have at least 8+4 bytes, this simplify things below + * and avoid some overhead for small blocks + */ + cmp r2, #(8+4) + bmi 10f +#endif /* __ARM_CORTEX */ + .save {r4, lr} /* save registers */ stmfd sp!, {r4, lr} - + +#if !defined(__ARM_CORTEX) PLD (r0, #32) PLD (r1, #32) +#endif /* since r0 hold the result, move the first source * pointer somewhere else */ mov r4, r0 - + +#if !defined(__ARM_CORTEX) /* make sure we have at least 8+4 bytes, this simplify things below * and avoid some overhead for small blocks */ cmp r2, #(8+4) bmi 8f - +#endif + /* align first pointer to word boundary * offset = -src & 3 */ @@ -103,8 +132,14 @@ ENTRY(memcmp) subs r2, r2, #(32 + 4) bmi 1f -0: PLD (r4, #64) +0: +#if defined(__ARM_CORTEX) + pld [r4, #(CACHE_LINE_SIZE * 2)] + pld [r1, #(CACHE_LINE_SIZE * 2)] +#else + PLD (r4, #64) PLD (r1, #64) +#endif ldr r0, [r4], #4 ldr lr, [r1, #4]! eors r0, r0, ip @@ -170,6 +205,22 @@ ENTRY(memcmp) 9: /* restore registers and return */ ldmfd sp!, {r4, lr} bx lr + +#if defined(__ARM_CORTEX) +10: /* process less than 12 bytes */ + cmp r2, #0 + moveq r0, #0 + bxeq lr + mov r3, r0 +11: + ldrb r0, [r3], #1 + ldrb ip, [r1], #1 + subs r0, ip + bxne lr + subs r2, r2, #1 + bne 11b + bx lr +#endif /* __ARM_CORTEX */ END(memcmp) @@ -192,8 +243,14 @@ END(memcmp) bic r1, r1, #3 ldr lr, [r1], #4 -6: PLD (r1, #64) +6: +#if defined(__ARM_CORTEX) + pld [r1, #(CACHE_LINE_SIZE * 2)] + pld [r4, #(CACHE_LINE_SIZE * 2)] +#else + PLD (r1, #64) PLD (r4, #64) +#endif mov ip, lr, lsr #16 ldr lr, [r1], #4 ldr r0, [r4], #4 diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S index 8453cc0..80f1bf5 100644 --- a/libc/arch-arm/bionic/memcpy.S +++ b/libc/arch-arm/bionic/memcpy.S @@ -2,6 +2,8 @@ * Copyright (C) 2008 The Android Open Source Project * All rights reserved. 
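The gettimeofday_syscall/clock_gettime_syscall stubs earlier in this diff share one error convention: the kernel returns -errno in r0, movs sets the condition flags, and a negative result branches to __set_syscall_errno. A C-level sketch of that tail (illustrative; the real stubs do this in two instructions):

    #include <errno.h>

    /* What the 'movs r0, r0 / bmi __set_syscall_errno' tail amounts to:
     * fold a negative kernel return into errno and return -1. */
    static long finish_syscall(long raw)
    {
        if (raw < 0) {
            errno = (int)-raw;   /* e.g. -EINVAL becomes errno == EINVAL */
            return -1;
        }
        return raw;
    }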
* + * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -30,6 +32,396 @@ #include <machine/asm.h> #if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY) +#if defined(KRAIT_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold> + */ +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#ifndef BBTHRESH +#define BBTHRESH (4096/64) +#endif +#if (PLDOFFS < 1) +#error Routine does not support offsets less than 1 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif +#define NOP_OPCODE (0xe320f000) + + .text + .fpu neon + .global memcpy + .type memcpy, %function + .align 5 +memcpy: + stmfd sp!, {r0, r9, r10, lr} + cmp r2, #4 + blt .Lneon_lt4 + cmp r2, #16 + blt .Lneon_lt16 + cmp r2, #32 + blt .Lneon_16 + cmp r2, #64 + blt .Lneon_copy_32_a + + mov r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .Lneon_copy_64_loop_nopld + + cmp r12, #BBTHRESH + ble .Lneon_prime_pump + + add lr, r0, #0x400 + add r9, r1, #(PLDOFFS*PLDSIZE) + sub lr, lr, r9 + lsl lr, lr, #21 + lsr lr, lr, #21 + add lr, lr, #(PLDOFFS*PLDSIZE) + cmp r12, lr, lsr #6 + movle lr, #(PLDOFFS*PLDSIZE) + + movgt r9, #(PLDOFFS) + rsbgts r9, r9, lr, lsr #6 + ble .Lneon_prime_pump + + add r10, r1, lr + bic r10, #0x3F + + sub r12, lr, lsr #6 + cmp r9, r12 + suble r12, r12, r9 + movgt r9, r12 + movgt r12, #0 + + pld [r1, #((PLDOFFS-1)*PLDSIZE)] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_64_loop_outer_doublepld: + pld [r1, #((PLDOFFS)*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r9, r9, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .Lneon_copy_64_loop_outer_doublepld + cmp r12, #0 + bne .Lneon_copy_64_loop_outer + mov r12, lr, lsr #6 + b .Lneon_copy_64_loop_nopld + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_prime_pump: + mov lr, #(PLDOFFS*PLDSIZE) + add r10, r1, #(PLDOFFS*PLDSIZE) + bic r10, #0x3F + sub r12, r12, #PLDOFFS + pld [r10, #(-1*PLDSIZE)] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_64_loop_outer: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .Lneon_copy_64_loop_outer + mov r12, lr, lsr #6 + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_64_loop_nopld + ands r2, r2, #0x3f + beq .Lneon_exit + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_32_a: + movs r12, r2, lsl #27 + bcc .Lneon_16 + vld1.32 {q0,q1}, [r1]! + vst1.32 {q0,q1}, [r0]! + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_16: + bpl .Lneon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! 
+ ands r2, r2, #0x0f + beq .Lneon_exit + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_lt16: + movs r12, r2, lsl #29 + ldrcs r3, [r1], #4 + ldrcs r12, [r1], #4 + strcs r3, [r0], #4 + strcs r12, [r0], #4 + ldrmi r3, [r1], #4 + strmi r3, [r0], #4 + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_lt4: + movs r2, r2, lsl #31 + ldrcsh r3, [r1], #2 + strcsh r3, [r0], #2 + ldrmib r12, [r1] + strmib r12, [r0] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_exit: + ldmfd sp!, {r0, r9, r10, lr} + bx lr + .end +#elif defined(SCORPION_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true + * TARGET_USE_SCORPION_PLD_SET := true + * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset> + * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize> + */ +#ifndef PLDOFFS +#define PLDOFFS (6) +#endif +#ifndef PLDSIZE +#define PLDSIZE (128) /* L2 cache line size */ +#endif + .code 32 + .align 5 + .globl memcpy + .func +memcpy: + push {r0} + cmp r2, #4 + blt .Lneon_lt4 + cmp r2, #16 + blt .Lneon_lt16 + cmp r2, #32 + blt .Lneon_16 + cmp r2, #128 + blt .Lneon_copy_32_a + /* Copy blocks of 128-bytes (word-aligned) at a time*/ + /* Code below is optimized for PLDSIZE=128 only */ + mov r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #(PLDOFFS-1)*PLDSIZE] +.Lneon_copy_128_loop_outer: + pld [r1, #(PLDOFFS*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_copy_128_loop_nopld: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_exit + cmp r2, #32 + blt .Lneon_16 + nop + /* Copy blocks of 32-bytes (word aligned) at a time*/ +.Lneon_copy_32_a: + mov r12, r2, lsr #5 +.Lneon_copy_32_loop_a: + vld1.32 {q0,q1}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + bne .Lneon_copy_32_loop_a + ands r2, r2, #0x1f + beq .Lneon_exit +.Lneon_16: + subs r2, r2, #16 + blt .Lneon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! 
+ beq .Lneon_exit +.Lneon_lt16: + movs r12, r2, lsl #29 + bcc .Lneon_skip8 + ldr r3, [r1], #4 + ldr r12, [r1], #4 + str r3, [r0], #4 + str r12, [r0], #4 +.Lneon_skip8: + bpl .Lneon_lt4 + ldr r3, [r1], #4 + str r3, [r0], #4 +.Lneon_lt4: + movs r2, r2, lsl #31 + bcc .Lneon_lt2 + ldrh r3, [r1], #2 + strh r3, [r0], #2 +.Lneon_lt2: + bpl .Lneon_exit + ldrb r12, [r1] + strb r12, [r0] +.Lneon_exit: + pop {r0} + bx lr + .endfunc + .end +#else +#if defined(CORTEX_CACHE_LINE_32) + /* + *This can be enabled by setting flag + *TARGET_CORTEX_CACHE_LINE_32 in + *device/<vendor>/<board>/BoardConfig.mk + */ + .text + .fpu neon + + .global memcpy + .type memcpy, %function + .align 4 + +/* a prefetch distance of 4 cache-lines works best experimentally */ +#define CACHE_LINE_SIZE 32 +memcpy: + .fnstart + .save {r0, lr} + stmfd sp!, {r0, lr} + + /* start preloading as early as possible */ + pld [r1, #(CACHE_LINE_SIZE*0)] + pld [r1, #(CACHE_LINE_SIZE*1)] + + /* do we have at least 16-bytes to copy (needed for alignment below) */ + cmp r2, #16 + blo 5f + + /* align destination to half cache-line for the write-buffer */ + rsb r3, r0, #0 + ands r3, r3, #0xF + beq 0f + + /* copy up to 15-bytes (count in r3) */ + sub r2, r2, r3 + movs ip, r3, lsl #31 + ldrmib lr, [r1], #1 + strmib lr, [r0], #1 + ldrcsb ip, [r1], #1 + ldrcsb lr, [r1], #1 + strcsb ip, [r0], #1 + strcsb lr, [r0], #1 + movs ip, r3, lsl #29 + bge 1f + // copies 4 bytes, destination 32-bits aligned + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! +1: bcc 2f + // copies 8 bytes, destination 64-bits aligned + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0, :64]! +2: + +0: /* preload immediately the next cache line, which we may need */ + pld [r1, #(CACHE_LINE_SIZE*0)] + pld [r1, #(CACHE_LINE_SIZE*1)] + + /* make sure we have at least 128 bytes to copy */ + subs r2, r2, #128 + blo 2f + + /* preload all the cache lines we need. + * NOTE: the number of pld below depends on PREFETCH_DISTANCE, + * ideally would would increase the distance in the main loop to + * avoid the goofy code below. In practice this doesn't seem to make + * a big difference. + */ + pld [r1, #(CACHE_LINE_SIZE*2)] + pld [r1, #(CACHE_LINE_SIZE*3)] + pld [r1, #(CACHE_LINE_SIZE*4)] + + .align 3 +1: /* The main loop copies 128 bytes at a time */ + subs r2, r2, #128 + vld1.8 {d0 - d3}, [r1]! + vld1.8 {d4 - d7}, [r1]! + pld [r1, #(CACHE_LINE_SIZE*1)] + pld [r1, #(CACHE_LINE_SIZE*2)] + vld1.8 {d16 - d19}, [r1]! + vld1.8 {d20 - d23}, [r1]! + pld [r1, #(CACHE_LINE_SIZE*1)] + pld [r1, #(CACHE_LINE_SIZE*2)] + vst1.8 {d0 - d3}, [r0, :128]! + vst1.8 {d4 - d7}, [r0, :128]! + vst1.8 {d16 - d19}, [r0, :128]! + vst1.8 {d20 - d23}, [r0, :128]! + bhs 1b + +2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ + add r2, r2, #128 + subs r2, r2, #32 + blo 4f + +3: /* 32 bytes at a time. These cache lines were already preloaded */ + vld1.8 {d0 - d3}, [r1]! + subs r2, r2, #32 + vst1.8 {d0 - d3}, [r0, :128]! + bhs 3b + +4: /* less than 32 left */ + add r2, r2, #32 + tst r2, #0x10 + beq 5f + // copies 16 bytes, 128-bits aligned + vld1.8 {d0, d1}, [r1]! + vst1.8 {d0, d1}, [r0, :128]! + +5: /* copy up to 15-bytes (count in r2) */ + movs ip, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: bge 2f + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! 
+2: movs ip, r2, lsl #31 + ldrmib r3, [r1], #1 + ldrcsb ip, [r1], #1 + ldrcsb lr, [r1], #1 + strmib r3, [r0], #1 + strcsb ip, [r0], #1 + strcsb lr, [r0], #1 + + ldmfd sp!, {r0, lr} + bx lr + .fnend +#else /*!CORTEX_CACHE_LINE_32*/ .text .fpu neon @@ -165,8 +557,8 @@ ENTRY(memcpy) ldmfd sp!, {r0, lr} bx lr END(memcpy) - - +#endif /* CORTEX_CACHE_LINE_32 */ +#endif /* !SCORPION_NEON_OPTIMIZATION */ #else /* __ARM_ARCH__ < 7 */ diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S new file mode 100644 index 0000000..937d14b --- /dev/null +++ b/libc/arch-arm/bionic/memmove.S @@ -0,0 +1,526 @@ +/*************************************************************************** + Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Code Aurora nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
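All of the NEON memcpy variants above share one structure: prime the prefetcher a fixed number of cache lines ahead (PLDOFFS lines of PLDSIZE bytes), run a main loop that keeps prefetching while it copies, then drain the final lines without prefetch because those were already requested. A portable sketch of that pipeline (the tunables here are illustrative, mirroring the BoardConfig.mk knobs):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define PLDOFFS 10   /* prefetch distance, in cache lines (illustrative) */
    #define PLDSIZE 64   /* bytes per prefetched line (illustrative) */

    static void copy_lines(uint8_t *dst, const uint8_t *src, size_t nlines)
    {
        size_t i = 0;
        /* Main loop: copy line i while prefetching line i + PLDOFFS. */
        for (; i + PLDOFFS < nlines; i++) {
            __builtin_prefetch(src + (i + PLDOFFS) * PLDSIZE);
            memcpy(dst + i * PLDSIZE, src + i * PLDSIZE, PLDSIZE);
        }
        /* Drain: the last PLDOFFS lines were prefetched by earlier iterations. */
        for (; i < nlines; i++)
            memcpy(dst + i * PLDSIZE, src + i * PLDSIZE, PLDSIZE);
    }

The Krait variant additionally walks a second pointer (r10) through the source above the BBTHRESH block size, issuing plain loads to pull lines in; the sketch shows only the basic distance-based pipelining.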
+ ***************************************************************************/ + +/*************************************************************************** + * Neon memmove: Attempts to do a memmove with Neon registers if possible, + * Inputs: + * dest: The destination buffer + * src: The source buffer + * n: The size of the buffer to transfer + * Outputs: + * + ***************************************************************************/ + +#include <machine/cpu-features.h> + +#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + */ +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#if (PLDOFFS < 5) +#error Routine does not support offsets less than 5 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif +#define NOP_OPCODE (0xe320f000) + + .code 32 + .align 5 + .global memmove + .type memmove, %function + + .global _memmove_words + .type _memmove_words, %function + + .global bcopy + .type bcopy, %function + +bcopy: + mov r12, r0 + mov r0, r1 + mov r1, r12 + .balignl 64, NOP_OPCODE, 4*2 +memmove: +_memmove_words: +.Lneon_memmove_cmf: + subs r12, r0, r1 + bxeq lr + cmphi r2, r12 + bls memcpy /* Use memcpy for non-overlapping areas */ + + push {r0} + +.Lneon_back_to_front_copy: + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #4 + bgt .Lneon_b2f_gt4 + cmp r2, #0 +.Lneon_b2f_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + b .Lneon_b2f_smallcopy_loop +.Lneon_b2f_gt4: + sub r3, r0, r1 + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #64 + bge .Lneon_b2f_copy_64 + cmp r12, #32 + bge .Lneon_b2f_copy_32 + cmp r12, #8 + bge .Lneon_b2f_copy_8 + cmp r12, #4 + bge .Lneon_b2f_copy_4 + b .Lneon_b2f_copy_1 +.Lneon_b2f_copy_64: + sub r1, r1, #64 /* Predecrement */ + sub r0, r0, #64 + movs r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .Lneon_b2f_copy_64_loop_nopld + sub r12, #PLDOFFS + pld [r1, #-(PLDOFFS-5)*PLDSIZE] + pld [r1, #-(PLDOFFS-4)*PLDSIZE] + pld [r1, #-(PLDOFFS-3)*PLDSIZE] + pld [r1, #-(PLDOFFS-2)*PLDSIZE] + pld [r1, #-(PLDOFFS-1)*PLDSIZE] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_64_loop_outer: + pld [r1, #-(PLDOFFS)*PLDSIZE] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + sub r1, r1, #96 /* Post-fixup and predecrement */ + vst1.32 {q2, q3}, [r0] + sub r0, r0, #96 + bne .Lneon_b2f_copy_64_loop_outer + mov r12, #PLDOFFS + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1] + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! 
+ sub r1, r1, #96 /* Post-fixup and predecrement */ + vst1.32 {q10, q11}, [r0] + sub r0, r0, #96 + bne .Lneon_b2f_copy_64_loop_nopld + ands r2, r2, #0x3f + beq .Lneon_memmove_done + add r1, r1, #64 /* Post-fixup */ + add r0, r0, #64 + cmp r2, #32 + blt .Lneon_b2f_copy_finish +.Lneon_b2f_copy_32: + mov r12, r2, lsr #5 +.Lneon_b2f_copy_32_loop: + sub r1, r1, #32 /* Predecrement */ + sub r0, r0, #32 + vld1.32 {q0,q1}, [r1] + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0] + bne .Lneon_b2f_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_b2f_copy_finish: +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + beq .Lneon_b2f_copy_4 + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 /* Predecrement */ + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_4: + movs r12, r2, lsr #0x2 + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_4_loop: + ldr r3, [r1, #-4]! + subs r12, r12, #1 + str r3, [r0, #-4]! + bne .Lneon_b2f_copy_4_loop + ands r2, r2, #0x3 +.Lneon_b2f_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_1_loop: + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + bne .Lneon_b2f_copy_1_loop + +.Lneon_memmove_done: + pop {r0} + bx lr + + .end + +#elif defined(SCORPION_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true + * TARGET_USE_SCORPION_PLD_SET := true + * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset> + * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize> + */ +#ifndef PLDOFFS +#define PLDOFFS (6) +#endif +#ifndef PLDSIZE +#define PLDSIZE (128) /* L2 cache line size */ +#endif + + .code 32 + .align 5 + .global memmove + .type memmove, %function + + .global bcopy + .type bcopy, %function + +bcopy: + mov r12, r0 + mov r0, r1 + mov r1, r12 +memmove: + push {r0} + + /* + * The requirements for memmove state that the function should + * operate as if data were being copied from the source to a + * buffer, then to the destination. This is to allow a user + * to copy data from a source and target that overlap. + * + * We can't just do byte copies front-to-back automatically, since + * there's a good chance we may have an overlap (why else would someone + * intentionally use memmove then?). + * + * We'll break this into two parts. Front-to-back, or back-to-front + * copies. + */ +.Lneon_memmove_cmf: + cmp r0, r1 + blt .Lneon_front_to_back_copy + bgt .Lneon_back_to_front_copy + b .Lneon_memmove_done + + /* ############################################################# + * Front to Back copy + */ +.Lneon_front_to_back_copy: + /* + * For small copies, just do a quick memcpy. We can do this for + * front-to-back copies, aligned or unaligned, since we're only + * doing 1 byte at a time... + */ + cmp r2, #4 + bgt .Lneon_f2b_gt4 + cmp r2, #0 +.Lneon_f2b_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1], #1 + subs r2, r2, #1 + strb r12, [r0], #1 + b .Lneon_f2b_smallcopy_loop +.Lneon_f2b_gt4: + /* The window size is in r3. */ + sub r3, r1, r0 + /* ############################################################# + * Front to Back copy + */ + /* + * Note that we can't just route based on the size in r2. If that's + * larger than the overlap window in r3, we could potentially + * (and likely!) destroy data we're copying. 
+ */ + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #256 + bge .Lneon_f2b_copy_128 + cmp r12, #64 + bge .Lneon_f2b_copy_32 + cmp r12, #16 + bge .Lneon_f2b_copy_16 + cmp r12, #8 + bge .Lneon_f2b_copy_8 + cmp r12, #4 + bge .Lneon_f2b_copy_4 + b .Lneon_f2b_copy_1 + nop +.Lneon_f2b_copy_128: + mov r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_f2b_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #(PLDOFFS-1)*PLDSIZE] +.Lneon_f2b_copy_128_loop_outer: + pld [r1, #(PLDOFFS*PLDSIZE)] + vld1.32 {q0,q1}, [r1]! + vld1.32 {q2,q3}, [r1]! + vld1.32 {q8,q9}, [r1]! + vld1.32 {q10,q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + vst1.32 {q2,q3}, [r0]! + vst1.32 {q8,q9}, [r0]! + vst1.32 {q10,q11}, [r0]! + bne .Lneon_f2b_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_f2b_copy_128_loop_nopld: + vld1.32 {q0,q1}, [r1]! + vld1.32 {q2,q3}, [r1]! + vld1.32 {q8,q9}, [r1]! + vld1.32 {q10,q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + vst1.32 {q2,q3}, [r0]! + vst1.32 {q8,q9}, [r0]! + vst1.32 {q10,q11}, [r0]! + bne .Lneon_f2b_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_memmove_done + cmp r2, #32 + bge .Lneon_f2b_copy_32 + b .Lneon_f2b_copy_finish +.Lneon_f2b_copy_32: + mov r12, r2, lsr #5 +.Lneon_f2b_copy_32_loop: + vld1.32 {q0,q1}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + bne .Lneon_f2b_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_f2b_copy_finish: +.Lneon_f2b_copy_16: + movs r12, r2, lsr #4 + beq .Lneon_f2b_copy_8 +.Lneon_f2b_copy_16_loop: + vld1.32 {q0}, [r1]! + subs r12, r12, #1 + vst1.32 {q0}, [r0]! + bne .Lneon_f2b_copy_16_loop + ands r2, r2, #0xf + beq .Lneon_memmove_done +.Lneon_f2b_copy_8: + movs r12, r2, lsr #3 + beq .Lneon_f2b_copy_4 +.Lneon_f2b_copy_8_loop: + vld1.32 {d0}, [r1]! + subs r12, r12, #1 + vst1.32 {d0}, [r0]! + bne .Lneon_f2b_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_f2b_copy_4: + movs r12, r2, lsr #2 + beq .Lneon_f2b_copy_1 +.Lneon_f2b_copy_4_loop: + ldr r3, [r1], #4 + subs r12, r12, #1 + str r3, [r0], #4 + bne .Lneon_f2b_copy_4_loop + ands r2, r2, #0x3 + nop +.Lneon_f2b_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done +.Lneon_f2b_copy_1_loop: + ldrb r12, [r1], #1 + subs r2, r2, #1 + strb r12, [r0], #1 + bne .Lneon_f2b_copy_1_loop +.Lneon_f2b_finish: + b .Lneon_memmove_done + + /* ############################################################# + * Back to Front copy + */ +.Lneon_back_to_front_copy: + /* + * Here, we'll want to shift to the end of the buffers. This + * actually points us one past where we need to go, but since + * we'll pre-decrement throughout, this will be fine. + */ + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #4 + bgt .Lneon_b2f_gt4 + cmp r2, #0 +.Lneon_b2f_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + b .Lneon_b2f_smallcopy_loop +.Lneon_b2f_gt4: + /* + * The minimum of the overlap window size and the copy size + * is in r3. 
+ */ + sub r3, r0, r1 + /* + * ############################################################# + * Back to Front copy - + */ + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #256 + bge .Lneon_b2f_copy_128 + cmp r12, #64 + bge .Lneon_b2f_copy_32 + cmp r12, #8 + bge .Lneon_b2f_copy_8 + cmp r12, #4 + bge .Lneon_b2f_copy_4 + b .Lneon_b2f_copy_1 + nop +.Lneon_b2f_copy_128: + movs r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_b2f_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #-(PLDOFFS-1)*PLDSIZE] +.Lneon_b2f_copy_128_loop_outer: + pld [r1, #-(PLDOFFS*PLDSIZE)] + sub r1, r1, #128 + sub r0, r0, #128 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + sub r1, r1, #128 + sub r0, r0, #128 + bne .Lneon_b2f_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_b2f_copy_128_loop_nopld: + sub r1, r1, #128 + sub r0, r0, #128 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + sub r1, r1, #128 + sub r0, r0, #128 + bne .Lneon_b2f_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_memmove_done + cmp r2, #32 + bge .Lneon_b2f_copy_32 + b .Lneon_b2f_copy_finish +.Lneon_b2f_copy_32: + mov r12, r2, lsr #5 +.Lneon_b2f_copy_32_loop: + sub r1, r1, #32 + sub r0, r0, #32 + vld1.32 {q0,q1}, [r1] + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0] + bne .Lneon_b2f_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_b2f_copy_finish: +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + beq .Lneon_b2f_copy_4 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_4: + movs r12, r2, lsr #0x2 + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_4_loop: + ldr r3, [r1, #-4]! + subs r12, r12, #1 + str r3, [r0, #-4]! + bne .Lneon_b2f_copy_4_loop + ands r2, r2, #0x3 + nop +.Lneon_b2f_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done +.Lneon_b2f_copy_1_loop: + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + bne .Lneon_b2f_copy_1_loop + +.Lneon_memmove_done: + pop {r0} + bx lr + + .end +#endif /* SCORPION_NEON_OPTIMIZATION */ + diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S index 273b9e3..c386e7e 100644 --- a/libc/arch-arm/bionic/memset.S +++ b/libc/arch-arm/bionic/memset.S @@ -2,6 +2,8 @@ * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * + * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. 
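Both memmove implementations above open the same way: compute dst - src, and if that unsigned distance is at least the copy length a forward copy cannot clobber unread source bytes, so they tail-call memcpy; otherwise they copy back-to-front (clamping each pass to the overlap window). The dispatch in C (a sketch of the entry logic only, not the windowed NEON copies):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Entry dispatch equivalent to 'subs r12, r0, r1; cmphi r2, r12; bls memcpy':
     * unsigned wraparound makes dst < src look like a huge distance, which is
     * exactly the forward-safe case. (Subtracting pointers into unrelated
     * buffers is technically UB in C; the asm works on raw integers, which is
     * why this is only a sketch.) */
    static void *memmove_sketch(void *dstv, const void *srcv, size_t n)
    {
        uint8_t *dst = dstv;
        const uint8_t *src = srcv;

        if ((size_t)(dst - src) >= n)
            return memcpy(dst, src, n);  /* no destructive overlap */
        while (n--)
            dst[n] = src[n];             /* overlap: copy back-to-front */
        return dstv;
    }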
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -27,6 +29,90 @@ */ #include <machine/asm.h> + +#if( defined(SCORPION_NEON_OPTIMIZATION) || defined(CORTEX_CACHE_LINE_32)) + .code 32 + .align 8 + .global memset + .type memset, %function + + .global bzero + .type bzero, %function + +bzero: + mov r2, r1 + mov r1, #0 +memset: + push {r0} + + cmp r2, #6 + bgt .Lmemset_gt6 + cmp r2, #0 + beq .Lmemset_smallcopy_done +.Lmemset_smallcopy_loop: + strb r1, [r0], #1 + subs r2, r2, #1 + bne .Lmemset_smallcopy_loop +.Lmemset_smallcopy_done: + pop {r0} + bx lr + +.Lmemset_gt6: + vdup.8 q0, r1 + vmov r1, s0 + + /* + * Decide where to route for the maximum copy sizes. + */ + cmp r2, #4 + blt .Lmemset_lt4 + cmp r2, #16 + blt .Lmemset_lt16 + vmov q1, q0 + cmp r2, #128 + blt .Lmemset_32 +.Lmemset_128: + mov r12, r2, lsr #7 +.Lmemset_128_loop: + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + subs r12, r12, #1 + bne .Lmemset_128_loop + ands r2, r2, #0x7f + beq .Lmemset_end +.Lmemset_32: + movs r12, r2, lsr #5 + beq .Lmemset_lt32 +.Lmemset_32_loop: + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + bne .Lmemset_32_loop + ands r2, r2, #0x1f + beq .Lmemset_end +.Lmemset_lt32: + cmp r2, #16 + blt .Lmemset_lt16 + vst1.64 {q0}, [r0]! + subs r2, r2, #16 + beq .Lmemset_end +.Lmemset_lt16: + movs r12, r2, lsl #29 + strcs r1, [r0], #4 + strcs r1, [r0], #4 + strmi r1, [r0], #4 +.Lmemset_lt4: + movs r2, r2, lsl #31 + strcsh r1, [r0], #2 + strmib r1, [r0] +.Lmemset_end: + pop {r0} + bx lr + + .end +#else /* !(SCORPION_NEON_OPTIMIZATION || CORTEX_CACHE_LINE_32) */ + /* * Optimized memset() for ARM. @@ -107,3 +193,5 @@ ENTRY(memset) ldmfd sp!, {r0, r4-r7, lr} bx lr END(memset) + +#endif /* SCORPION_NEON_OPTIMIZATION */ diff --git a/libc/arch-arm/bionic/strlen-armv7.S b/libc/arch-arm/bionic/strlen-armv7.S new file mode 100644 index 0000000..125e92f --- /dev/null +++ b/libc/arch-arm/bionic/strlen-armv7.S @@ -0,0 +1,111 @@ +/* Copyright (c) 2010-2011, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Written by Dave Gilbert <david.gilbert@linaro.org> + + This strlen routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. This routine is reasonably fast for short + strings, but is probably slower than a simple implementation if all + your strings are very short */ + +@ 2011-02-08 david.gilbert@linaro.org +@ Extracted from local git 6848613a + + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + +@----------------------------------------------------------------------------------------------------------------------------- + .syntax unified + .arch armv7-a + + .thumb_func + .align 2 + .p2align 4,,15 + .global strlen + .type strlen,%function +strlen: + @ r0 = string + @ returns count of bytes in string not including terminator + mov r1, r0 + push { r4,r6 } + mvns r6, #0 @ all F + movs r4, #0 + tst r0, #7 + beq 2f + +1: + ldrb r2, [r1], #1 + tst r1, #7 @ Hit alignment yet? + cbz r2, 10f @ Exit if we found the 0 + bne 1b + + @ So we're now aligned +2: + ldmia r1!,{r2,r3} + uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r2, r4, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r3, r2, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cmp r3, #0 + beq 2b + +strlenendtmp: + @ One (or more) of the bytes we loaded was 0 - but which one? + @ r2 has the mask corresponding to the first loaded word + @ r3 has a combined mask of the two words - but if r2 was all-non 0 + @ then it's just the 2nd words + cmp r2, #0 + itte eq + moveq r2, r3 @ the end is in the 2nd word + subeq r1,r1,#3 + subne r1,r1,#7 + + @ r1 currently points to the 2nd byte of the word containing the 0 + tst r2, # CHARTSTMASK(0) @ 1st character + bne 10f + adds r1,r1,#1 + tst r2, # CHARTSTMASK(1) @ 2nd character + ittt eq + addeq r1,r1,#1 + tsteq r2, # (3<<15) @ 2nd & 3rd character + @ If not the 3rd must be the last one + addeq r1,r1,#1 + +10: + @ r0 is still at the beginning, r1 is pointing 1 byte after the terminator + sub r0, r1, r0 + subs r0, r0, #1 + pop { r4, r6 } + bx lr diff --git a/libc/arch-arm/include/machine/kernel_user_helper.h b/libc/arch-arm/include/machine/kernel_user_helper.h new file mode 100644 index 0000000..8836c50 --- /dev/null +++ b/libc/arch-arm/include/machine/kernel_user_helper.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2012, The Linux Foundation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
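The Linaro strlen above examines eight bytes per iteration: uadd8 adds 0xFF to every byte lane so the GE flags end up set exactly for the non-zero bytes, and sel then assembles a word that is 0xFF precisely where a source byte was zero. The portable form of word-at-a-time NUL detection is the classic mask expression below (a sketch; the aligned word loads rely on the same cannot-cross-a-page argument as the asm):

    #include <stddef.h>
    #include <stdint.h>

    /* Nonzero iff some byte of x is zero: subtracting 1 from each byte borrows
     * into bit 7 only when the byte was 0x00, and '& ~x' filters out bytes
     * that already had bit 7 set. */
    static inline uint32_t has_zero_byte(uint32_t x)
    {
        return (x - 0x01010101u) & ~x & 0x80808080u;
    }

    static size_t strlen_sketch(const char *s)
    {
        const char *p = s;
        while ((uintptr_t)p & 3) {          /* byte-scan up to word alignment */
            if (*p == '\0')
                return (size_t)(p - s);
            p++;
        }
        const uint32_t *w = (const uint32_t *)p;
        while (!has_zero_byte(*w))          /* then scan a word at a time */
            w++;
        p = (const char *)w;
        while (*p)                          /* locate the NUL inside the word */
            p++;
        return (size_t)(p - s);
    }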
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _ARCH_ARM_KERNEL_USER_HELPER_H +#define _ARCH_ARM_KERNEL_USER_HELPER_H + +extern int clock_gettime_syscall(int clk_id, struct timespec *tp); +extern int gettimeofday_syscall(struct timeval *tv, struct timezone *tz); + +#define __kuser_gtod_base (*(int32_t *)0xffff0f40) +#define __kuser_gtod_cycle_last (*(int32_t *)0xffff0f40) +#define __kuser_gtod_mask (*(int32_t *)0xffff0f48) +#define __kuser_gtod_mult (*(int32_t *)0xffff0f50) +#define __kuser_gtod_shift (*(int32_t *)0xffff0f54) +#define __kuser_gtod_tv_sec (*(int32_t *)0xffff0f58) +#define __kuser_gtod_tv_nsec (*(int32_t *)0xffff0f5c) + +#define __kuser_gtod_seqnum (*(int32_t *)0xffff0f28) +#define __kuser_gtod_offset (*(int32_t *)0xffff0f30) +#define __kuser_gtod_cycle_base 0xfffef000 +#define __kuser_gtod_feature (*(int32_t *)0xffff0f34) +#define __kuser_gtod_feature_flag 0xffff0f20 + +#define __kuser_gtod_wtm_tv_sec (*(int32_t *)0xffff0f38) +#define __kuser_gtod_wtm_tv_nsec (*(int32_t *)0xffff0f3c) + +#define __kuser_gtod_timezone (*(int32_t *)0xffff0f20) +#define __kuser_gtod_tz_minw (*(int32_t *)0xffff0f20) +#define __kuser_gtod_tz_dst (*(int32_t *)0xffff0f24) + +struct gtod_t { + uint64_t cycle_last; + uint64_t mask; + uint32_t mult; + uint32_t shift; + uint32_t tv_sec; + uint32_t tv_nsec; +}; + +#define NSEC_PER_SEC 1000000000L + +#endif diff --git a/libc/bionic/md5.c b/libc/bionic/md5.c index ba4aaed..02785bd 100644 --- a/libc/bionic/md5.c +++ b/libc/bionic/md5.c @@ -231,7 +231,7 @@ MD5_Update (struct md5 *m, const void *v, size_t len) } calc(m, current); #else - calc(m, (u_int32_t*)m->save); + calc(m, m->save32); #endif offset = 0; } diff --git a/libc/bionic/md5.h b/libc/bionic/md5.h index a381994..079ed84 100644 --- a/libc/bionic/md5.h +++ b/libc/bionic/md5.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995 - 2001 Kungliga Tekniska Högskolan + * Copyright (c) 1995 - 2001 Kungliga Tekniska H?gskolan * (Royal Institute of Technology, Stockholm, Sweden). * All rights reserved. 
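The md5.c change above (calc(m, m->save32)) pairs with the md5.h hunk below: instead of casting the unsigned char save[64] buffer to u_int32_t*, the struct gains a union so the same 64 bytes can be read as 16 words without the strict-aliasing violation the old cast committed. The pattern in isolation (a sketch):

    #include <stdint.h>

    /* Union-based type punning: GCC and Clang define reading a union member
     * other than the one last written, so this is the safe replacement for
     * the old '(u_int32_t *)m->save' cast. */
    union block {
        unsigned char bytes[64];
        uint32_t      words[16];
    };

    static uint32_t first_word(const union block *b)
    {
        return b->words[0];   /* bytes[0..3] reinterpreted as one word */
    }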
* @@ -40,7 +40,10 @@ struct md5 { unsigned int sz[2]; u_int32_t counter[4]; - unsigned char save[64]; + union { + unsigned char save[64]; + u_int32_t save32[16]; + }; }; typedef struct md5 MD5_CTX; diff --git a/libc/bionic/sha1.c b/libc/bionic/sha1.c index efa95a5..7384812 100644 --- a/libc/bionic/sha1.c +++ b/libc/bionic/sha1.c @@ -23,10 +23,6 @@ #include <sha1.h> #include <string.h> -#if HAVE_NBTOOL_CONFIG_H -#include "nbtool_config.h" -#endif - #if !HAVE_SHA1_H #define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) @@ -54,77 +50,16 @@ #define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); typedef union { - u_char c[64]; - u_int l[16]; + uint8_t c[64]; + uint32_t l[16]; } CHAR64LONG16; -/* old sparc64 gcc could not compile this */ -#undef SPARC64_GCC_WORKAROUND -#if defined(__sparc64__) && defined(__GNUC__) && __GNUC__ < 3 -#define SPARC64_GCC_WORKAROUND -#endif - -#ifdef SPARC64_GCC_WORKAROUND -void do_R01(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *); -void do_R2(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *); -void do_R3(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *); -void do_R4(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *); - -#define nR0(v,w,x,y,z,i) R0(*v,*w,*x,*y,*z,i) -#define nR1(v,w,x,y,z,i) R1(*v,*w,*x,*y,*z,i) -#define nR2(v,w,x,y,z,i) R2(*v,*w,*x,*y,*z,i) -#define nR3(v,w,x,y,z,i) R3(*v,*w,*x,*y,*z,i) -#define nR4(v,w,x,y,z,i) R4(*v,*w,*x,*y,*z,i) - -void -do_R01(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block) -{ - nR0(a,b,c,d,e, 0); nR0(e,a,b,c,d, 1); nR0(d,e,a,b,c, 2); nR0(c,d,e,a,b, 3); - nR0(b,c,d,e,a, 4); nR0(a,b,c,d,e, 5); nR0(e,a,b,c,d, 6); nR0(d,e,a,b,c, 7); - nR0(c,d,e,a,b, 8); nR0(b,c,d,e,a, 9); nR0(a,b,c,d,e,10); nR0(e,a,b,c,d,11); - nR0(d,e,a,b,c,12); nR0(c,d,e,a,b,13); nR0(b,c,d,e,a,14); nR0(a,b,c,d,e,15); - nR1(e,a,b,c,d,16); nR1(d,e,a,b,c,17); nR1(c,d,e,a,b,18); nR1(b,c,d,e,a,19); -} - -void -do_R2(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block) -{ - nR2(a,b,c,d,e,20); nR2(e,a,b,c,d,21); nR2(d,e,a,b,c,22); nR2(c,d,e,a,b,23); - nR2(b,c,d,e,a,24); nR2(a,b,c,d,e,25); nR2(e,a,b,c,d,26); nR2(d,e,a,b,c,27); - nR2(c,d,e,a,b,28); nR2(b,c,d,e,a,29); nR2(a,b,c,d,e,30); nR2(e,a,b,c,d,31); - nR2(d,e,a,b,c,32); nR2(c,d,e,a,b,33); nR2(b,c,d,e,a,34); nR2(a,b,c,d,e,35); - nR2(e,a,b,c,d,36); nR2(d,e,a,b,c,37); nR2(c,d,e,a,b,38); nR2(b,c,d,e,a,39); -} - -void -do_R3(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block) -{ - nR3(a,b,c,d,e,40); nR3(e,a,b,c,d,41); nR3(d,e,a,b,c,42); nR3(c,d,e,a,b,43); - nR3(b,c,d,e,a,44); nR3(a,b,c,d,e,45); nR3(e,a,b,c,d,46); nR3(d,e,a,b,c,47); - nR3(c,d,e,a,b,48); nR3(b,c,d,e,a,49); nR3(a,b,c,d,e,50); nR3(e,a,b,c,d,51); - nR3(d,e,a,b,c,52); nR3(c,d,e,a,b,53); nR3(b,c,d,e,a,54); nR3(a,b,c,d,e,55); - nR3(e,a,b,c,d,56); nR3(d,e,a,b,c,57); nR3(c,d,e,a,b,58); nR3(b,c,d,e,a,59); -} - -void -do_R4(u_int32_t *a, u_int32_t *b, u_int32_t *c, u_int32_t *d, u_int32_t *e, CHAR64LONG16 *block) -{ - nR4(a,b,c,d,e,60); nR4(e,a,b,c,d,61); nR4(d,e,a,b,c,62); nR4(c,d,e,a,b,63); - nR4(b,c,d,e,a,64); nR4(a,b,c,d,e,65); nR4(e,a,b,c,d,66); nR4(d,e,a,b,c,67); - nR4(c,d,e,a,b,68); nR4(b,c,d,e,a,69); nR4(a,b,c,d,e,70); nR4(e,a,b,c,d,71); - nR4(d,e,a,b,c,72); nR4(c,d,e,a,b,73); nR4(b,c,d,e,a,74); nR4(a,b,c,d,e,75); - nR4(e,a,b,c,d,76); nR4(d,e,a,b,c,77); 
nR4(c,d,e,a,b,78); nR4(b,c,d,e,a,79); -} -#endif - /* * Hash a single 512-bit block. This is the core of the algorithm. */ -void SHA1Transform(state, buffer) - u_int32_t state[5]; - const u_char buffer[64]; +void SHA1Transform(uint32_t state[5], const uint8_t buffer[64]) { - u_int32_t a, b, c, d, e; + uint32_t a, b, c, d, e; CHAR64LONG16 *block; #ifdef SHA1HANDSOFF @@ -148,12 +83,6 @@ void SHA1Transform(state, buffer) d = state[3]; e = state[4]; -#ifdef SPARC64_GCC_WORKAROUND - do_R01(&a, &b, &c, &d, &e, block); - do_R2(&a, &b, &c, &d, &e, block); - do_R3(&a, &b, &c, &d, &e, block); - do_R4(&a, &b, &c, &d, &e, block); -#else /* 4 rounds of 20 operations each. Loop unrolled. */ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); @@ -175,7 +104,6 @@ void SHA1Transform(state, buffer) R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); -#endif /* Add the working vars back into context.state[] */ state[0] += a; @@ -192,10 +120,8 @@ void SHA1Transform(state, buffer) /* * SHA1Init - Initialize new context */ -void SHA1Init(context) - SHA1_CTX *context; +void SHA1Init(SHA1_CTX *context) { - assert(context != 0); /* SHA1 initialization constants */ @@ -211,12 +137,9 @@ void SHA1Init(context) /* * Run your data through this. */ -void SHA1Update(context, data, len) - SHA1_CTX *context; - const u_char *data; - u_int len; +void SHA1Update(SHA1_CTX *context, const uint8_t *data, unsigned int len) { - u_int i, j; + unsigned int i, j; assert(context != 0); assert(data != 0); @@ -241,28 +164,26 @@ void SHA1Update(context, data, len) /* * Add padding and return the message digest. */ -void SHA1Final(digest, context) - u_char digest[20]; - SHA1_CTX* context; +void SHA1Final(uint8_t digest[20], SHA1_CTX *context) { - u_int i; - u_char finalcount[8]; + unsigned int i; + uint8_t finalcount[8]; assert(digest != 0); assert(context != 0); for (i = 0; i < 8; i++) { - finalcount[i] = (u_char)((context->count[(i >= 4 ? 0 : 1)] + finalcount[i] = (uint8_t)((context->count[(i >= 4 ? 
0 : 1)] >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ } - SHA1Update(context, (const u_char *)"\200", 1); + SHA1Update(context, (const uint8_t *)"\200", 1); while ((context->count[0] & 504) != 448) - SHA1Update(context, (const u_char *)"\0", 1); + SHA1Update(context, (const uint8_t *)"\0", 1); SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */ if (digest) { for (i = 0; i < 20; i++) - digest[i] = (u_char) + digest[i] = (uint8_t) ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); } } diff --git a/libc/bionic/system_properties.c b/libc/bionic/system_properties.c index caa5ca6..756ee3f 100644 --- a/libc/bionic/system_properties.c +++ b/libc/bionic/system_properties.c @@ -158,7 +158,10 @@ int __system_property_get(const char *name, char *value) static int send_prop_msg(prop_msg *msg) { struct pollfd pollfds[1]; - struct sockaddr_un addr; + union { + struct sockaddr_un addr; + struct sockaddr addr_g; + } addr; socklen_t alen; size_t namelen; int s; @@ -172,11 +175,11 @@ static int send_prop_msg(prop_msg *msg) memset(&addr, 0, sizeof(addr)); namelen = strlen(property_service_socket); - strlcpy(addr.sun_path, property_service_socket, sizeof addr.sun_path); - addr.sun_family = AF_LOCAL; + strlcpy(addr.addr.sun_path, property_service_socket, sizeof addr.addr.sun_path); + addr.addr.sun_family = AF_LOCAL; alen = namelen + offsetof(struct sockaddr_un, sun_path) + 1; - if(TEMP_FAILURE_RETRY(connect(s, (struct sockaddr *) &addr, alen)) < 0) { + if(TEMP_FAILURE_RETRY(connect(s, &addr.addr_g, alen) < 0)) { close(s); return result; } diff --git a/libc/include/errno.h b/libc/include/errno.h index e1b15c0..d3b0506 100644 --- a/libc/include/errno.h +++ b/libc/include/errno.h @@ -45,6 +45,7 @@ __BEGIN_DECLS extern int __set_errno(int error); /* internal function returning the address of the thread-specific errno */ +__attribute__((const)) extern volatile int* __errno(void); /* a macro expanding to the errno l-value */ diff --git a/libc/include/netinet/in6.h b/libc/include/netinet/in6.h index 7f3286a..ba24b6c 100644 --- a/libc/include/netinet/in6.h +++ b/libc/include/netinet/in6.h @@ -31,28 +31,28 @@ #include <linux/in6.h> #define IN6_IS_ADDR_UNSPECIFIED(a) \ - ((*(const uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[12]) == 0)) + (((a)->s6_addr32[0] == 0) && \ + ((a)->s6_addr32[1] == 0) && \ + ((a)->s6_addr32[2] == 0) && \ + ((a)->s6_addr32[3] == 0)) #define IN6_IS_ADDR_LOOPBACK(a) \ - ((*(const uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[12]) == ntohl(1))) + (((a)->s6_addr32[0] == 0) && \ + ((a)->s6_addr32[1] == 0) && \ + ((a)->s6_addr32[2] == 0) && \ + ((a)->s6_addr32[3] == ntohl(1))) #define IN6_IS_ADDR_V4COMPAT(a) \ - ((*(const uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[8]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[12]) != 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[12]) != ntohl(1))) + (((a)->s6_addr32[0] == 0) && \ + ((a)->s6_addr32[1] == 0) && \ + ((a)->s6_addr32[2] == 0) && \ + ((a)->s6_addr32[3] != 0) && \ + ((a)->s6_addr32[3] != 
ntohl(1))) #define IN6_IS_ADDR_V4MAPPED(a) \ - ((*(const uint32_t *)(const void *)(&(a)->s6_addr[0]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ - (*(const uint32_t *)(const void *)(&(a)->s6_addr[8]) == ntohl(0x0000ffff))) + (((a)->s6_addr32[0] == 0) && \ + ((a)->s6_addr32[1] == 0) && \ + ((a)->s6_addr32[2] == ntohl(0x0000ffff))) #define IN6_IS_ADDR_LINKLOCAL(a) \ (((a)->s6_addr[0] == 0xfe) && (((a)->s6_addr[1] & 0xc0) == 0x80)) @@ -65,7 +65,7 @@ (((a)->s6_addr[0] & 0xfe) == 0xfc) #define IN6_IS_ADDR_MULTICAST(a) \ - (((__const uint8_t *) (a))[0] == 0xff) + ((a)->s6_addr[0] == 0xff) #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 diff --git a/libc/include/pthread.h b/libc/include/pthread.h index 2015ac0..af0cc5f 100644 --- a/libc/include/pthread.h +++ b/libc/include/pthread.h @@ -146,6 +146,7 @@ void pthread_exit(void * retval); int pthread_join(pthread_t thid, void ** ret_val); int pthread_detach(pthread_t thid); +__attribute__((const)) pthread_t pthread_self(void); int pthread_equal(pthread_t one, pthread_t two); diff --git a/libc/include/resolv.h b/libc/include/resolv.h index 7c34012..221410d 100644 --- a/libc/include/resolv.h +++ b/libc/include/resolv.h @@ -40,7 +40,7 @@ __BEGIN_DECLS struct res_state; -extern struct __res_state *__res_state(void); +extern struct __res_state *__res_state(void) __attribute__((const)); #define _res (*__res_state()) /* Base-64 functions - because some code expects it there */ diff --git a/libc/include/sha1.h b/libc/include/sha1.h index f7ada46..adfa1fc 100644 --- a/libc/include/sha1.h +++ b/libc/include/sha1.h @@ -18,14 +18,14 @@ typedef struct { uint32_t state[5]; uint32_t count[2]; - u_char buffer[64]; + uint8_t buffer[64]; } SHA1_CTX; __BEGIN_DECLS -void SHA1Transform(uint32_t[5], const u_char[64]); +void SHA1Transform(uint32_t[5], const uint8_t[64]); void SHA1Init(SHA1_CTX *); -void SHA1Update(SHA1_CTX *, const u_char *, u_int); -void SHA1Final(u_char[SHA1_DIGEST_LENGTH], SHA1_CTX *); +void SHA1Update(SHA1_CTX *, const uint8_t *, unsigned int); +void SHA1Final(uint8_t[SHA1_DIGEST_LENGTH], SHA1_CTX *); __END_DECLS #endif /* _SYS_SHA1_H_ */ diff --git a/libc/include/string.h b/libc/include/string.h index 06e2284..2ed74e8 100644 --- a/libc/include/string.h +++ b/libc/include/string.h @@ -224,6 +224,39 @@ size_t strlen(const char *s) { return __strlen_chk(s, bos); } +__purefunc extern char* __strchr_real(const char *, int) + __asm__(__USER_LABEL_PREFIX__ "strchr"); +extern char* __strchr_chk(const char *, int, size_t); + +__BIONIC_FORTIFY_INLINE +char* strchr(const char *s, int c) { + size_t bos = __builtin_object_size(s, 0); + + // Compiler doesn't know destination size. Don't call __strchr_chk + if (bos == __BIONIC_FORTIFY_UNKNOWN_SIZE) { + return __strchr_real(s, c); + } + + return __strchr_chk(s, c, bos); +} + +__purefunc extern char* __strrchr_real(const char *, int) + __asm__(__USER_LABEL_PREFIX__ "strrchr"); +extern char* __strrchr_chk(const char *, int, size_t); + +__BIONIC_FORTIFY_INLINE +char* strrchr(const char *s, int c) { + size_t bos = __builtin_object_size(s, 0); + + // Compiler doesn't know destination size. 
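The string.h hunk above extends the file's existing FORTIFY_SOURCE scheme to strchr (and, continuing below, strrchr): when __builtin_object_size() can see the buffer's size at compile time, the call is diverted to a checked variant, added later in this patch in libc/string/strchr.c, that aborts on an over-read; otherwise the __asm__ label alias routes the call to the plain libc symbol. A minimal sketch of the checked walk, using hypothetical _demo names rather than bionic's real entry points:

```c
#include <stdio.h>
#include <stdlib.h>

/* Minimal sketch of the FORTIFY pattern above. The _demo name is
 * hypothetical; bionic's real entry points are __strchr_chk and the
 * asm-aliased __strchr_real. The checked variant scans at most s_len
 * bytes and aborts instead of reading past the buffer. */
static char *strchr_chk_demo(const char *p, int ch, size_t s_len) {
    for (;; ++p, s_len--) {
        if (s_len == 0) {
            fprintf(stderr, "*** strchr read beyond buffer ***\n");
            abort();
        }
        if (*p == (char) ch)
            return (char *) p;
        if (*p == '\0')
            return NULL;
    }
}

int main(void) {
    const char s[] = "bionic";
    printf("%s\n", strchr_chk_demo(s, 'n', sizeof(s))); /* prints "nic" */
    /* strchr_chk_demo("abc", 'x', 3) would abort: the terminator at
     * index 3 lies outside the stated 3-byte bound. */
    return 0;
}
```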
Don't call __strrchr_chk + if (bos == __BIONIC_FORTIFY_UNKNOWN_SIZE) { + return __strrchr_real(s, c); + } + + return __strrchr_chk(s, c, bos); +} + + #endif /* defined(__BIONIC_FORTIFY_INLINE) */ __END_DECLS diff --git a/libc/kernel/arch-arm/asm/unistd.h b/libc/kernel/arch-arm/asm/unistd.h index 454ed89..b3d75ca 100644 --- a/libc/kernel/arch-arm/asm/unistd.h +++ b/libc/kernel/arch-arm/asm/unistd.h @@ -466,7 +466,7 @@ #define __ARM_NR_usr32 (__ARM_NR_BASE+4) #define __ARM_NR_set_tls (__ARM_NR_BASE+5) /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ -#ifdef __ARM_EABI__ +#if defined(__ARM_EABI__) && !defined(__KERNEL__) #undef __NR_time #undef __NR_umount #undef __NR_stime diff --git a/libc/kernel/common/linux/android_pmem.h b/libc/kernel/common/linux/android_pmem.h index 8c605e4..f463807 100644 --- a/libc/kernel/common/linux/android_pmem.h +++ b/libc/kernel/common/linux/android_pmem.h @@ -29,6 +29,11 @@ #define PMEM_CONNECT _IOW(PMEM_IOCTL_MAGIC, 6, unsigned int) #define PMEM_GET_TOTAL_SIZE _IOW(PMEM_IOCTL_MAGIC, 7, unsigned int) #define PMEM_CACHE_FLUSH _IOW(PMEM_IOCTL_MAGIC, 8, unsigned int) + +#define PMEM_CLEAN_INV_CACHES _IOW(PMEM_IOCTL_MAGIC, 11, unsigned int) + +#define PMEM_ALLOCATE_ALIGNED _IOW(PMEM_IOCTL_MAGIC, 15, unsigned int) + struct android_pmem_platform_data /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ { @@ -46,4 +51,16 @@ struct pmem_region { unsigned long len; }; /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ + +struct pmem_addr { + unsigned long vaddr; + unsigned long offset; + unsigned long length; +}; + +struct pmem_allocation { + unsigned long size; + unsigned int align; +}; + #endif diff --git a/libc/kernel/common/linux/ashmem.h b/libc/kernel/common/linux/ashmem.h index e402e4e..a24d75a 100644 --- a/libc/kernel/common/linux/ashmem.h +++ b/libc/kernel/common/linux/ashmem.h @@ -47,4 +47,6 @@ struct ashmem_pin { #define ASHMEM_GET_PIN_STATUS _IO(__ASHMEMIOC, 9) /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define ASHMEM_PURGE_ALL_CACHES _IO(__ASHMEMIOC, 10) +#define ASHMEM_CACHE_FLUSH_RANGE _IO(__ASHMEMIOC, 11) + #endif diff --git a/libc/netbsd/gethnamaddr.c b/libc/netbsd/gethnamaddr.c index 9a9f6e2..055e9f2 100644 --- a/libc/netbsd/gethnamaddr.c +++ b/libc/netbsd/gethnamaddr.c @@ -653,14 +653,14 @@ gethostbyaddr(const void *addr, assert(addr != NULL); if (af == AF_INET6 && len == IN6ADDRSZ && - (IN6_IS_ADDR_LINKLOCAL((const struct in6_addr *)(const void *)uaddr) || - IN6_IS_ADDR_SITELOCAL((const struct in6_addr *)(const void *)uaddr))) { + (IN6_IS_ADDR_LINKLOCAL((const struct in6_addr *)addr) || + IN6_IS_ADDR_SITELOCAL((const struct in6_addr *)addr))) { h_errno = HOST_NOT_FOUND; return NULL; } if (af == AF_INET6 && len == IN6ADDRSZ && - (IN6_IS_ADDR_V4MAPPED((const struct in6_addr *)(const void *)uaddr) || - IN6_IS_ADDR_V4COMPAT((const struct in6_addr *)(const void *)uaddr))) { + (IN6_IS_ADDR_V4MAPPED((const struct in6_addr *)addr) || + IN6_IS_ADDR_V4COMPAT((const struct in6_addr *)addr))) { /* Unmap. 
*/ addr += IN6ADDRSZ - INADDRSZ; uaddr += IN6ADDRSZ - INADDRSZ; diff --git a/libc/netbsd/net/getaddrinfo.c b/libc/netbsd/net/getaddrinfo.c index 326b09c..bd29c5a 100644 --- a/libc/netbsd/net/getaddrinfo.c +++ b/libc/netbsd/net/getaddrinfo.c @@ -411,7 +411,10 @@ android_getaddrinfo_proxy( { int sock; const int one = 1; - struct sockaddr_un proxy_addr; + union { + struct sockaddr_un un; + struct sockaddr generic; + } proxy_addr; const char* cache_mode = getenv("ANDROID_DNS_MODE"); FILE* proxy = NULL; int success = 0; @@ -452,12 +455,12 @@ android_getaddrinfo_proxy( setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); memset(&proxy_addr, 0, sizeof(proxy_addr)); - proxy_addr.sun_family = AF_UNIX; - strlcpy(proxy_addr.sun_path, "/dev/socket/dnsproxyd", - sizeof(proxy_addr.sun_path)); + proxy_addr.un.sun_family = AF_UNIX; + strlcpy(proxy_addr.un.sun_path, "/dev/socket/dnsproxyd", + sizeof(proxy_addr.un.sun_path)); if (TEMP_FAILURE_RETRY(connect(sock, - (const struct sockaddr*) &proxy_addr, - sizeof(proxy_addr))) != 0) { + &proxy_addr.generic, + sizeof(proxy_addr.un))) != 0) { close(sock); return -1; } @@ -1547,7 +1550,7 @@ _get_scope(const struct sockaddr *addr) /* RFC 4380, section 2.6 */ #define IN6_IS_ADDR_TEREDO(a) \ - ((*(const uint32_t *)(const void *)(&(a)->s6_addr[0]) == ntohl(0x20010000))) + (((a)->s6_addr32[0]) == ntohl(0x20010000)) /* RFC 3056, section 2. */ #define IN6_IS_ADDR_6TO4(a) \ diff --git a/libc/netbsd/net/getnameinfo.c b/libc/netbsd/net/getnameinfo.c index d8ac037..da9d7e3 100644 --- a/libc/netbsd/net/getnameinfo.c +++ b/libc/netbsd/net/getnameinfo.c @@ -147,7 +147,10 @@ android_gethostbyaddr_proxy(char* nameBuf, size_t nameBufLen, const void *addr, int sock; const int one = 1; - struct sockaddr_un proxy_addr; + union { + struct sockaddr_un un; + struct sockaddr generic; + } proxy_addr; const char* cache_mode = getenv("ANDROID_DNS_MODE"); FILE* proxy = NULL; int result = -1; @@ -175,11 +178,11 @@ android_gethostbyaddr_proxy(char* nameBuf, size_t nameBufLen, const void *addr, setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); memset(&proxy_addr, 0, sizeof(proxy_addr)); - proxy_addr.sun_family = AF_UNIX; - strlcpy(proxy_addr.sun_path, "/dev/socket/dnsproxyd", - sizeof(proxy_addr.sun_path)); - if (TEMP_FAILURE_RETRY(connect(sock, (const struct sockaddr*) (void*) &proxy_addr, - sizeof(proxy_addr))) != 0) { + proxy_addr.un.sun_family = AF_UNIX; + strlcpy(proxy_addr.un.sun_path, "/dev/socket/dnsproxyd", + sizeof(proxy_addr.un.sun_path)); + if (TEMP_FAILURE_RETRY(connect(sock, &proxy_addr.generic, + sizeof(proxy_addr.un))) != 0) { close(sock); return -1; } diff --git a/libc/netbsd/resolv/res_send.c b/libc/netbsd/resolv/res_send.c index f3ee539..028ffaf 100644 --- a/libc/netbsd/resolv/res_send.c +++ b/libc/netbsd/resolv/res_send.c @@ -404,7 +404,10 @@ res_nsend(res_state statp, */ if (EXT(statp).nscount != 0) { int needclose = 0; - struct sockaddr_storage peer; + union { + struct sockaddr_storage storage; + struct sockaddr generic; + } peer; socklen_t peerlen; if (EXT(statp).nscount != statp->nscount) @@ -420,13 +423,13 @@ res_nsend(res_state statp, if (EXT(statp).nssocks[ns] == -1) continue; - peerlen = sizeof(peer); + peerlen = sizeof(peer.storage); if (getpeername(EXT(statp).nssocks[ns], - (struct sockaddr *)(void *)&peer, &peerlen) < 0) { + &peer.generic, &peerlen) < 0) { needclose++; break; } - if (!sock_eq((struct sockaddr *)(void *)&peer, + if (!sock_eq(&peer.generic, get_nsaddr(statp, (size_t)ns))) { needclose++; break; @@ -750,12 +753,15 @@ 
send_vc(res_state statp, /* Are we still talking to whom we want to talk to? */ if (statp->_vcsock >= 0 && (statp->_flags & RES_F_VC) != 0) { - struct sockaddr_storage peer; - socklen_t size = sizeof peer; + union { + struct sockaddr_storage storage; + struct sockaddr generic; + } peer; + socklen_t size = sizeof peer.storage; if (getpeername(statp->_vcsock, - (struct sockaddr *)(void *)&peer, &size) < 0 || - !sock_eq((struct sockaddr *)(void *)&peer, nsap)) { + &peer.generic, &size) < 0 || + !sock_eq(&peer.generic, nsap)) { res_nclose(statp); statp->_flags &= ~RES_F_VC; } @@ -1034,7 +1040,10 @@ send_dg(res_state statp, int nsaplen; struct timespec now, timeout, finish; fd_set dsmask; - struct sockaddr_storage from; + union { + struct sockaddr_storage storage; + struct sockaddr generic; + } from; socklen_t fromlen; int resplen, seconds, n, s; @@ -1126,9 +1135,9 @@ retry: return (0); } errno = 0; - fromlen = sizeof(from); + fromlen = sizeof(from.storage); resplen = recvfrom(s, (char*)ans, (size_t)anssiz,0, - (struct sockaddr *)(void *)&from, &fromlen); + &from.generic, &fromlen); if (resplen <= 0) { Perror(statp, stderr, "recvfrom", errno); res_nclose(statp); @@ -1162,7 +1171,7 @@ retry: goto retry; } if (!(statp->options & RES_INSECURE1) && - !res_ourserver_p(statp, (struct sockaddr *)(void *)&from)) { + !res_ourserver_p(statp, &from.generic)) { /* * response from wrong server? ignore it. * XXX - potential security hazard could diff --git a/libc/private/bionic_atomic_arm.h b/libc/private/bionic_atomic_arm.h index 275c1c9..380c143 100644 --- a/libc/private/bionic_atomic_arm.h +++ b/libc/private/bionic_atomic_arm.h @@ -124,6 +124,11 @@ __bionic_memory_barrier(void) } #endif /* !ANDROID_SMP */ +/* LDREX/STREX routines broken on ARMv6 */ +# if __ARM_ARCH__ == 6 +# define BROKEN_REX +# endif + /* Compare-and-swap, without any explicit barriers. Note that this functions * returns 0 on success, and 1 on failure. The opposite convention is typically * used on other platforms. @@ -135,7 +140,7 @@ __bionic_memory_barrier(void) * * LDREX/STREX are only available starting from ARMv6 */ -#ifdef __ARM_HAVE_LDREX_STREX +#if defined(__ARM_HAVE_LDREX_STREX) && !defined(BROKEN_REX) __ATOMIC_INLINE__ int __bionic_cmpxchg(int32_t old_value, int32_t new_value, volatile int32_t* ptr) { @@ -182,7 +187,7 @@ __bionic_cmpxchg(int32_t old_value, int32_t new_value, volatile int32_t* ptr) * ARMv6+ => use LDREX/STREX * < ARMv6 => use SWP instead. */ -#ifdef __ARM_HAVE_LDREX_STREX +#if defined(__ARM_HAVE_LDREX_STREX) && !defined(BROKEN_REX) __ATOMIC_INLINE__ int32_t __bionic_swap(int32_t new_value, volatile int32_t* ptr) { @@ -216,7 +221,7 @@ __bionic_swap(int32_t new_value, volatile int32_t* ptr) /* Atomic increment - without any barriers * This returns the old value */ -#ifdef __ARM_HAVE_LDREX_STREX +#if defined(__ARM_HAVE_LDREX_STREX) && !defined(BROKEN_REX) __ATOMIC_INLINE__ int32_t __bionic_atomic_inc(volatile int32_t* ptr) { @@ -250,7 +255,7 @@ __bionic_atomic_inc(volatile int32_t* ptr) /* Atomic decrement - without any barriers * This returns the old value. */ -#ifdef __ARM_HAVE_LDREX_STREX +#if defined(__ARM_HAVE_LDREX_STREX) && !defined(BROKEN_REX) __ATOMIC_INLINE__ int32_t __bionic_atomic_dec(volatile int32_t* ptr) { diff --git a/libc/private/bionic_tls.h b/libc/private/bionic_tls.h index 4658866..2456ebb 100644 --- a/libc/private/bionic_tls.h +++ b/libc/private/bionic_tls.h @@ -100,7 +100,9 @@ extern int __set_tls(void *ptr); * C library, because we don't know where the corresponding code * is going to run. 
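The system_properties.c, getaddrinfo.c, getnameinfo.c and res_send.c hunks above all apply the same fix: casting a struct sockaddr_un or sockaddr_storage to struct sockaddr* through void* is replaced by a union of the two types, so the compiler knows the two views alias and the code stays strict-aliasing clean. A compilable sketch of the idiom, with a hypothetical socket path:

```c
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

/* Minimal sketch of the union idiom used in the hunks above: the union
 * gives a struct sockaddr lvalue that legally aliases the sockaddr_un,
 * so connect() needs no pointer cast through void*. The socket path is
 * hypothetical. */
static int connect_local_demo(const char *path) {
    union {
        struct sockaddr_un un;
        struct sockaddr generic;
    } addr;
    int s = socket(AF_UNIX, SOCK_STREAM, 0);
    if (s < 0)
        return -1;

    memset(&addr, 0, sizeof(addr));
    addr.un.sun_family = AF_UNIX;
    strncpy(addr.un.sun_path, path, sizeof(addr.un.sun_path) - 1);

    if (connect(s, &addr.generic, sizeof(addr.un)) != 0) {
        close(s);
        return -1;
    }
    return s;
}

int main(void) {
    int fd = connect_local_demo("/tmp/demo.socket");
    if (fd >= 0)
        close(fd);
    return 0;
}
```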
*/ -# ifdef LIBC_STATIC +# if defined(LIBC_STATIC) || \ + (defined(__ARM_ARCH_6__) && defined(HAVE_ARM_TLS_REGISTER) && \ + !defined(__ARM_ARCH_6T2__)) /* Use the kernel helper in static C library. */ typedef volatile void* (__kernel_get_tls_t)(void); @@ -111,6 +113,12 @@ extern int __set_tls(void *ptr); * Note that HAVE_ARM_TLS_REGISTER is build-specific * (it must match your kernel configuration) */ +# ifdef HAVE_TEGRA_ERRATA_657451 +# define __munge_tls(_v) ( ((_v)&~((1ul<<20)|1ul)) | (((_v)&0x1)<<20) ) +# else +# define __munge_tls(_v) (_v) +#endif + # ifdef HAVE_ARM_TLS_REGISTER /* We can read the address directly from a coprocessor * register, which avoids touching the data cache @@ -119,6 +127,7 @@ extern int __set_tls(void *ptr); # define __get_tls() \ ({ register unsigned int __val asm("r0"); \ asm ("mrc p15, 0, r0, c13, c0, 3" : "=r"(__val) ); \ + __val = __munge_tls(__val); \ (volatile void*)__val; }) # else /* !HAVE_ARM_TLS_REGISTER */ /* The kernel provides the address of the TLS at a fixed diff --git a/libc/private/logd.h b/libc/private/logd.h index c81a91a..26878ba 100644 --- a/libc/private/logd.h +++ b/libc/private/logd.h @@ -29,6 +29,7 @@ #define _ANDROID_BIONIC_LOGD_H #include <stdarg.h> +#include <stdint.h> #define BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW 80100 #define BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW 80105 diff --git a/libc/string/strchr.c b/libc/string/strchr.c index 9b4332c..44516ef 100644 --- a/libc/string/strchr.c +++ b/libc/string/strchr.c @@ -29,11 +29,17 @@ */ #include <string.h> +#include <private/logd.h> char * -strchr(const char *p, int ch) +__strchr_chk(const char *p, int ch, size_t s_len) { - for (;; ++p) { + for (;; ++p, s_len--) { + if (s_len == 0) { + __libc_android_log_print(ANDROID_LOG_FATAL, "libc", + "*** FORTIFY_SOURCE strchr read beyond buffer ***\n"); + abort(); + } if (*p == (char) ch) return((char *)p); if (!*p) @@ -41,3 +47,8 @@ strchr(const char *p, int ch) } /* NOTREACHED */ } + +char * +strchr(const char *p, int ch) { + return __strchr_chk(p, ch, (size_t) -1); +} diff --git a/libc/string/strrchr.c b/libc/string/strrchr.c index 10c07e6..fc3dc4e 100644 --- a/libc/string/strrchr.c +++ b/libc/string/strrchr.c @@ -29,13 +29,19 @@ */ #include <string.h> +#include <private/logd.h> char * -strrchr(const char *p, int ch) +__strrchr_chk(const char *p, int ch, size_t s_len) { char *save; - for (save = NULL;; ++p) { + for (save = NULL;; ++p, s_len--) { + if (s_len == 0) { + __libc_android_log_print(ANDROID_LOG_FATAL, "libc", + "*** FORTIFY_SOURCE strrchr read beyond buffer ***\n"); + abort(); + } if (*p == (char) ch) save = (char *)p; if (!*p) @@ -43,3 +49,9 @@ strrchr(const char *p, int ch) } /* NOTREACHED */ } + +char * +strrchr(const char *p, int ch) +{ + return __strrchr_chk(p, ch, (size_t) -1); +} diff --git a/libc/tools/zoneinfo/ZoneCompactor.java b/libc/tools/zoneinfo/ZoneCompactor.java index b657748..cc77c94 100644 --- a/libc/tools/zoneinfo/ZoneCompactor.java +++ b/libc/tools/zoneinfo/ZoneCompactor.java @@ -55,11 +55,13 @@ public class ZoneCompactor { InputStream in = new FileInputStream(inFile); byte[] buf = new byte[8192]; + int length = 0; while (true) { int nbytes = in.read(buf); if (nbytes == -1) { break; } + length += nbytes; out.write(buf, 0, nbytes); byte[] nret = new byte[ret.length + nbytes]; @@ -67,6 +69,8 @@ public class ZoneCompactor { System.arraycopy(buf, 0, nret, ret.length, nbytes); ret = nret; } + if (length%4 != 0) + out.write(new byte[] {00,00,00,00}, 0, 4 - length % 4); out.flush(); return ret; } @@ -105,6 +109,9 @@ public 
class ZoneCompactor { lengths.put(s, new Integer((int)length)); start += length; + if (start % 4 != 0) + start += 4 - start % 4; + byte[] data = copyFile(f, zoneInfo); TimeZone tz = ZoneInfo.make(s, data); diff --git a/libc/tools/zoneinfo/generate b/libc/tools/zoneinfo/generate index ab2617f..7017e90 100755 --- a/libc/tools/zoneinfo/generate +++ b/libc/tools/zoneinfo/generate @@ -92,7 +92,7 @@ def upgrade_to(ftp, filename): subprocess.check_call(['javac', '-d', '.', '%s/ZoneCompactor.java' % bionic_libc_tools_zoneinfo_dir, '%s/ZoneInfo.java' % bionic_libc_tools_zoneinfo_dir]) - subprocess.check_call(['java', 'ZoneCompactor', 'setup', 'data']) + subprocess.check_call(['java', '-classpath', '.', 'ZoneCompactor', 'setup', 'data']) print 'Updating bionic from %s to %s...' % (current_tzdata_version(), version) # Move the .dat and .idx files... @@ -116,7 +116,8 @@ ftp.cwd('tz/releases') tzdata_filenames = [] for filename in ftp.nlst(): if filename.startswith('tzdata20'): - tzdata_filenames.append(filename) + if filename.endswith('tar.gz'): + tzdata_filenames.append(filename) tzdata_filenames.sort() # If you're several releases behind, we'll walk you through the upgrades one by one. diff --git a/libc/unistd/getopt_long.c b/libc/unistd/getopt_long.c index dbdf01a..0b8181a 100644 --- a/libc/unistd/getopt_long.c +++ b/libc/unistd/getopt_long.c @@ -100,12 +100,12 @@ static int nonopt_start = -1; /* first non option argument (for permute) */ static int nonopt_end = -1; /* first option after non options (for permute) */ /* Error messages */ -static const char recargchar[] = "option requires an argument -- %c"; -static const char recargstring[] = "option requires an argument -- %s"; -static const char ambig[] = "ambiguous option -- %.*s"; -static const char noarg[] = "option doesn't take an argument -- %.*s"; -static const char illoptchar[] = "unknown option -- %c"; -static const char illoptstring[] = "unknown option -- %s"; +static const char recargchar[] = "option requires an argument -- %c\n"; +static const char recargstring[] = "option requires an argument -- %s\n"; +static const char ambig[] = "ambiguous option -- %.*s\n"; +static const char noarg[] = "option doesn't take an argument -- %.*s\n"; +static const char illoptchar[] = "unknown option -- %c\n"; +static const char illoptstring[] = "unknown option -- %s\n"; /* * Compute the greatest common divisor of a and b. 
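The ZoneCompactor.java changes above pad both the copied file data and the recorded start offsets to 4-byte boundaries, so 32-bit fields in the concatenated zoneinfo.dat can be read with aligned loads once the file is mmap'd. The offset arithmetic, restated as a small C check:

```c
#include <assert.h>
#include <stddef.h>

/* Sketch of the alignment arithmetic used in ZoneCompactor above:
 * round an offset up to the next multiple of 4 so each appended
 * entry starts on a word boundary. */
static size_t align4(size_t offset) {
    if (offset % 4 != 0)
        offset += 4 - offset % 4;
    return offset;          /* equivalently: (offset + 3) & ~(size_t)3 */
}

int main(void) {
    assert(align4(0) == 0);
    assert(align4(5) == 8);
    assert(align4(8) == 8);
    return 0;
}
```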
diff --git a/libc/zoneinfo/zoneinfo.dat b/libc/zoneinfo/zoneinfo.dat Binary files differ index cb0507a..cd4b4cc 100644 --- a/libc/zoneinfo/zoneinfo.dat +++ b/libc/zoneinfo/zoneinfo.dat diff --git a/libc/zoneinfo/zoneinfo.idx b/libc/zoneinfo/zoneinfo.idx Binary files differ index c93b637..1f5f538 100644 --- a/libc/zoneinfo/zoneinfo.idx +++ b/libc/zoneinfo/zoneinfo.idx diff --git a/libc/zoneinfo/zoneinfo.version b/libc/zoneinfo/zoneinfo.version index 73bb417..0bbfa63 100644 --- a/libc/zoneinfo/zoneinfo.version +++ b/libc/zoneinfo/zoneinfo.version @@ -1 +1 @@ -2012h +2012j diff --git a/libm/Android.mk b/libm/Android.mk index 9c88798..a28f1b8 100644 --- a/libm/Android.mk +++ b/libm/Android.mk @@ -72,7 +72,6 @@ libm_common_src_files:= \ src/s_ceill.c \ src/s_copysign.c \ src/s_copysignf.c \ - src/s_cos.c \ src/s_cosf.c \ src/s_erf.c \ src/s_erff.c \ @@ -132,7 +131,6 @@ libm_common_src_files:= \ src/s_signgam.c \ src/s_significand.c \ src/s_significandf.c \ - src/s_sin.c \ src/s_sinf.c \ src/s_tan.c \ src/s_tanf.c \ @@ -162,6 +160,30 @@ ifeq ($(TARGET_ARCH),arm) src/s_scalbnf.c \ src/e_sqrtf.c + ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true) + libm_common_src_files += \ + arm/e_pow.S \ + arm/s_cos.S \ + arm/s_sin.S + libm_common_cflags += -DKRAIT_NEON_OPTIMIZATION -fno-if-conversion + else + libm_common_src_files += \ + src/s_cos.c \ + src/s_sin.c + endif + + ifeq ($(TARGET_USE_SPARROW_BIONIC_OPTIMIZATION),true) + libm_common_src_files += \ + arm/e_pow.S + libm_common_cflags += -DSPARROW_NEON_OPTIMIZATION + endif + + ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true) + libm_common_src_files += \ + arm/e_pow.S + libm_common_cflags += -DSCORPION_NEON_OPTIMIZATION + endif + libm_common_includes = $(LOCAL_PATH)/arm endif @@ -182,7 +204,9 @@ ifeq ($(TARGET_ARCH),mips) src/s_scalbln.c \ src/s_scalbn.c \ src/s_scalbnf.c \ - src/e_sqrtf.c + src/e_sqrtf.c \ + src/s_sin.c \ + src/s_cos.c libm_common_includes = $(LOCAL_PATH)/mips # Need to build *rint* functions @@ -201,6 +225,8 @@ LOCAL_ARM_MODE := arm LOCAL_C_INCLUDES += $(libm_common_includes) LOCAL_CFLAGS := $(libm_common_cflags) +LOCAL_CFLAGS:= $(libm_common_cflags) + LOCAL_MODULE:= libm LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk @@ -221,6 +247,8 @@ LOCAL_ARM_MODE := arm LOCAL_C_INCLUDES += $(libm_common_includes) LOCAL_CFLAGS := $(libm_common_cflags) +LOCAL_CFLAGS:= $(libm_common_cflags) + LOCAL_MODULE:= libm LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk diff --git a/libm/arm/e_pow.S b/libm/arm/e_pow.S new file mode 100644 index 0000000..1e328f8 --- /dev/null +++ b/libm/arm/e_pow.S @@ -0,0 +1,443 @@ +@ Copyright (c) 2012, Code Aurora Forum. All rights reserved. +@ +@ Redistribution and use in source and binary forms, with or without +@ modification, are permitted provided that the following conditions are +@ met: +@ * Redistributions of source code must retain the above copyright +@ notice, this list of conditions and the following disclaimer. +@ * Redistributions in binary form must reproduce the above +@ copyright notice, this list of conditions and the following +@ disclaimer in the documentation and/or other materials provided +@ with the distribution. +@ * Neither the name of Code Aurora Forum, Inc. nor the names of its +@ contributors may be used to endorse or promote products derived +@ from this software without specific prior written permission.
+@ +@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED +@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT +@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <machine/cpu-features.h> +#include <machine/asm.h> + +@ Values which exist the program lifetime: +#define HIGH_WORD_MASK d31 +#define EXPONENT_MASK d30 +#define int_1 d29 +#define double_1 d28 +@ sign and 2^int_n fixup: +#define expadjustment d7 +#define literals r10 +@ Values which exist within both polynomial implementations: +#define int_n d2 +#define int_n_low s4 +#define int_n_high s5 +#define double_n d3 +#define k1 d27 +#define k2 d26 +#define k3 d25 +#define k4 d24 +@ Values which cross the boundaries between polynomial implementations: +#define ss d16 +#define ss2 d17 +#define ss4 d18 +#define Result d0 +#define Return_hw r1 +#define Return_lw r0 +#define ylg2x d0 +@ Intermediate values only needed sometimes: +@ initial (sorted in approximate order of availability for overwriting): +#define x_hw r1 +#define x_lw r0 +#define y_hw r3 +#define y_lw r2 +#define x d0 +#define bp d4 +#define y d1 +@ log series: +#define u d19 +#define v d20 +#define lg2coeff d21 +#define bpa d5 +#define bpb d3 +#define lg2const d6 +#define xmantissa r8 +#define twoto1o5 r4 +#define twoto3o5 r5 +#define ix r6 +#define iEXP_MASK r7 +@ exp input setup: +#define twoto1o8mask d3 +#define twoto1o4mask d4 +#define twoto1o2mask d1 +#define ylg2x_round_offset d16 +#define ylg2x_temp d17 +#define yn_temp d18 +#define yn_round_offset d19 +#define ln2 d5 +@ Careful, overwriting HIGH_WORD_MASK, reset it if you need it again ... 
+#define rounded_exponent d31 +@ exp series: +#define k5 d23 +#define k6 d22 +#define k7 d21 +#define k8 d20 +#define ss3 d19 +@ overwrite double_1 (we're done with it by now) +#define k0 d28 +#define twoto1o4 d6 + +@instructions that gas doesn't like to encode correctly: +#define vmov_f64 fconstd +#define vmov_f32 fconsts +#define vmovne_f64 fconstdne + +ENTRY(pow_neon) +#if defined(KRAIT_NO_AAPCS_VFP_MODE) + @ ARM ABI has inputs coming in via r registers, lets move to a d register + vmov x, x_lw, x_hw +#endif + push {r4, r5, r6, r7, r8, r9, r10, lr} + + @ pre-staged bp values + vldr bpa, .LbpA + vldr bpb, .LbpB + @ load two fifths into constant term in case we need it due to offsets + vldr lg2const, .Ltwofifths + + @ bp is initially 1.0, may adjust later based on x value + vmov_f64 bp, #0x70 + + @ extract the mantissa from x for scaled value comparisons + lsl xmantissa, x_hw, #12 + + @ twoto1o5 = 2^(1/5) (input bracketing) + movw twoto1o5, #0x186c + movt twoto1o5, #0x2611 + @ twoto3o5 = 2^(3/5) (input bracketing) + movw twoto3o5, #0x003b + movt twoto3o5, #0x8406 + + @ finish extracting xmantissa + orr xmantissa, xmantissa, x_lw, lsr #20 + + @ begin preparing a mask for normalization + vmov.i64 HIGH_WORD_MASK, #0xffffffff00000000 + + @ double_1 = (double) 1.0 + vmov_f64 double_1, #0x70 + +#if defined(KRAIT_NO_AAPCS_VFP_MODE) + @ move y from r registers to a d register + vmov y, y_lw, y_hw +#endif + + cmp xmantissa, twoto1o5 + + vshl.i64 EXPONENT_MASK, HIGH_WORD_MASK, #20 + vshr.u64 int_1, HIGH_WORD_MASK, #63 + + adr literals, .LliteralTable + + bhi .Lxgt2to1over5 + @ zero out lg2 constant term if don't offset our input + vsub.f64 lg2const, lg2const, lg2const + b .Lxle2to1over5 + +.Lxgt2to1over5: + @ if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5) + vadd.f64 bp, bp, bpa + +.Lxle2to1over5: + @ will need ln2 for various things + vldr ln2, .Lln2 + + cmp xmantissa, twoto3o5 +@@@@ X Value Normalization @@@@ + + @ ss = abs(x) 2^(-1024) + vbic.i64 ss, x, EXPONENT_MASK + + @ N = (floor(log2(x)) + 0x3ff) * 2^52 + vand.i64 int_n, x, EXPONENT_MASK + + bls .Lxle2to3over5 + @ if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5) = 2^(4/5) + vadd.f64 bp, bp, bpb + vadd.f64 lg2const, lg2const, lg2const + +.Lxle2to3over5: + + @ load log2 polynomial series constants + vldm literals!, {k4, k3, k2, k1} + + @ s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1) + vorr.i64 ss, ss, double_1 + +@@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@ + + vsub.f64 u, ss, bp + vadd.f64 v, ss, bp + + @ s = (x-1)/(x+1) + vdiv.f64 ss, u, v + + @ load 2/(3log2) into lg2coeff + vldr lg2coeff, .Ltwooverthreeln2 + + @ N = floor(log2(x)) * 2^52 + vsub.i64 int_n, int_n, double_1 + +@@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@ + + @ ss2 = ((x-dp)/(x+dp))^2 + vmul.f64 ss2, ss, ss + @ ylg2x = 3.0 + vmov_f64 ylg2x, #8 + vmul.f64 ss4, ss2, ss2 + + @ todo: useful later for two-way clamp + vmul.f64 lg2coeff, lg2coeff, y + + @ N = floor(log2(x)) + vshr.s64 int_n, int_n, #52 + + @ k3 = ss^2 * L4 + L3 + vmla.f64 k3, ss2, k4 + + @ k1 = ss^2 * L2 + L1 + vmla.f64 k1, ss2, k2 + + @ scale ss by 2/(3 ln 2) + vmul.f64 lg2coeff, ss, lg2coeff + + @ ylg2x = 3.0 + s^2 + vadd.f64 ylg2x, ylg2x, ss2 + + vcvt.f64.s32 double_n, int_n_low + + @ k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1 + vmla.f64 k1, ss4, k3 + + @ add in constant term + vadd.f64 double_n, lg2const + + @ ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1) + vmla.f64 ylg2x, ss4, k1 + + @ ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 
(s^4(s^2 L4 + L3) + s^2 L2 + L1) + vmul.f64 ylg2x, lg2coeff, ylg2x + +@@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8) @@@@@ + + @ mask to extract bit 1 (2^-2 from our fixed-point representation) + vshl.u64 twoto1o4mask, int_1, #1 + + @ double_n = y * n + vmul.f64 double_n, double_n, y + + @ Load 2^(1/4) for later computations + vldr twoto1o4, .Ltwoto1o4 + + @ either add or subtract one based on the sign of double_n and ylg2x + vshr.s64 ylg2x_round_offset, ylg2x, #62 + vshr.s64 yn_round_offset, double_n, #62 + + @ move unmodified y*lg2x into temp space + vmov ylg2x_temp, ylg2x + @ compute floor(8 y * n + 1)/8 + @ and floor(8 y (log2(x)) + 1)/8 + vcvt.s32.f64 ylg2x, ylg2x, #3 + @ move unmodified y*n into temp space + vmov yn_temp, double_n + vcvt.s32.f64 double_n, double_n, #3 + + @ load exp polynomial series constants + vldm literals!, {k8, k7, k6, k5, k4, k3, k2, k1} + + @ mask to extract bit 2 (2^-1 from our fixed-point representation) + vshl.u64 twoto1o2mask, int_1, #2 + + @ make rounding offsets either 1 or -1 instead of 0 or -2 + vorr.u64 ylg2x_round_offset, ylg2x_round_offset, int_1 + vorr.u64 yn_round_offset, yn_round_offset, int_1 + + @ round up to the nearest 1/8th + vadd.s32 ylg2x, ylg2x, ylg2x_round_offset + vadd.s32 double_n, double_n, yn_round_offset + + @ clear out round-up bit for y log2(x) + vbic.s32 ylg2x, ylg2x, int_1 + @ clear out round-up bit for yn + vbic.s32 double_n, double_n, int_1 + @ add together the (fixed precision) rounded parts + vadd.s64 rounded_exponent, double_n, ylg2x + @ turn int_n into a double with value 2^int_n + vshl.i64 int_n, rounded_exponent, #49 + @ compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values: + vand.u64 twoto1o4mask, twoto1o4mask, rounded_exponent + vand.u64 twoto1o2mask, twoto1o2mask, rounded_exponent + + @ convert back into floating point, double_n now holds (double) floor(8 y * n + 1)/8 + @ ylg2x now holds (double) floor(8 y * log2(x) + 1)/8 + vcvt.f64.s32 ylg2x, ylg2x, #3 + vcvt.f64.s32 double_n, double_n, #3 + + @ put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff) + vqshl.u64 twoto1o2mask, twoto1o2mask, #62 + @ put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff) + vqshl.u64 twoto1o4mask, twoto1o4mask, #63 + + @ center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8 + vsub.f64 ylg2x_temp, ylg2x_temp, ylg2x + @ center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8 + vsub.f64 yn_temp, yn_temp, double_n + + @ Add fractional parts of yn and y log2(x) together + vadd.f64 ss, ylg2x_temp, yn_temp + + @ Result = 1.0 (offset for exp(s) series) + vmov_f64 Result, #0x70 + + @ multiply fractional part of y * log2(x) by ln(2) + vmul.f64 ss, ln2, ss + +@@@@ 10th order polynomial series for Exp(s) @@@@ + + @ ss2 = (ss)^2 + vmul.f64 ss2, ss, ss + + @ twoto1o2mask = twoto1o2mask & twoto1o4 + vand.u64 twoto1o2mask, twoto1o2mask, twoto1o4 + @ twoto1o2mask = twoto1o2mask & twoto1o4 + vand.u64 twoto1o4mask, twoto1o4mask, twoto1o4 + + @ Result = 1.0 + ss + vadd.f64 Result, Result, ss + + @ k7 = ss k8 + k7 + vmla.f64 k7, ss, k8 + + @ ss4 = (ss*ss) * (ss*ss) + vmul.f64 ss4, ss2, ss2 + + @ twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask + vorr.u64 twoto1o2mask, twoto1o2mask, double_1 + @ twoto1o2mask = twoto1o4mask | (double) 1.0 - results in either 
1.0 or 2^(1/4) in twoto1o4mask + vorr.u64 twoto1o4mask, twoto1o4mask, double_1 + + @ TODO: should setup sign here, expadjustment = 1.0 + vmov_f64 expadjustment, #0x70 + + @ ss3 = (ss*ss) * ss + vmul.f64 ss3, ss2, ss + + @ k0 = 1/2 (first non-unity coefficient) + vmov_f64 k0, #0x60 + + @ Mask out non-exponent bits to make sure we have just 2^int_n + vand.i64 int_n, int_n, EXPONENT_MASK + + @ square twoto1o2mask to get 1.0 or 2^(1/2) + vmul.f64 twoto1o2mask, twoto1o2mask, twoto1o2mask + @ multiply twoto2o4mask into the exponent output adjustment value + vmul.f64 expadjustment, expadjustment, twoto1o4mask + + @ k5 = ss k6 + k5 + vmla.f64 k5, ss, k6 + + @ k3 = ss k4 + k3 + vmla.f64 k3, ss, k4 + + @ k1 = ss k2 + k1 + vmla.f64 k1, ss, k2 + + @ multiply twoto1o2mask into exponent output adjustment value + vmul.f64 expadjustment, expadjustment, twoto1o2mask + + @ k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5 + vmla.f64 k5, ss2, k7 + + @ k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1 + vmla.f64 k1, ss2, k3 + + @ Result = 1.0 + ss + 1/2 ss^2 + vmla.f64 Result, ss2, k0 + + @ Adjust int_n so that it's a double precision value that can be multiplied by Result + vadd.i64 expadjustment, int_n, expadjustment + + @ k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 + vmla.f64 k1, ss4, k5 + + @ Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 ) + vmla.f64 Result, ss3, k1 + + @ multiply by adjustment (sign*(rounding ? sqrt(2) : 1) * 2^int_n) + vmul.f64 Result, expadjustment, Result + +.LleavePow: +#if defined(KRAIT_NO_AAPCS_VFP_MODE) + @ return Result (FP) + vmov Return_lw, Return_hw, Result +#endif +.LleavePowDirect: + @ leave directly returning whatever is in Return_lw and Return_hw + pop {r4, r5, r6, r7, r8, r9, r10, pc} + +.align 6 +.LliteralTable: +@ Least-sqares tuned constants for 11th order (log2((1+s)/(1-s)): +.LL4: @ ~3/11 + .long 0x53a79915, 0x3fd1b108 +.LL3: @ ~1/3 + .long 0x9ca0567a, 0x3fd554fa +.LL2: @ ~3/7 + .long 0x1408e660, 0x3fdb6db7 +.LL1: @ ~3/5 + .long 0x332D4313, 0x3fe33333 + +@ Least-squares tuned constants for 10th order exp(s): +.LE10: @ ~1/3628800 + .long 0x25c7ba0a, 0x3e92819b +.LE9: @ ~1/362880 + .long 0x9499b49c, 0x3ec72294 +.LE8: @ ~1/40320 + .long 0xabb79d95, 0x3efa019f +.LE7: @ ~1/5040 + .long 0x8723aeaa, 0x3f2a019f +.LE6: @ ~1/720 + .long 0x16c76a94, 0x3f56c16c +.LE5: @ ~1/120 + .long 0x11185da8, 0x3f811111 +.LE4: @ ~1/24 + .long 0x5555551c, 0x3fa55555 +.LE3: @ ~1/6 + .long 0x555554db, 0x3fc55555 + +.LbpA: @ (2^(2/5) - 1) + .long 0x4ee54db1, 0x3fd472d1 + +.LbpB: @ (2^(4/5) - 2^(2/5)) + .long 0x1c8a36cf, 0x3fdafb62 + +.Ltwofifths: @ + .long 0x9999999a, 0x3fd99999 + +.Ltwooverthreeln2: + .long 0xDC3A03FD, 0x3FEEC709 + +.Lln2: @ ln(2) + .long 0xFEFA39EF, 0x3FE62E42 + +.Ltwoto1o4: @ 2^1/4 + .long 0x0a31b715, 0x3ff306fe +END(pow) diff --git a/libm/arm/s_cos.S b/libm/arm/s_cos.S new file mode 100644 index 0000000..30a6767 --- /dev/null +++ b/libm/arm/s_cos.S @@ -0,0 +1,419 @@ +@ Copyright (c) 2012, The Linux Foundation. All rights reserved. +@ +@ Redistribution and use in source and binary forms, with or without +@ modification, are permitted provided that the following conditions are +@ met: +@ * Redistributions of source code must retain the above copyright +@ notice, this list of conditions and the following disclaimer. 
+@ * Redistributions in binary form must reproduce the above +@ copyright notice, this list of conditions and the following +@ disclaimer in the documentation and/or other materials provided +@ with the distribution. +@ * Neither the name of Code Aurora Forum, Inc. nor the names of its +@ contributors may be used to endorse or promote products derived +@ from this software without specific prior written permission. +@ +@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED +@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT +@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +@ +@ Additional notices preserved for attributions purposes only. +@ +@ ==================================================== +@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +@ +@ Developed at SunSoft, a Sun Microsystems, Inc. business. +@ Permission to use, copy, modify, and distribute this +@ software is freely granted, provided that this notice +@ is preserved. +@ ==================================================== +@ +@ ==================================================== +@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +@ +@ Developed at SunPro, a Sun Microsystems, Inc. business. +@ Permission to use, copy, modify, and distribute this +@ software is freely granted, provided that this notice +@ is preserved. 
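For orientation before the long listings: the e_pow.S code above computes x**y as 2**(y*log2(x)). log2(x) comes from a polynomial in s = (x-bp)/(x+bp), with bp stepped through 1, 2^(2/5) and 2^(4/5) to keep s small, and the result is rebuilt from a 10th-order exp series after y*log2(x) is split, at 1/8 granularity, into an integer exponent and a small fraction (hence the 2^(1/4) and 2^(1/2) mask fixups). A rough C model of that flow, using plain libm calls in place of the tuned series and ignoring all special cases:

```c
#include <math.h>
#include <stdio.h>

/* Rough C model of the pow_neon flow above, with plain libm calls in
 * place of the tuned polynomial series and no special-case handling.
 * The assembly rounds y*log2(x) to the nearest 1/8 and patches the
 * fraction with 2^(1/4) and 2^(1/2) factors; this model simply splits
 * at the nearest integer. */
static double pow_model(double x, double y) {
    double t = y * log2(x);        /* series over s=(x-bp)/(x+bp) in the asm */
    double n = floor(t + 0.5);     /* lands in the result's exponent field */
    double f = t - n;              /* |f| <= 1/2, fed to the exp series */
    return ldexp(exp(f * M_LN2), (int) n);   /* 2^n * e^(f ln 2) = 2^t */
}

int main(void) {
    printf("model=%.17g libm=%.17g\n", pow_model(2.5, 1.5), pow(2.5, 1.5));
    return 0;
}
```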
+@ ==================================================== + +#include <machine/cpu-features.h> +#include <machine/asm.h> + +#define vmov_f64 fconstd + +ENTRY(cos) + push {r4, r6, r7, lr} + vmov d0, r0, r1 + mov r2, r0 + mov r3, r1 + movw r1, #0x21fb + movt r1, #0x3fe9 + mov r4, r3 + bic r3, r3, #0x80000000 + sub sp, sp, #48 + cmp r3, r1 + bgt .Lxgtpio4 + cmp r3, #0x3e400000 + bge .Lxnottiny + vcvt.s32.f64 s15, d0 + vmov r3, s15 + cmp r3, #0 + beq .Lreturnone +.Lxnottiny: + vmov.i64 d1, #0 + bl __kernel_cos +.Lleave_cos: + vmov r0, r1, d0 +.Lleave_cos_direct: + add sp, sp, #48 + pop {r4, r6, r7, pc} +.Lxgtpio4: + movw r2, #0xffff + movt r2, #0x7fef + cmp r3, r2 + bgt .LxisNaN + movw r0, #0xd97b + movt r0, #0x4002 + cmp r3, r0 + movw r2, #0x21fb + bgt .Lxge3pio4 + cmp r4, #0 + movt r2, #0x3ff9 + ble .Lsmallxisnegative + vldr d16, .Lpio2_1 + cmp r3, r2 + vsub.f64 d16, d0, d16 + beq .Lxnearpio2 + vldr d17, .Lpio2_1t +.Lfinalizesmallxremainder: + vsub.f64 d0, d16, d17 + vsub.f64 d16, d16, d0 + vstr d0, [sp, #8] + vsub.f64 d1, d16, d17 + vstr d1, [sp, #16] +.Lnmod3is1: + mov r0, #1 + bl __kernel_sin + vneg.f64 d0, d0 + b .Lleave_cos +.Lreturnone: + mov r0, #0 + movw r1, #0x0000 + movt r1, #0x3ff0 + vmov_f64 d0, #0x70 + b .Lleave_cos_direct +.LxisNaN: + vsub.f64 d0, d0, d0 + b .Lleave_cos +.Lxge3pio4: + movt r2, #0x4139 + cmp r3, r2 + bgt .Lxgigantic + vmov_f64 d3, #0x60 + vldr d2, .Linvpio2 + vldr d18, .Lpio2_1 + vabs.f64 d16, d0 + vmla.f64 d3, d16, d2 + vcvt.s32.f64 s3, d3 + vcvt.f64.s32 d17, s3 + vmov r0, s3 + cmp r0, #31 + vmls.f64 d16, d17, d18 + vldr d18, .Lpio2_1t + vmul.f64 d18, d17, d18 + bgt .Lcomputeremainder + ldr r2, .Lnpio2_hw_ptr + sub lr, r0, #1 +.LPICnpio2_hw0: + add r12, pc, r2 + ldr r1, [r12, lr, lsl #2] + cmp r3, r1 + beq .Lcomputeremainder +.Lfinishthirditeration: + vsub.f64 d0, d16, d18 + vstr d0, [sp, #8] +.Lfinishcomputingremainder: + vsub.f64 d16, d16, d0 + cmp r4, #0 + vsub.f64 d1, d16, d18 + vstr d1, [sp, #16] + blt .Lhandlenegativex +.Lselectregion: + and r0, r0, #3 + cmp r0, #1 + beq .Lnmod3is1 + cmp r0, #2 + beq .Lnmod3is2 + cmp r0, #0 + bne .Lnmod3is0 + bl __kernel_cos + b .Lleave_cos +.Lxgigantic: + asr r2, r3, #20 + vmov r6, r7, d0 + sub r2, r2, #1040 + mov r0, r6 + sub r2, r2, #6 + vldr d16, .Ltwo24 + sub r1, r3, r2, lsl #20 + vmov d18, r0, r1 + vcvt.s32.f64 s15, d18 + add r1, sp, #48 + mov r3, #3 + vcvt.f64.s32 d17, s15 + vsub.f64 d18, d18, d17 + vstr d17, [sp, #24] + vmul.f64 d18, d18, d16 + vcvt.s32.f64 s15, d18 + vcvt.f64.s32 d17, s15 + vsub.f64 d18, d18, d17 + vstr d17, [sp, #32] + vmul.f64 d16, d18, d16 + fcmpzd d16 + vstmdb r1!, {d16} + vmrs APSR_nzcv, fpscr + bne .Lprocessnonzeroterm +.Lskipzeroterms: + vldmdb r1!, {d16} + sub r3, r3, #1 + fcmpzd d16 + vmrs APSR_nzcv, fpscr + beq .Lskipzeroterms +.Lprocessnonzeroterm: + ldr r12, .Ltwo_over_pi_ptr + add r0, sp, #24 + add r1, sp, #8 +.LPICtwo_over_pi0: + add lr, pc, r12 + mov r12, #2 + str lr, [sp, #4] + str r12, [sp] + bl __kernel_rem_pio2 + cmp r4, #0 + vldr d0, [sp, #8] + blt .Lhandlenegativxalso + vldr d1, [sp, #16] + b .Lselectregion +.Lxnearpio2: + vldr d17, .Lpio2_2 + vsub.f64 d16, d16, d17 + vldr d17, .Lpio2_2t + b .Lfinalizesmallxremainder +.Lsmallxisnegative: + vldr d1, .Lpio2_1 + cmp r3, r2 + vadd.f64 d16, d0, d1 + beq .Lxnearnegpio2 + vldr d17, .Lpio2_1t +.Lfinalizesmallnegxremainder: + vadd.f64 d0, d16, d17 + vsub.f64 d16, d16, d0 + vstr d0, [sp, #8] + vadd.f64 d1, d16, d17 + vstr d1, [sp, #16] +.Lnmod3is0: + mov r0, #1 + bl __kernel_sin + b .Lleave_cos +.Lnmod3is2: + bl __kernel_cos + vneg.f64 
d0, d0 + b .Lleave_cos +.Lcomputeremainder: + vsub.f64 d0, d16, d18 + asr r1, r3, #20 + vmov r2, r3, d0 + ubfx r3, r3, #20, #11 + rsb r3, r3, r1 + vstr d0, [sp, #8] + cmp r3, #16 + ble .Lfinishcomputingremainder + vldr d18, .Lpio2_2 + vmul.f64 d20, d17, d18 + vsub.f64 d19, d16, d20 + vsub.f64 d16, d16, d19 + vsub.f64 d18, d16, d20 + vldr d16, .Lpio2_2t + vnmls.f64 d18, d17, d16 + vsub.f64 d0, d19, d18 + vmov r2, r3, d0 + ubfx r3, r3, #20, #11 + rsb r1, r3, r1 + vstr d0, [sp, #8] + cmp r1, #49 + ble .Lfinishseconditeration + vldr d5, .Lpio2_3 + vmul.f64 d20, d17, d5 + vsub.f64 d16, d19, d20 + vsub.f64 d4, d19, d16 + vldr d19, .Lpio2_3t + vsub.f64 d18, d4, d20 + vnmls.f64 d18, d17, d19 + b .Lfinishthirditeration +.Lhandlenegativex: + vneg.f64 d0, d0 + rsb r0, r0, #0 + vneg.f64 d1, d1 + vstr d0, [sp, #8] + vstr d1, [sp, #16] + b .Lselectregion +.Lfinishseconditeration: + vmov d16, d19 + b .Lfinishcomputingremainder +.Lxnearnegpio2: + vldr d0, .Lpio2_2 + vldr d17, .Lpio2_2t + vadd.f64 d16, d16, d0 + b .Lfinalizesmallnegxremainder +.Lhandlenegativxalso: + vldr d6, [sp, #16] + vneg.f64 d0, d0 + rsb r0, r0, #0 + vneg.f64 d1, d6 + vstr d0, [sp, #8] + vstr d1, [sp, #16] + b .Lselectregion + +.align 3 +.Lpio2_1: + .word 0x54400000, 0x3ff921fb +.Lpio2_1t: + .word 0x1a626331, 0x3dd0b461 +.Linvpio2: + .word 0x6dc9c883, 0x3fe45f30 +.Ltwo24: + .word 0x00000000, 0x41700000 +.Lpio2_2: + .word 0x1a600000, 0x3dd0b461 +.Lpio2_2t: + .word 0x2e037073, 0x3ba3198a +.Lpio2_3: + .word 0x2e000000, 0x3ba3198a +.Lpio2_3t: + .word 0x252049c1, 0x397b839a +.Lnpio2_hw_ptr: + .word .Lnpio2_hw-(.LPICnpio2_hw0+8) +.Ltwo_over_pi_ptr: + .word .Ltwo_over_pi-(.LPICtwo_over_pi0+8) +END(cos) + + .section .rodata.npio2_hw,"a",%progbits + .align 2 +.Lnpio2_hw = . + 0 + .type npio2_hw, %object + .size npio2_hw, 128 +npio2_hw: + .word 0x3ff921fb + .word 0x400921fb + .word 0x4012d97c + .word 0x401921fb + .word 0x401f6a7a + .word 0x4022d97c + .word 0x4025fdbb + .word 0x402921fb + .word 0x402c463a + .word 0x402f6a7a + .word 0x4031475c + .word 0x4032d97c + .word 0x40346b9c + .word 0x4035fdbb + .word 0x40378fdb + .word 0x403921fb + .word 0x403ab41b + .word 0x403c463a + .word 0x403dd85a + .word 0x403f6a7a + .word 0x40407e4c + .word 0x4041475c + .word 0x4042106c + .word 0x4042d97c + .word 0x4043a28c + .word 0x40446b9c + .word 0x404534ac + .word 0x4045fdbb + .word 0x4046c6cb + .word 0x40478fdb + .word 0x404858eb + .word 0x404921fb + + .section .rodata.two_over_pi,"a",%progbits + .align 2 +.Ltwo_over_pi = . 
+ 0 + .type two_over_pi, %object + .size two_over_pi, 264 +two_over_pi: + .word 0x00a2f983 + .word 0x006e4e44 + .word 0x001529fc + .word 0x002757d1 + .word 0x00f534dd + .word 0x00c0db62 + .word 0x0095993c + .word 0x00439041 + .word 0x00fe5163 + .word 0x00abdebb + .word 0x00c561b7 + .word 0x00246e3a + .word 0x00424dd2 + .word 0x00e00649 + .word 0x002eea09 + .word 0x00d1921c + .word 0x00fe1deb + .word 0x001cb129 + .word 0x00a73ee8 + .word 0x008235f5 + .word 0x002ebb44 + .word 0x0084e99c + .word 0x007026b4 + .word 0x005f7e41 + .word 0x003991d6 + .word 0x00398353 + .word 0x0039f49c + .word 0x00845f8b + .word 0x00bdf928 + .word 0x003b1ff8 + .word 0x0097ffde + .word 0x0005980f + .word 0x00ef2f11 + .word 0x008b5a0a + .word 0x006d1f6d + .word 0x00367ecf + .word 0x0027cb09 + .word 0x00b74f46 + .word 0x003f669e + .word 0x005fea2d + .word 0x007527ba + .word 0x00c7ebe5 + .word 0x00f17b3d + .word 0x000739f7 + .word 0x008a5292 + .word 0x00ea6bfb + .word 0x005fb11f + .word 0x008d5d08 + .word 0x00560330 + .word 0x0046fc7b + .word 0x006babf0 + .word 0x00cfbc20 + .word 0x009af436 + .word 0x001da9e3 + .word 0x0091615e + .word 0x00e61b08 + .word 0x00659985 + .word 0x005f14a0 + .word 0x0068408d + .word 0x00ffd880 + .word 0x004d7327 + .word 0x00310606 + .word 0x001556ca + .word 0x0073a8c9 + .word 0x0060e27b + .word 0x00c08c6b diff --git a/libm/arm/s_sin.S b/libm/arm/s_sin.S new file mode 100644 index 0000000..9c3366c --- /dev/null +++ b/libm/arm/s_sin.S @@ -0,0 +1,414 @@ +@ Copyright (c) 2012, The Linux Foundation. All rights reserved. +@ +@ Redistribution and use in source and binary forms, with or without +@ modification, are permitted provided that the following conditions are +@ met: +@ * Redistributions of source code must retain the above copyright +@ notice, this list of conditions and the following disclaimer. +@ * Redistributions in binary form must reproduce the above +@ copyright notice, this list of conditions and the following +@ disclaimer in the documentation and/or other materials provided +@ with the distribution. +@ * Neither the name of Code Aurora Forum, Inc. nor the names of its +@ contributors may be used to endorse or promote products derived +@ from this software without specific prior written permission. +@ +@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED +@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT +@ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS +@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +@ +@ Additional notices preserved for attributions purposes only. +@ +@ ==================================================== +@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +@ +@ Developed at SunSoft, a Sun Microsystems, Inc. business. +@ Permission to use, copy, modify, and distribute this +@ software is freely granted, provided that this notice +@ is preserved. 
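s_cos.S above and s_sin.S below share one skeleton: tiny arguments return immediately, arguments up to pi/4 go straight to the kernels, and everything else is reduced modulo pi/2, using the split pio2_1/pio2_2/pio2_3 constants for moderate inputs or __kernel_rem_pio2 with the 24-bit two_over_pi digits for huge ones, before dispatching on the quadrant n mod 4 at .Lselectregion. The dispatch, modeled in C for cosine:

```c
#include <math.h>
#include <stdio.h>

/* C model of the quadrant dispatch in .Lselectregion: once the argument
 * is reduced to r = x - n*pi/2, cos(x) is one of +-cos(r) or +-sin(r).
 * The assembly keeps the reduced argument in two doubles (head + tail)
 * for accuracy; this model keeps just one, so it is only approximate. */
static double cos_model(double x) {
    double n = floor(x / M_PI_2 + 0.5);
    double r = x - n * M_PI_2;
    switch ((long) n & 3) {
        case 0:  return  cos(r);
        case 1:  return -sin(r);
        case 2:  return -cos(r);
        default: return  sin(r);   /* n mod 4 == 3 */
    }
}

int main(void) {
    printf("model=%.17g libm=%.17g\n", cos_model(10.0), cos(10.0));
    return 0;
}
```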
+@ ==================================================== +@ +@ ==================================================== +@ Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +@ +@ Developed at SunPro, a Sun Microsystems, Inc. business. +@ Permission to use, copy, modify, and distribute this +@ software is freely granted, provided that this notice +@ is preserved. +@ ==================================================== + +#include <machine/cpu-features.h> +#include <machine/asm.h> + +#define vmov_f64 fconstd + +ENTRY(sin) + push {r4, r6, r7, lr} + vmov d0, r0, r1 + mov r2, r0 + mov r3, r1 + movw r1, #0x21fb + movt r1, #0x3fe9 + mov r4, r3 + bic r3, r3, #0x80000000 + sub sp, sp, #48 + cmp r3, r1 + bgt .Lxgtpio4 + cmp r3, #0x3e400000 + bge .Lxnottiny + vcvt.s32.f64 s15, d0 + vmov r3, s15 + cmp r3, #0 + bne .Lxnottiny +.Lleave_sin: + vmov r0, r1, d0 + add sp, sp, #48 + pop {r4, r6, r7, pc} +.Lxgtpio4: + movw r2, #0xffff + movt r2, #0x7fef + cmp r3, r2 + bgt .LxisNaN + movw r0, #0xd97b + movt r0, #0x4002 + cmp r3, r0 + movw r2, #0x21fb + bgt .Lxge3pio4 + cmp r4, #0 + movt r2, #0x3ff9 + ble .Lsmallxisnegative + vldr d16, .Lpio2_1 + cmp r3, r2 + vsub.f64 d16, d0, d16 + beq .Lxnearpio2 + vldr d17, .Lpio2_1t +.Lfinalizesmallxremainder: + vsub.f64 d0, d16, d17 + vsub.f64 d16, d16, d0 + vstr d0, [sp, #8] + vsub.f64 d1, d16, d17 + vstr d1, [sp, #16] +.Lnmod3is1: + bl __kernel_cos + b .Lleave_sin +.Lxnottiny: + vmov.i64 d1, #0 + mov r0, #0 + bl __kernel_sin + b .Lleave_sin +.LxisNaN: + vsub.f64 d0, d0, d0 + b .Lleave_sin +.Lxge3pio4: + movt r2, #0x4139 + cmp r3, r2 + bgt .Lxgigantic + vmov_f64 d3, #0x60 + vldr d2, .Linvpio2 + vldr d18, .Lpio2_1 + vabs.f64 d16, d0 + vmla.f64 d3, d16, d2 + vcvt.s32.f64 s3, d3 + vcvt.f64.s32 d17, s3 + vmov r0, s3 + cmp r0, #31 + vmls.f64 d16, d17, d18 + vldr d18, .Lpio2_1t + vmul.f64 d18, d17, d18 + bgt .Lcomputeremainder + ldr r2, .Lnpio2_hw_ptr + sub lr, r0, #1 +.LPICnpio2_hw0: + add r12, pc, r2 + ldr r1, [r12, lr, lsl #2] + cmp r3, r1 + beq .Lcomputeremainder +.Lfinishthirditeration: + vsub.f64 d0, d16, d18 + vstr d0, [sp, #8] +.Lfinishcomputingremainder: + vsub.f64 d16, d16, d0 + cmp r4, #0 + vsub.f64 d1, d16, d18 + vstr d1, [sp, #16] + blt .Lhandlenegativex +.Lselectregion: + and r0, r0, #3 + cmp r0, #1 + beq .Lnmod3is1 + cmp r0, #2 + beq .Lnmod3is2 + cmp r0, #0 + bne .Lnmod3is0 + mov r0, #1 + bl __kernel_sin + b .Lleave_sin +.Lxgigantic: + asr r2, r3, #20 + vmov r6, r7, d0 + sub r2, r2, #1040 + mov r0, r6 + sub r2, r2, #6 + vldr d16, .Ltwo24 + sub r1, r3, r2, lsl #20 + vmov d18, r0, r1 + vcvt.s32.f64 s15, d18 + add r1, sp, #48 + mov r3, #3 + vcvt.f64.s32 d17, s15 + vsub.f64 d18, d18, d17 + vstr d17, [sp, #24] + vmul.f64 d18, d18, d16 + vcvt.s32.f64 s15, d18 + vcvt.f64.s32 d17, s15 + vsub.f64 d18, d18, d17 + vstr d17, [sp, #32] + vmul.f64 d16, d18, d16 + fcmpzd d16 + vstmdb r1!, {d16} + vmrs APSR_nzcv, fpscr + bne .Lprocessnonzeroterm +.Lskipzeroterms: + vldmdb r1!, {d16} + sub r3, r3, #1 + fcmpzd d16 + vmrs APSR_nzcv, fpscr + beq .Lskipzeroterms +.Lprocessnonzeroterm: + ldr r12, .Ltwo_over_pi_ptr + add r0, sp, #24 + add r1, sp, #8 +.LPICtwo_over_pi0: + add lr, pc, r12 + mov r12, #2 + str lr, [sp, #4] + str r12, [sp] + bl __kernel_rem_pio2 + cmp r4, #0 + vldr d0, [sp, #8] + blt .Lhandlenegativexalso + vldr d1, [sp, #16] + b .Lselectregion +.Lxnearpio2: + vldr d17, .Lpio2_2 + vsub.f64 d16, d16, d17 + vldr d17, .Lpio2_2t + b .Lfinalizesmallxremainder +.Lsmallxisnegative: + vldr d1, .Lpio2_1 + cmp r3, r2 + vadd.f64 d16, d0, d1 + beq .Lxnearnegpio2 + vldr d17, 
.Lpio2_1t +.Lfinalizesmallnegxremainder: + vadd.f64 d0, d16, d17 + vsub.f64 d16, d16, d0 + vstr d0, [sp, #8] + vadd.f64 d1, d16, d17 + vstr d1, [sp, #16] +.Lnmod3is0: + bl __kernel_cos + vneg.f64 d0, d0 + b .Lleave_sin +.Lnmod3is2: + mov r0, #1 + bl __kernel_sin + vneg.f64 d0, d0 + b .Lleave_sin +.Lcomputeremainder: + vsub.f64 d0, d16, d18 + asr r1, r3, #20 + vmov r2, r3, d0 + ubfx r3, r3, #20, #11 + rsb r3, r3, r1 + vstr d0, [sp, #8] + cmp r3, #16 + ble .Lfinishcomputingremainder + vldr d18, .Lpio2_2 + vmul.f64 d20, d17, d18 + vsub.f64 d19, d16, d20 + vsub.f64 d16, d16, d19 + vsub.f64 d18, d16, d20 + vldr d16, .Lpio2_2t + vnmls.f64 d18, d17, d16 + vsub.f64 d0, d19, d18 + vmov r2, r3, d0 + ubfx r3, r3, #20, #11 + rsb r1, r3, r1 + vstr d0, [sp, #8] + cmp r1, #49 + ble .Lfinishseconditeration + vldr d5, .Lpio2_3 + vmul.f64 d20, d17, d5 + vsub.f64 d16, d19, d20 + vsub.f64 d4, d19, d16 + vldr d19, .Lpio2_3t + vsub.f64 d18, d4, d20 + vnmls.f64 d18, d17, d19 + b .Lfinishthirditeration +.Lhandlenegativex: + vneg.f64 d0, d0 + rsb r0, r0, #0 + vneg.f64 d1, d1 + vstr d0, [sp, #8] + vstr d1, [sp, #16] + b .Lselectregion +.Lfinishseconditeration: + vmov d16, d19 + b .Lfinishcomputingremainder +.Lxnearnegpio2: + vldr d0, .Lpio2_2 + vldr d17, .Lpio2_2t + vadd.f64 d16, d16, d0 + b .Lfinalizesmallnegxremainder +.Lhandlenegativexalso: + vldr d6, [sp, #16] + vneg.f64 d0, d0 + rsb r0, r0, #0 + vneg.f64 d1, d6 + vstr d0, [sp, #8] + vstr d1, [sp, #16] + b .Lselectregion + +.align 3 +.Lpio2_1: + .word 0x54400000, 0x3ff921fb +.Lpio2_1t: + .word 0x1a626331, 0x3dd0b461 +.Linvpio2: + .word 0x6dc9c883, 0x3fe45f30 +.Ltwo24: + .word 0x00000000, 0x41700000 +.Lpio2_2: + .word 0x1a600000, 0x3dd0b461 +.Lpio2_2t: + .word 0x2e037073, 0x3ba3198a +.Lpio2_3: + .word 0x2e000000, 0x3ba3198a +.Lpio2_3t: + .word 0x252049c1, 0x397b839a +.Lnpio2_hw_ptr: + .word .Lnpio2_hw-(.LPICnpio2_hw0+8) +.Ltwo_over_pi_ptr: + .word .Ltwo_over_pi-(.LPICtwo_over_pi0+8) +END(sin) + + .section .rodata.npio2_hw,"a",%progbits + .align 2 +.Lnpio2_hw = . + 0 + .type npio2_hw, %object + .size npio2_hw, 128 +npio2_hw: + .word 0x3ff921fb + .word 0x400921fb + .word 0x4012d97c + .word 0x401921fb + .word 0x401f6a7a + .word 0x4022d97c + .word 0x4025fdbb + .word 0x402921fb + .word 0x402c463a + .word 0x402f6a7a + .word 0x4031475c + .word 0x4032d97c + .word 0x40346b9c + .word 0x4035fdbb + .word 0x40378fdb + .word 0x403921fb + .word 0x403ab41b + .word 0x403c463a + .word 0x403dd85a + .word 0x403f6a7a + .word 0x40407e4c + .word 0x4041475c + .word 0x4042106c + .word 0x4042d97c + .word 0x4043a28c + .word 0x40446b9c + .word 0x404534ac + .word 0x4045fdbb + .word 0x4046c6cb + .word 0x40478fdb + .word 0x404858eb + .word 0x404921fb + + .section .rodata.two_over_pi,"a",%progbits + .align 2 +.Ltwo_over_pi = . 
+ 0 + .type two_over_pi, %object + .size two_over_pi, 264 +two_over_pi: + .word 0x00a2f983 + .word 0x006e4e44 + .word 0x001529fc + .word 0x002757d1 + .word 0x00f534dd + .word 0x00c0db62 + .word 0x0095993c + .word 0x00439041 + .word 0x00fe5163 + .word 0x00abdebb + .word 0x00c561b7 + .word 0x00246e3a + .word 0x00424dd2 + .word 0x00e00649 + .word 0x002eea09 + .word 0x00d1921c + .word 0x00fe1deb + .word 0x001cb129 + .word 0x00a73ee8 + .word 0x008235f5 + .word 0x002ebb44 + .word 0x0084e99c + .word 0x007026b4 + .word 0x005f7e41 + .word 0x003991d6 + .word 0x00398353 + .word 0x0039f49c + .word 0x00845f8b + .word 0x00bdf928 + .word 0x003b1ff8 + .word 0x0097ffde + .word 0x0005980f + .word 0x00ef2f11 + .word 0x008b5a0a + .word 0x006d1f6d + .word 0x00367ecf + .word 0x0027cb09 + .word 0x00b74f46 + .word 0x003f669e + .word 0x005fea2d + .word 0x007527ba + .word 0x00c7ebe5 + .word 0x00f17b3d + .word 0x000739f7 + .word 0x008a5292 + .word 0x00ea6bfb + .word 0x005fb11f + .word 0x008d5d08 + .word 0x00560330 + .word 0x0046fc7b + .word 0x006babf0 + .word 0x00cfbc20 + .word 0x009af436 + .word 0x001da9e3 + .word 0x0091615e + .word 0x00e61b08 + .word 0x00659985 + .word 0x005f14a0 + .word 0x0068408d + .word 0x00ffd880 + .word 0x004d7327 + .word 0x00310606 + .word 0x001556ca + .word 0x0073a8c9 + .word 0x0060e27b + .word 0x00c08c6b diff --git a/libm/src/e_pow.c b/libm/src/e_pow.c index d213132..bd82f30 100644 --- a/libm/src/e_pow.c +++ b/libm/src/e_pow.c @@ -61,6 +61,14 @@ static char rcsid[] = "$FreeBSD: src/lib/msun/src/e_pow.c,v 1.11 2005/02/04 18:2 #include "math.h" #include "math_private.h" +#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION) || defined(SCORPION_NEON_OPTIMIZATION) +#if defined(KRAIT_NO_AAPCS_VFP_MODE) +double pow_neon(double x, double y); +#else +double pow_neon(double x, double y, int32_t lx, int32_t hx) __attribute__((pcs("aapcs-vfp"))); +#endif +#endif + static const double bp[] = {1.0, 1.5,}, dp_h[] = { 0.0, 5.84962487220764160156e-01,}, /* 0x3FE2B803, 0x40000000 */ @@ -108,12 +116,32 @@ __ieee754_pow(double x, double y) ix = hx&0x7fffffff; iy = hy&0x7fffffff; /* y==zero: x**0 = 1 */ - if((iy|ly)==0) return one; - /* +-NaN return x+y */ - if(ix > 0x7ff00000 || ((ix==0x7ff00000)&&(lx!=0)) || - iy > 0x7ff00000 || ((iy==0x7ff00000)&&(ly!=0))) - return x+y; + if (ly == 0) { + if (hy == ly) { + /* y==0.0, x**0 = 1 */ + return one; + } + else if (iy > 0x7ff00000) { + /* y is NaN, return x+y (NaN) */ + return x+y; + } + } + else if (iy >= 0x7ff00000) { + /* y is NaN, return x+y (NaN) */ + return x+y; + } + + if (lx == 0) { + if (ix > 0x7ff00000) { + /* x is NaN, return x+y (NaN) */ + return x+y; + } + } + else if (ix >= 0x7ff00000) { + /* x is NaN, return x+y (NaN) */ + return x+y; + } /* determine if y is an odd int when x < 0 * yisint = 0 ... 
@@ -201,6 +229,14 @@ __ieee754_pow(double x, double y)
         t1 = u+v;
         SET_LOW_WORD(t1,0);
         t2 = v-(t1-u);
+#if defined(KRAIT_NEON_OPTIMIZATION) || defined(SPARROW_NEON_OPTIMIZATION) || defined(SCORPION_NEON_OPTIMIZATION)
+    } else if (ix <= 0x40100000 && iy <= 0x40100000 && hy > 0 && hx > 0) {
+#if defined(KRAIT_NO_AAPCS_VFP_MODE)
+        return pow_neon(x,y);
+#else
+        return pow_neon(x,y,lx,hx);
+#endif
+#endif
     } else {
         double ss,s2,s_h,s_l,t_h,t_l;
         n = 0;
diff --git a/libm/src/k_cos.c b/libm/src/k_cos.c
index 00916d7..b8cdf8f 100644
--- a/libm/src/k_cos.c
+++ b/libm/src/k_cos.c
@@ -69,6 +69,17 @@ C6 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
 double
 __kernel_cos(double x, double y)
 {
+#if defined(KRAIT_NEON_OPTIMIZATION)
+    double hz,z,zz,r,w,k;
+
+    z = x*x;
+    zz = z*z;
+    k = x*y;
+    hz = (float)0.5*z;
+    r = z*(z*(C1+z*(C2+z*((C3+z*C4)+zz*(C5+z*C6)))));
+    w = one-hz;
+    return w + (((one-w)-hz) + (r-k));
+#else
     double hz,z,r,w;
 
     z = x*x;
@@ -76,4 +87,5 @@ __kernel_cos(double x, double y)
     hz = (float)0.5*z;
     w = one-hz;
     return w + (((one-w)-hz) + (z*r-x*y));
+#endif
 }
diff --git a/libm/src/k_sin.c b/libm/src/k_sin.c
index ae06a9d..ee641d4 100644
--- a/libm/src/k_sin.c
+++ b/libm/src/k_sin.c
@@ -60,6 +60,16 @@ S6 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
 double
 __kernel_sin(double x, double y, int iy)
 {
+#if defined(KRAIT_NEON_OPTIMIZATION)
+    double z,zz,r,v;
+
+    z = x*x;
+    zz = z*z;
+    v = z*x;
+    r = S2+z*((S3+z*S4)+zz*(S5+z*S6));
+    if(iy==0) return x+v*(S1+z*r);
+    else return x-((z*(half*y-v*r)-y)-v*S1);
+#else
     double z,r,v;
 
     z = x*x;
@@ -67,4 +77,5 @@ __kernel_sin(double x, double y, int iy)
     r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
     if(iy==0) return x+v*(S1+z*r);
     else return x-((z*(half*y-v*r)-y)-v*S1);
+#endif
 }
diff --git a/libm/src/math_private.h b/libm/src/math_private.h
index 5f6e088..7cda2e9 100644
--- a/libm/src/math_private.h
+++ b/libm/src/math_private.h
@@ -257,11 +257,19 @@ cpackl(long double x, long double y)
 #define __ieee754_ldexpf ldexpf
 
 /* fdlibm kernel function */
+#if defined(KRAIT_NEON_OPTIMIZATION)
+int __ieee754_rem_pio2(double,double*) __attribute__((pcs("aapcs-vfp")));
+double __kernel_sin(double,double,int) __attribute__((pcs("aapcs-vfp")));
+double __kernel_cos(double,double) __attribute__((pcs("aapcs-vfp")));
+double __kernel_tan(double,double,int) __attribute__((pcs("aapcs-vfp")));
+int __kernel_rem_pio2(double*,double*,int,int,int,const int*) __attribute__((pcs("aapcs-vfp")));
+#else
 int __ieee754_rem_pio2(double,double*);
 double __kernel_sin(double,double,int);
 double __kernel_cos(double,double);
 double __kernel_tan(double,double,int);
 int __kernel_rem_pio2(double*,double*,int,int,int,const int*);
+#endif
 
 /* float versions of fdlibm kernel functions */
 int __ieee754_rem_pio2f(float,float*);
diff --git a/linker/Android.mk b/linker/Android.mk
index e8c81db..19f75c8 100644
--- a/linker/Android.mk
+++ b/linker/Android.mk
@@ -43,6 +43,9 @@ endif
 ifeq ($(TARGET_ARCH),mips)
     LOCAL_CFLAGS += -DANDROID_MIPS_LINKER
 endif
+ifeq ($(TARGET_HAVE_TEGRA_ERRATA_657451),true)
+    LOCAL_CFLAGS += -DHAVE_TEGRA_ERRATA_657451
+endif
 
 LOCAL_MODULE:= linker
 LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
diff --git a/linker/linker.cpp b/linker/linker.cpp
index 46d1335..2362099 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -641,33 +641,35 @@ static int open_library(const char *name)
     return -1;
 }
 
-// Returns 'true' if the library is prelinked or on failure so we error out
-// either way. We no longer support prelinking.
-static bool is_prelinked(int fd, const char* name)
+typedef struct {
+    long mmap_addr;
+    char tag[4]; /* 'P', 'R', 'E', ' ' */
+} prelink_info_t;
+
+/* Returns the requested base address if the library is prelinked,
+ * and 0 otherwise. */
+static unsigned long
+is_prelinked(int fd, const char *name)
 {
-    struct prelink_info_t {
-        long mmap_addr;
-        char tag[4]; // "PRE ".
-    };
-
     off_t sz = lseek(fd, -sizeof(prelink_info_t), SEEK_END);
     if (sz < 0) {
-        DL_ERR("lseek failed: %s", strerror(errno));
-        return true;
+        DL_ERR("lseek() failed!");
+        return 0;
     }
 
     prelink_info_t info;
     int rc = TEMP_FAILURE_RETRY(read(fd, &info, sizeof(info)));
     if (rc != sizeof(info)) {
-        DL_ERR("could not read prelink_info_t structure for \"%s\":", name, strerror(errno));
-        return true;
+        WARN("Could not read prelink_info_t structure for `%s`\n", name);
+        return 0;
     }
 
-    if (memcmp(info.tag, "PRE ", 4) == 0) {
-        DL_ERR("prelinked libraries no longer supported: %s", name);
-        return true;
+    if (memcmp(info.tag, "PRE ", 4)) {
+        WARN("`%s` is not a prelinked library\n", name);
+        return 0;
     }
-    return false;
+
+    return (unsigned long)info.mmap_addr;
 }
 
 /* verify_elf_header
@@ -781,10 +783,18 @@ static soinfo* load_library(const char* name)
         return NULL;
     }
 
-    // We no longer support pre-linked libraries.
-    if (is_prelinked(fd.fd, name)) {
+    unsigned req_base = (unsigned) is_prelinked(fd.fd, name);
+    if (req_base == (unsigned)-1) {
+        DL_ERR("%5d can't read end of library: %s: %s", pid, name,
+               strerror(errno));
         return NULL;
     }
+    if (req_base != 0) {
+        TRACE("[ %5d - Prelinked library '%s' requesting base @ 0x%08x ]\n",
+              pid, name, req_base);
+    } else {
+        TRACE("[ %5d - Non-prelinked library '%s' found. ]\n", pid, name);
+    }
 
     // Reserve address space for all loadable segments.
     void* load_start = NULL;
@@ -792,6 +802,7 @@
     Elf32_Addr load_bias = 0;
     ret = phdr_table_reserve_memory(phdr_table,
                                     phdr_count,
+                                    req_base,
                                     &load_start,
                                     &load_size,
                                     &load_bias);
diff --git a/linker/linker_phdr.c b/linker/linker_phdr.c
index 250ca20..36f848b 100644
--- a/linker/linker_phdr.c
+++ b/linker/linker_phdr.c
@@ -218,6 +218,8 @@ Elf32_Addr phdr_table_get_load_size(const Elf32_Phdr* phdr_table,
  * Input:
  *   phdr_table    -> program header table
  *   phdr_count    -> number of entries in the tables
+ *   required_base -> for prelinked libraries, mandatory load address
+ *                    of the first loadable segment. 0 otherwise.
 * Output:
 *   load_start -> first page of reserved address space range
 *   load_size  -> size in bytes of reserved address space range
@@ -229,18 +231,22 @@ Elf32_Addr phdr_table_get_load_size(const Elf32_Phdr* phdr_table,
 int
 phdr_table_reserve_memory(const Elf32_Phdr* phdr_table,
                           size_t phdr_count,
+                          Elf32_Addr required_base,
                           void** load_start,
                           Elf32_Addr* load_size,
                           Elf32_Addr* load_bias)
 {
     Elf32_Addr size = phdr_table_get_load_size(phdr_table, phdr_count);
+
     if (size == 0) {
         errno = EINVAL;
         return -1;
     }
 
     int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-    void* start = mmap(NULL, size, PROT_NONE, mmap_flags, -1, 0);
+    if (required_base != 0)
+        mmap_flags |= MAP_FIXED;
+    void* start = mmap((void*)required_base, size, PROT_NONE, mmap_flags, -1, 0);
     if (start == MAP_FAILED) {
        return -1;
     }
diff --git a/linker/linker_phdr.h b/linker/linker_phdr.h
index a759262..19e281b 100644
--- a/linker/linker_phdr.h
+++ b/linker/linker_phdr.h
@@ -61,6 +61,7 @@ phdr_table_get_load_size(const Elf32_Phdr* phdr_table,
 int
 phdr_table_reserve_memory(const Elf32_Phdr* phdr_table,
                           size_t phdr_count,
+                          Elf32_Addr required_base,
                           void** load_start,
                           Elf32_Addr* load_size,
                           Elf32_Addr* load_bias);
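
A note on the argument reduction in the sin.S code above: for arguments of moderate magnitude it subtracts n*pi/2 in stages, using a leading part of pi/2 whose mantissa ends in zero bits (.Lpio2_1) so the first multiply-subtract is exact, then folding in progressively smaller tails (.Lpio2_1t, .Lpio2_2/.Lpio2_2t, .Lpio2_3/.Lpio2_3t) when cancellation demands more precision. A minimal C sketch of the first stage; the constants are the standard fdlibm values matching the .word pairs above, while the driver around them is illustrative only and assumes a small positive x:

#include <math.h>
#include <stdio.h>

static const double
    invpio2 = 6.36619772367581382433e-01, /* 2/pi,      0x3fe45f30 6dc9c883 */
    pio2_1  = 1.57079632673412561417e+00, /* pi/2 head, 0x3ff921fb 54400000 */
    pio2_1t = 6.07710050650619224932e-11; /* pi/2 tail, 0x3dd0b461 1a626331 */

int main(void) {
    double x = 2.0;                    /* small positive sample argument */
    int n = (int)(x * invpio2 + 0.5);  /* nearest multiple of pi/2 */
    double r = x - n * pio2_1;         /* nearly exact: pio2_1 has a short mantissa */
    double y = r - n * pio2_1t;        /* fold in the 33-bit tail */
    /* here n % 4 == 1, so sin(x) == cos(y) */
    printf("y = %.17g\ncos(y) = %.17g\nsin(x) = %.17g\n", y, cos(y), sin(x));
    return 0;
}

The ubfx/rsb/cmp sequences in the assembly extract the remainder's exponent and compare the bits lost to cancellation against the thresholds 16 and 49, deciding whether the second (.Lpio2_2) and third (.Lpio2_3) stages are needed, mirroring fdlibm's __ieee754_rem_pio2.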
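The reworked entry checks in __ieee754_pow above replace fdlibm's combined zero/NaN test with word-level branches so that common inputs fall through after a couple of integer compares; note that the new test returns one only for +0.0, while -0.0 now takes the general path. A standalone sketch of the same high/low-word classification; extract_words and the sample values are illustrative, and the memcpy-based extraction assumes ordinary IEEE-754 doubles:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void extract_words(double d, int32_t *hi, uint32_t *lo) {
    uint64_t bits;
    memcpy(&bits, &d, sizeof bits);   /* type-pun via memcpy, not a cast */
    *hi = (int32_t)(bits >> 32);      /* sign, exponent, top mantissa bits */
    *lo = (uint32_t)bits;             /* low mantissa bits */
}

int main(void) {
    double samples[] = { 0.0, 2.5, INFINITY, NAN };
    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        int32_t hy; uint32_t ly;
        extract_words(samples[i], &hy, &ly);
        int32_t iy = hy & 0x7fffffff;            /* drop the sign bit */
        if (ly == 0 && hy == 0)
            printf("%g: y is +0, pow returns one\n", samples[i]);
        else if (iy > 0x7ff00000 || (iy == 0x7ff00000 && ly != 0))
            printf("%g: y is NaN, pow returns x+y\n", samples[i]);
        else
            printf("%g: ordinary or infinite y\n", samples[i]);
    }
    return 0;
}

The pow_neon dispatch added further down works on the same words: 0x40100000 is the high word of 4.0, so the NEON path fires only for positive x and y no larger than roughly 4.0.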
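The KRAIT_NEON_OPTIMIZATION variants of __kernel_sin and __kernel_cos above re-associate the fully nested Horner chain around zz = z*z; the polynomial computed is algebraically identical, but the two halves no longer depend on each other and can overlap on a pipelined FPU. A sketch of the equivalence, with generic coefficients c[0..4] standing in for S2..S6:

#include <stdio.h>

/* degree-4 polynomial in z, fully nested: four serial multiply-adds */
static double horner(double z, const double c[5]) {
    return c[0] + z*(c[1] + z*(c[2] + z*(c[3] + z*c[4])));
}

/* same polynomial split around zz = z*z: (c[1]+z*c[2]) and
 * (c[3]+z*c[4]) have no data dependence and can issue back to back,
 * shortening the critical path */
static double split(double z, const double c[5]) {
    double zz = z*z;
    return c[0] + z*((c[1] + z*c[2]) + zz*(c[3] + z*c[4]));
}

int main(void) {
    const double c[5] = { -0.5, 1.0/24, -1.0/720, 1.0/40320, -1.0/3628800 };
    double z = 0.3;
    printf("horner = %.17g\nsplit  = %.17g\n", horner(z, c), split(z, c));
    return 0;
}

The two forms can differ in the last unit or so of precision because the rounding order changes, which is presumably why the rewrite is confined to the tuned #if branch rather than applied unconditionally.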
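The math_private.h hunk above re-declares the fdlibm kernels with __attribute__((pcs("aapcs-vfp"))) so that softfp-compiled callers pass doubles in VFP d-registers when the kernels themselves use the hard-float variant of the ABI, avoiding r-register shuffles on every call. A minimal sketch of the pattern; fast_scale is a hypothetical function, and the snippet only makes sense for a 32-bit ARM target built with -mfloat-abi=softfp:

/* declaration visible to softfp callers: the attribute switches this
 * callee to the VFP register convention */
double fast_scale(double x, double k) __attribute__((pcs("aapcs-vfp")));

double caller(double v) {
    return fast_scale(v, 2.0);  /* x and k travel in d0/d1, not r0-r3 */
}

This is also why the patch keeps a KRAIT_NO_AAPCS_VFP_MODE fallback: when the attribute cannot be used, pow_neon is declared with the default (softfp) convention instead.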
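The restored is_prelinked above recognizes a prelinked .so by a fixed-size trailer at the very end of the file: seek to -sizeof(prelink_info_t) from SEEK_END, read the record, and check the "PRE " tag. A standalone sketch of the same probe; the struct mirrors the patch, while read_prelink_base and the CLI driver are illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

typedef struct {
    long mmap_addr;        /* requested load address */
    char tag[4];           /* 'P', 'R', 'E', ' ' */
} prelink_info_t;

/* returns the requested base, or 0 if the file carries no trailer */
static unsigned long read_prelink_base(const char *path) {
    int fd = open(path, O_RDONLY);
    if (fd < 0)
        return 0;
    prelink_info_t info;
    unsigned long base = 0;
    if (lseek(fd, -(off_t)sizeof(info), SEEK_END) >= 0 &&
        read(fd, &info, sizeof(info)) == (ssize_t)sizeof(info) &&
        memcmp(info.tag, "PRE ", 4) == 0)
        base = (unsigned long)info.mmap_addr;
    close(fd);
    return base;
}

int main(int argc, char **argv) {
    for (int i = 1; i < argc; i++)
        printf("%s: base 0x%08lx\n", argv[i], read_prelink_base(argv[i]));
    return 0;
}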
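Finally, phdr_table_reserve_memory above pins the address-space reservation with MAP_FIXED whenever a prelink base was requested. A sketch of that reservation pattern with illustrative constants (the 0x10000000 base and 256 KiB size are not values from the patch):

#include <stdio.h>
#include <sys/mman.h>

int main(void) {
    size_t size = 0x40000;                      /* hypothetical load size */
    unsigned long required_base = 0x10000000UL; /* hypothetical prelink base */

    int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS;
    if (required_base != 0)
        mmap_flags |= MAP_FIXED;                /* must land exactly there */

    /* PROT_NONE: reserve address space only; the ELF segments are
     * mapped over the reservation afterwards */
    void *start = mmap((void *)required_base, size, PROT_NONE, mmap_flags, -1, 0);
    if (start == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("reserved %zu bytes at %p\n", size, start);
    munmap(start, size);
    return 0;
}

MAP_FIXED silently replaces any mapping already occupying the requested range, so this scheme depends on the prelink address map being laid out to avoid collisions between libraries.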