-rw-r--r--  libc/Android.mk                |  21
-rw-r--r--  libc/arch-arm/bionic/memcpy.S  | 124
-rw-r--r--  libc/arch-arm/bionic/memmove.S | 356
-rw-r--r--  libc/arch-arm/bionic/memset.S  |  87
4 files changed, 573 insertions, 15 deletions
diff --git a/libc/Android.mk b/libc/Android.mk
index 9e6bdfb..dcd7ba4 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -355,12 +355,21 @@ libc_common_src_files += \
 	arch-arm/bionic/sigsetjmp.S \
 	arch-arm/bionic/strlen.c.arm \
 	arch-arm/bionic/syscall.S \
-	string/memmove.c.arm \
-	string/bcopy.c \
 	string/strcmp.c \
 	string/strncmp.c \
 	unistd/socketcalls.c
 
+# Check if we want a neonized version of memmove instead of the
+# current ARM version
+ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+libc_common_src_files += \
+	arch-arm/bionic/memmove.S
+else # Non-Scorpion-based ARM
+libc_common_src_files += \
+	string/bcopy.c \
+	string/memmove.c.arm
+endif # !TARGET_USE_SCORPION_BIONIC_OPTIMIZATION
+
 # These files need to be arm so that gdbserver
 # can set breakpoints in them without messing
 # up any thumb code.
@@ -494,6 +503,14 @@ ifeq ($(TARGET_ARCH),arm)
   ifeq ($(ARCH_ARM_HAVE_TLS_REGISTER),true)
     libc_common_cflags += -DHAVE_ARM_TLS_REGISTER
   endif
+  # Add in defines to activate SCORPION_NEON_OPTIMIZATION
+  ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+    libc_common_cflags += -DSCORPION_NEON_OPTIMIZATION
+    ifeq ($(TARGET_USE_SCORPION_PLD_SET),true)
+      libc_common_cflags += -DPLDOFFS=$(TARGET_SCORPION_BIONIC_PLDOFFS)
+      libc_common_cflags += -DPLDSIZE=$(TARGET_SCORPION_BIONIC_PLDSIZE)
+    endif
+  endif
 else # !arm
 ifeq ($(TARGET_ARCH),x86)
 libc_crt_target_cflags := -m32
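[Annotation, not part of the patch] The Android.mk hunks make the Scorpion path opt-in per board: BoardConfig.mk sets TARGET_USE_SCORPION_BIONIC_OPTIMIZATION to pull in the NEON memmove.S and define SCORPION_NEON_OPTIMIZATION, and may override the prefetch tuning via PLDOFFS/PLDSIZE. A minimal C sketch of how those -D values combine, assuming the defaults from the assembly below (PLDOFFS=6, PLDSIZE=128); the file name and printout are illustrative only:

    /* pld_check.c - hypothetical sanity check; build with, e.g.,
     * cc -DPLDOFFS=6 -DPLDSIZE=128 pld_check.c */
    #include <stdio.h>

    #ifndef PLDOFFS
    #define PLDOFFS 6      /* default used when TARGET_USE_SCORPION_PLD_SET is unset */
    #endif
    #ifndef PLDSIZE
    #define PLDSIZE 128    /* default: Scorpion L2 cache line size in bytes */
    #endif

    int main(void) {
        /* The 128-byte copy loops issue a pld PLDOFFS cache lines ahead of
         * the loads, so the effective lookahead is PLDOFFS*PLDSIZE bytes. */
        printf("prefetch lookahead = %d bytes\n", PLDOFFS * PLDSIZE);
        return 0;
    }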
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index b8d1007..e92ff5e 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -29,7 +31,114 @@
 #include <machine/cpu-features.h>
 
 #if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+	.code 32
+	.align 5
+	.globl memcpy
+	.func
+memcpy:
+	push	{r0}
+	cmp	r2, #4
+	blt	.Lneon_lt4
+	cmp	r2, #16
+	blt	.Lneon_lt16
+	cmp	r2, #32
+	blt	.Lneon_16
+	cmp	r2, #128
+	blt	.Lneon_copy_32_a
+	/* Copy blocks of 128 bytes (word-aligned) at a time */
+	/* Code below is optimized for PLDSIZE=128 only */
+	mov	r12, r2, lsr #7
+	cmp	r12, #PLDOFFS
+	ble	.Lneon_copy_128_loop_nopld
+	sub	r12, #PLDOFFS
+	pld	[r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+	pld	[r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	bne	.Lneon_copy_128_loop_outer
+	mov	r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	bne	.Lneon_copy_128_loop_nopld
+	ands	r2, r2, #0x7f
+	beq	.Lneon_exit
+	cmp	r2, #32
+	blt	.Lneon_16
+	nop
+	/* Copy blocks of 32 bytes (word-aligned) at a time */
+.Lneon_copy_32_a:
+	mov	r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+	vld1.32	{q0, q1}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	bne	.Lneon_copy_32_loop_a
+	ands	r2, r2, #0x1f
+	beq	.Lneon_exit
+.Lneon_16:
+	subs	r2, r2, #16
+	blt	.Lneon_lt16
+	vld1.32	{q8}, [r1]!
+	vst1.32	{q8}, [r0]!
+	beq	.Lneon_exit
+.Lneon_lt16:
+	movs	r12, r2, lsl #29
+	bcc	.Lneon_skip8
+	ldr	r3, [r1], #4
+	ldr	r12, [r1], #4
+	str	r3, [r0], #4
+	str	r12, [r0], #4
+.Lneon_skip8:
+	bpl	.Lneon_lt4
+	ldr	r3, [r1], #4
+	str	r3, [r0], #4
+.Lneon_lt4:
+	movs	r2, r2, lsl #31
+	bcc	.Lneon_lt2
+	ldrh	r3, [r1], #2
+	strh	r3, [r0], #2
+.Lneon_lt2:
+	bpl	.Lneon_exit
+	ldrb	r12, [r1]
+	strb	r12, [r0]
+.Lneon_exit:
+	pop	{r0}
+	bx	lr
+	.endfunc
+	.end
+#else /* !SCORPION_NEON_OPTIMIZATION */
 	.text
 	.fpu	neon
 
@@ -145,7 +254,7 @@ memcpy:
 	bx	lr
 	.fnend
 
-
+#endif /* !SCORPION_NEON_OPTIMIZATION */
 
 #else	/* __ARM_ARCH__ < 7 */
 
@@ -260,31 +369,20 @@ cached_aligned32:
  *
  */
 
-#if __ARM_ARCH__ == 5
 	// Align the preload register to a cache-line because the cpu does
 	// "critical word first" (the first word requested is loaded first).
 	bic	r12, r1, #0x1F
 	add	r12, r12, #64
-#endif
 
 1:	ldmia	r1!, { r4-r11 }
-
-#if __ARM_ARCH__ == 5
 	PLD	(r12, #64)
-#else
-	PLD	(r1, #64)
-#endif
-
 	subs	r2, r2, #32
-
-#if __ARM_ARCH__ == 5
 	// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
 	// for ARM9 preload will not be safely guarded by the preceding subs.
 	// When it is safely guarded the only possibility to have SIGSEGV here
 	// is because the caller overstates the length.
 	ldrhi	r3, [r12], #32	/* cheap ARM9 preload */
-#endif
-
 	stmia	r0!, { r4-r11 }
 	bhs	1b
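[Annotation, not part of the patch] The NEON memcpy finishes the last 0-15 bytes with a flag trick: "movs r12, r2, lsl #29" shifts the length so bit 3 lands in the carry flag and bit 2 in the negative flag, letting one flag-setting shift replace two compares; "movs r2, r2, lsl #31" repeats this for bits 1 and 0. A rough C equivalent of that tail dispatch, for illustration only (the helper name is invented):

    /* tail_copy.c - mirrors the 8/4/2/1-byte tail of the NEON memcpy */
    #include <stdint.h>
    #include <string.h>

    static void tail_copy(uint8_t *dst, const uint8_t *src, uint32_t n) {
        /* n < 16 here, as in the assembly after the 16-byte block */
        if (n & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }  /* bcc skips this */
        if (n & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }  /* bpl skips this */
        if (n & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
        if (n & 1) { *dst = *src; }
    }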
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..1234195
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S
@@ -0,0 +1,356 @@
+/***************************************************************************
+ Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Code Aurora nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: Attempts to do a memmove with Neon registers if possible.
+ * Inputs:
+ *   dest: The destination buffer
+ *   src: The source buffer
+ *   n: The size of the buffer to transfer
+ * Outputs:
+ *
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+
+	.code 32
+	.align 5
+	.global memmove
+	.type memmove, %function
+
+	.global bcopy
+	.type bcopy, %function
+
+bcopy:
+	mov	r12, r0
+	mov	r0, r1
+	mov	r1, r12
+memmove:
+	push	{r0}
+
+	/*
+	 * The requirements for memmove state that the function should
+	 * operate as if data were being copied from the source to a
+	 * buffer, then to the destination. This is to allow a user
+	 * to copy data from a source and target that overlap.
+	 *
+	 * We can't just do byte copies front-to-back automatically, since
+	 * there's a good chance we may have an overlap (why else would someone
+	 * intentionally use memmove then?).
+	 *
+	 * We'll break this into two parts. Front-to-back, or back-to-front
+	 * copies.
+	 */
+.Lneon_memmove_cmf:
+	cmp	r0, r1
+	blt	.Lneon_front_to_back_copy
+	bgt	.Lneon_back_to_front_copy
+	b	.Lneon_memmove_done
+
+	/* #############################################################
+	 * Front to Back copy
+	 */
+.Lneon_front_to_back_copy:
+	/*
+	 * For small copies, just do a quick memcpy. We can do this for
+	 * front-to-back copies, aligned or unaligned, since we're only
+	 * doing 1 byte at a time...
+	 */
+	cmp	r2, #4
+	bgt	.Lneon_f2b_gt4
+	cmp	r2, #0
+.Lneon_f2b_smallcopy_loop:
+	beq	.Lneon_memmove_done
+	ldrb	r12, [r1], #1
+	subs	r2, r2, #1
+	strb	r12, [r0], #1
+	b	.Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+	/* The window size is in r3. */
+	sub	r3, r1, r0
+	/* #############################################################
+	 * Front to Back copy
+	 */
+	/*
+	 * Note that we can't just route based on the size in r2. If that's
+	 * larger than the overlap window in r3, we could potentially
+	 * (and likely!) destroy data we're copying.
+	 */
+	cmp	r2, r3
+	movle	r12, r2
+	movgt	r12, r3
+	cmp	r12, #256
+	bge	.Lneon_f2b_copy_128
+	cmp	r12, #64
+	bge	.Lneon_f2b_copy_32
+	cmp	r12, #16
+	bge	.Lneon_f2b_copy_16
+	cmp	r12, #8
+	bge	.Lneon_f2b_copy_8
+	cmp	r12, #4
+	bge	.Lneon_f2b_copy_4
+	b	.Lneon_f2b_copy_1
+	nop
+.Lneon_f2b_copy_128:
+	mov	r12, r2, lsr #7
+	cmp	r12, #PLDOFFS
+	ble	.Lneon_f2b_copy_128_loop_nopld
+	sub	r12, #PLDOFFS
+	pld	[r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+	pld	[r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	bne	.Lneon_f2b_copy_128_loop_outer
+	mov	r12, #PLDOFFS
+.Lneon_f2b_copy_128_loop_nopld:
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	bne	.Lneon_f2b_copy_128_loop_nopld
+	ands	r2, r2, #0x7f
+	beq	.Lneon_memmove_done
+	cmp	r2, #32
+	bge	.Lneon_f2b_copy_32
+	b	.Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+	mov	r12, r2, lsr #5
+.Lneon_f2b_copy_32_loop:
+	vld1.32	{q0, q1}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	bne	.Lneon_f2b_copy_32_loop
+	ands	r2, r2, #0x1f
+	beq	.Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+	movs	r12, r2, lsr #4
+	beq	.Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+	vld1.32	{q0}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0}, [r0]!
+	bne	.Lneon_f2b_copy_16_loop
+	ands	r2, r2, #0xf
+	beq	.Lneon_memmove_done
+.Lneon_f2b_copy_8:
+	movs	r12, r2, lsr #3
+	beq	.Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+	vld1.32	{d0}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{d0}, [r0]!
+	bne	.Lneon_f2b_copy_8_loop
+	ands	r2, r2, #0x7
+	beq	.Lneon_memmove_done
+.Lneon_f2b_copy_4:
+	movs	r12, r2, lsr #2
+	beq	.Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+	ldr	r3, [r1], #4
+	subs	r12, r12, #1
+	str	r3, [r0], #4
+	bne	.Lneon_f2b_copy_4_loop
+	ands	r2, r2, #0x3
+	nop
+.Lneon_f2b_copy_1:
+	cmp	r2, #0
+	beq	.Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+	ldrb	r12, [r1], #1
+	subs	r2, r2, #1
+	strb	r12, [r0], #1
+	bne	.Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+	b	.Lneon_memmove_done
+
+	/* #############################################################
+	 * Back to Front copy
+	 */
+.Lneon_back_to_front_copy:
+	/*
+	 * Here, we'll want to shift to the end of the buffers. This
+	 * actually points us one past where we need to go, but since
+	 * we'll pre-decrement throughout, this will be fine.
+	 */
+	add	r0, r0, r2
+	add	r1, r1, r2
+	cmp	r2, #4
+	bgt	.Lneon_b2f_gt4
+	cmp	r2, #0
+.Lneon_b2f_smallcopy_loop:
+	beq	.Lneon_memmove_done
+	ldrb	r12, [r1, #-1]!
+	subs	r2, r2, #1
+	strb	r12, [r0, #-1]!
+	b	.Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+	/* The window size is in r3. */
+	sub	r3, r0, r1
+	/*
+	 * #############################################################
+	 * Back to Front copy -
+	 */
+	cmp	r2, r3
+	movle	r12, r2
+	movgt	r12, r3
+	cmp	r12, #256
+	bge	.Lneon_b2f_copy_128
+	cmp	r12, #64
+	bge	.Lneon_b2f_copy_32
+	cmp	r12, #8
+	bge	.Lneon_b2f_copy_8
+	cmp	r12, #4
+	bge	.Lneon_b2f_copy_4
+	b	.Lneon_b2f_copy_1
+	nop
+.Lneon_b2f_copy_128:
+	movs	r12, r2, lsr #7
+	cmp	r12, #PLDOFFS
+	ble	.Lneon_b2f_copy_128_loop_nopld
+	sub	r12, #PLDOFFS
+	pld	[r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+	pld	[r1, #-(PLDOFFS*PLDSIZE)]
+	sub	r1, r1, #128
+	sub	r0, r0, #128
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	sub	r1, r1, #128
+	sub	r0, r0, #128
+	bne	.Lneon_b2f_copy_128_loop_outer
+	mov	r12, #PLDOFFS
+.Lneon_b2f_copy_128_loop_nopld:
+	sub	r1, r1, #128
+	sub	r0, r0, #128
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	sub	r1, r1, #128
+	sub	r0, r0, #128
+	bne	.Lneon_b2f_copy_128_loop_nopld
+	ands	r2, r2, #0x7f
+	beq	.Lneon_memmove_done
+	cmp	r2, #32
+	bge	.Lneon_b2f_copy_32
+	b	.Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+	mov	r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+	sub	r1, r1, #32
+	sub	r0, r0, #32
+	vld1.32	{q0, q1}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]
+	bne	.Lneon_b2f_copy_32_loop
+	ands	r2, r2, #0x1f
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+	movs	r12, r2, lsr #0x3
+	beq	.Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+	sub	r1, r1, #8
+	sub	r0, r0, #8
+	vld1.32	{d0}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{d0}, [r0]
+	bne	.Lneon_b2f_copy_8_loop
+	ands	r2, r2, #0x7
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_4:
+	movs	r12, r2, lsr #0x2
+	beq	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+	ldr	r3, [r1, #-4]!
+	subs	r12, r12, #1
+	str	r3, [r0, #-4]!
+	bne	.Lneon_b2f_copy_4_loop
+	ands	r2, r2, #0x3
+	nop
+.Lneon_b2f_copy_1:
+	cmp	r2, #0
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+	ldrb	r12, [r1, #-1]!
+	subs	r2, r2, #1
+	strb	r12, [r0, #-1]!
+	bne	.Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+	pop	{r0}
+	bx	lr
+
+	.end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
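[Annotation, not part of the patch] As the comments in memmove.S explain, the copy direction follows the pointer order, and block-size routing uses min(n, window), where the window is the distance between src and dest; blocks larger than the window would read bytes the copy had already overwritten. A compilable C sketch of that routing rule (the function name is invented for illustration):

    /* overlap_window.c - the block-size bound memmove's dispatch obeys */
    #include <stddef.h>
    #include <stdint.h>

    static size_t max_safe_block(void *dest, const void *src, size_t n) {
        uintptr_t d = (uintptr_t)dest, s = (uintptr_t)src;
        if (d == s || n == 0)
            return 0;                      /* nothing to move */
        /* front-to-back when dest < src, back-to-front otherwise */
        size_t window = (d < s) ? (size_t)(s - d) : (size_t)(d - s);
        return (n < window) ? n : window;  /* min(n, window) */
    }

The assembly compares this value against fixed thresholds (256, 64, 16, 8, 4) to pick the 128-, 32-, 16-, 8-, 4- or 1-byte copy loop.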
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S
index 93abe15..8ecd80c 100644
--- a/libc/arch-arm/bionic/memset.S
+++ b/libc/arch-arm/bionic/memset.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -25,6 +27,90 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	.code 32
+	.align 8
+	.global memset
+	.type memset, %function
+
+	.global bzero
+	.type bzero, %function
+
+bzero:
+	mov	r2, r1
+	mov	r1, #0
+memset:
+	push	{r0}
+
+	cmp	r2, #6
+	bgt	.Lmemset_gt6
+	cmp	r2, #0
+	beq	.Lmemset_smallcopy_done
+.Lmemset_smallcopy_loop:
+	strb	r1, [r0], #1
+	subs	r2, r2, #1
+	bne	.Lmemset_smallcopy_loop
+.Lmemset_smallcopy_done:
+	pop	{r0}
+	bx	lr
+
+.Lmemset_gt6:
+	vdup.8	q0, r1
+	vmov	r1, s0
+
+	/*
+	 * Decide where to route for the maximum copy sizes.
+	 */
+	cmp	r2, #4
+	blt	.Lmemset_lt4
+	cmp	r2, #16
+	blt	.Lmemset_lt16
+	vmov	q1, q0
+	cmp	r2, #128
+	blt	.Lmemset_32
+.Lmemset_128:
+	mov	r12, r2, lsr #7
+.Lmemset_128_loop:
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q0, q1}, [r0]!
+	subs	r12, r12, #1
+	bne	.Lmemset_128_loop
+	ands	r2, r2, #0x7f
+	beq	.Lmemset_end
+.Lmemset_32:
+	movs	r12, r2, lsr #5
+	beq	.Lmemset_lt32
+.Lmemset_32_loop:
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	bne	.Lmemset_32_loop
+	ands	r2, r2, #0x1f
+	beq	.Lmemset_end
+.Lmemset_lt32:
+	cmp	r2, #16
+	blt	.Lmemset_lt16
+	vst1.64	{q0}, [r0]!
+	subs	r2, r2, #16
+	beq	.Lmemset_end
+.Lmemset_lt16:
+	movs	r12, r2, lsl #29
+	strcs	r1, [r0], #4
+	strcs	r1, [r0], #4
+	strmi	r1, [r0], #4
+.Lmemset_lt4:
+	movs	r2, r2, lsl #31
+	strcsh	r1, [r0], #2
+	strmib	r1, [r0]
+.Lmemset_end:
+	pop	{r0}
+	bx	lr
+
+	.end
+#else /* !SCORPION_NEON_OPTIMIZATION */
 
 	.text
 
 	.global memset
@@ -115,3 +201,4 @@ memset:
 	bx	lr
 	.fnend
 
+#endif /* SCORPION_NEON_OPTIMIZATION */
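[Annotation, not part of the patch] In the NEON memset, "vdup.8 q0, r1" replicates the fill byte into all 16 lanes of q0, and "vmov r1, s0" pulls one replicated 32-bit word back out so the conditional scalar stores (strcs/strmi) in the tail can write 4 bytes at a time. The classic integer equivalent of that splat, as a hedged sketch:

    /* splat.c - byte replication used by the memset fast path */
    #include <stdint.h>

    static uint32_t splat_byte(uint8_t c) {
        return (uint32_t)c * 0x01010101u;  /* copies c into all four bytes */
    }

bzero reuses the same body: it moves the length into r2, forces the fill value to zero, and falls through into memset, just as bcopy in memmove.S swaps its argument order and falls through into memmove.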