summaryrefslogtreecommitdiffstats
path: root/libc/arch-arm
diff options
context:
space:
mode:
Diffstat (limited to 'libc/arch-arm')
-rw-r--r--libc/arch-arm/bionic/memcpy.S115
-rw-r--r--libc/arch-arm/bionic/memmove.S356
-rw-r--r--libc/arch-arm/bionic/memset.S87
3 files changed, 555 insertions, 3 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index ba55996..e92ff5e 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -29,7 +31,114 @@
#include <machine/cpu-features.h>
#if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+ .code 32
+ .align 5
+ .globl memcpy
+ .func
+memcpy:
+ push {r0}
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16
+ cmp r2, #128
+ blt .Lneon_copy_32_a
+ /* Copy blocks of 128-bytes (word-aligned) at a time*/
+ /* Code below is optimized for PLDSIZE=128 only */
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_exit
+ cmp r2, #32
+ blt .Lneon_16
+ nop
+ /* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+ mov r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_copy_32_loop_a
+ ands r2, r2, #0x1f
+ beq .Lneon_exit
+.Lneon_16:
+ subs r2, r2, #16
+ blt .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ beq .Lneon_exit
+.Lneon_lt16:
+ movs r12, r2, lsl #29
+ bcc .Lneon_skip8
+ ldr r3, [r1], #4
+ ldr r12, [r1], #4
+ str r3, [r0], #4
+ str r12, [r0], #4
+.Lneon_skip8:
+ bpl .Lneon_lt4
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+.Lneon_lt4:
+ movs r2, r2, lsl #31
+ bcc .Lneon_lt2
+ ldrh r3, [r1], #2
+ strh r3, [r0], #2
+.Lneon_lt2:
+ bpl .Lneon_exit
+ ldrb r12, [r1]
+ strb r12, [r0]
+.Lneon_exit:
+ pop {r0}
+ bx lr
+ .endfunc
+ .end
+#else /* !SCORPION_NEON_OPTIMIZATION */
.text
.fpu neon
@@ -38,7 +147,7 @@
.align 4
/* a prefetch distance of 4 cache-lines works best experimentally */
-#define CACHE_LINE_SIZE 64
+#define CACHE_LINE_SIZE 32
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
memcpy:
@@ -145,7 +254,7 @@ memcpy:
bx lr
.fnend
-
+#endif /* !SCORPION_NEON_OPTIMIZATION */
#else /* __ARM_ARCH__ < 7 */
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..1234195
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S
@@ -0,0 +1,356 @@
+/***************************************************************************
+ Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Code Aurora nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: Attempts to do a memmove with Neon registers if possible,
+ * Inputs:
+ * dest: The destination buffer
+ * src: The source buffer
+ * n: The size of the buffer to transfer
+ * Outputs:
+ *
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+
+ .code 32
+ .align 5
+ .global memmove
+ .type memmove, %function
+
+ .global bcopy
+ .type bcopy, %function
+
+bcopy:
+ mov r12, r0
+ mov r0, r1
+ mov r1, r12
+memmove:
+ push {r0}
+
+ /*
+ * The requirements for memmove state that the function should
+ * operate as if data were being copied from the source to a
+ * buffer, then to the destination. This is to allow a user
+ * to copy data from a source and target that overlap.
+ *
+ * We can't just do byte copies front-to-back automatically, since
+ * there's a good chance we may have an overlap (why else would someone
+ * intentionally use memmove then?).
+ *
+ * We'll break this into two parts. Front-to-back, or back-to-front
+ * copies.
+ */
+.Lneon_memmove_cmf:
+ cmp r0, r1
+ blt .Lneon_front_to_back_copy
+ bgt .Lneon_back_to_front_copy
+ b .Lneon_memmove_done
+
+ /* #############################################################
+ * Front to Back copy
+ */
+.Lneon_front_to_back_copy:
+ /*
+ * For small copies, just do a quick memcpy. We can do this for
+ * front-to-back copies, aligned or unaligned, since we're only
+ * doing 1 byte at a time...
+ */
+ cmp r2, #4
+ bgt .Lneon_f2b_gt4
+ cmp r2, #0
+.Lneon_f2b_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1], #1
+ subs r2, r2, #1
+ strb r12, [r0], #1
+ b .Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+ /* The window size is in r3. */
+ sub r3, r1, r0
+ /* #############################################################
+ * Front to Back copy
+ */
+ /*
+ * Note that we can't just route based on the size in r2. If that's
+ * larger than the overlap window in r3, we could potentially
+ * (and likely!) destroy data we're copying.
+ */
+ cmp r2, r3
+ movle r12, r2
+ movgt r12, r3
+ cmp r12, #256
+ bge .Lneon_f2b_copy_128
+ cmp r12, #64
+ bge .Lneon_f2b_copy_32
+ cmp r12, #16
+ bge .Lneon_f2b_copy_16
+ cmp r12, #8
+ bge .Lneon_f2b_copy_8
+ cmp r12, #4
+ bge .Lneon_f2b_copy_4
+ b .Lneon_f2b_copy_1
+ nop
+.Lneon_f2b_copy_128:
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_f2b_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0,q1}, [r1]!
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q8,q9}, [r1]!
+ vld1.32 {q10,q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ vst1.32 {q2,q3}, [r0]!
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q10,q11}, [r0]!
+ bne .Lneon_f2b_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_f2b_copy_128_loop_nopld:
+ vld1.32 {q0,q1}, [r1]!
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q8,q9}, [r1]!
+ vld1.32 {q10,q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ vst1.32 {q2,q3}, [r0]!
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q10,q11}, [r0]!
+ bne .Lneon_f2b_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_memmove_done
+ cmp r2, #32
+ bge .Lneon_f2b_copy_32
+ b .Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_f2b_copy_32_loop:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_f2b_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+ movs r12, r2, lsr #4
+ beq .Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+ vld1.32 {q0}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne .Lneon_f2b_copy_16_loop
+ ands r2, r2, #0xf
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_8:
+ movs r12, r2, lsr #3
+ beq .Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+ vld1.32 {d0}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]!
+ bne .Lneon_f2b_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_4:
+ movs r12, r2, lsr #2
+ beq .Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+ ldr r3, [r1], #4
+ subs r12, r12, #1
+ str r3, [r0], #4
+ bne .Lneon_f2b_copy_4_loop
+ ands r2, r2, #0x3
+ nop
+.Lneon_f2b_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+ ldrb r12, [r1], #1
+ subs r2, r2, #1
+ strb r12, [r0], #1
+ bne .Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+ b .Lneon_memmove_done
+
+ /* #############################################################
+ * Back to Front copy
+ */
+.Lneon_back_to_front_copy:
+ /*
+ * Here, we'll want to shift to the end of the buffers. This
+ * actually points us one past where we need to go, but since
+ * we'll pre-decrement throughout, this will be fine.
+ */
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #4
+ bgt .Lneon_b2f_gt4
+ cmp r2, #0
+.Lneon_b2f_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ b .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+ /*
+ * The minimum of the overlap window size and the copy size
+ * is in r3.
+ */
+ sub r3, r0, r1
+ /*
+ * #############################################################
+ * Back to Front copy -
+ */
+ cmp r2, r3
+ movle r12, r2
+ movgt r12, r3
+ cmp r12, #256
+ bge .Lneon_b2f_copy_128
+ cmp r12, #64
+ bge .Lneon_b2f_copy_32
+ cmp r12, #8
+ bge .Lneon_b2f_copy_8
+ cmp r12, #4
+ bge .Lneon_b2f_copy_4
+ b .Lneon_b2f_copy_1
+ nop
+.Lneon_b2f_copy_128:
+ movs r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_b2f_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+ pld [r1, #-(PLDOFFS*PLDSIZE)]
+ sub r1, r1, #128
+ sub r0, r0, #128
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ sub r1, r1, #128
+ sub r0, r0, #128
+ bne .Lneon_b2f_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_b2f_copy_128_loop_nopld:
+ sub r1, r1, #128
+ sub r0, r0, #128
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ sub r1, r1, #128
+ sub r0, r0, #128
+ bne .Lneon_b2f_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_memmove_done
+ cmp r2, #32
+ bge .Lneon_b2f_copy_32
+ b .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+ sub r1, r1, #32
+ sub r0, r0, #32
+ vld1.32 {q0,q1}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]
+ bne .Lneon_b2f_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ beq .Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+ movs r12, r2, lsr #0x2
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+ ldr r3, [r1, #-4]!
+ subs r12, r12, #1
+ str r3, [r0, #-4]!
+ bne .Lneon_b2f_copy_4_loop
+ ands r2, r2, #0x3
+ nop
+.Lneon_b2f_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ bne .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+ pop {r0}
+ bx lr
+
+ .end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S
index 93abe15..8ecd80c 100644
--- a/libc/arch-arm/bionic/memset.S
+++ b/libc/arch-arm/bionic/memset.S
@@ -2,6 +2,8 @@
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -25,6 +27,90 @@
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ .code 32
+ .align 8
+ .global memset
+ .type memset, %function
+
+ .global bzero
+ .type bzero, %function
+
+bzero:
+ mov r2, r1
+ mov r1, #0
+memset:
+ push {r0}
+
+ cmp r2, #6
+ bgt .Lmemset_gt6
+ cmp r2, #0
+ beq .Lmemset_smallcopy_done
+.Lmemset_smallcopy_loop:
+ strb r1, [r0], #1
+ subs r2, r2, #1
+ bne .Lmemset_smallcopy_loop
+.Lmemset_smallcopy_done:
+ pop {r0}
+ bx lr
+
+.Lmemset_gt6:
+ vdup.8 q0, r1
+ vmov r1, s0
+
+ /*
+ * Decide where to route for the maximum copy sizes.
+ */
+ cmp r2, #4
+ blt .Lmemset_lt4
+ cmp r2, #16
+ blt .Lmemset_lt16
+ vmov q1, q0
+ cmp r2, #128
+ blt .Lmemset_32
+.Lmemset_128:
+ mov r12, r2, lsr #7
+.Lmemset_128_loop:
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne .Lmemset_128_loop
+ ands r2, r2, #0x7f
+ beq .Lmemset_end
+.Lmemset_32:
+ movs r12, r2, lsr #5
+ beq .Lmemset_lt32
+.Lmemset_32_loop:
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ bne .Lmemset_32_loop
+ ands r2, r2, #0x1f
+ beq .Lmemset_end
+.Lmemset_lt32:
+ cmp r2, #16
+ blt .Lmemset_lt16
+ vst1.64 {q0}, [r0]!
+ subs r2, r2, #16
+ beq .Lmemset_end
+.Lmemset_lt16:
+ movs r12, r2, lsl #29
+ strcs r1, [r0], #4
+ strcs r1, [r0], #4
+ strmi r1, [r0], #4
+.Lmemset_lt4:
+ movs r2, r2, lsl #31
+ strcsh r1, [r0], #2
+ strmib r1, [r0]
+.Lmemset_end:
+ pop {r0}
+ bx lr
+
+ .end
+#else /* !SCORPION_NEON_OPTIMIZATION */
.text
.global memset
@@ -115,3 +201,4 @@ memset:
bx lr
.fnend
+#endif /* SCORPION_NEON_OPTIMIZATION */