summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Brent DeGraaf <bdegraaf@codeaurora.org>, 2013-10-02 09:47:11 -0400
committer: Steve Kondik <shade@chemlab.org>, 2013-11-13 05:41:33 -0800
commit: a46374c5f2a039662c7cb32f4d1e8f5e6a483a2d (patch)
tree: deae94faf2e2c383b0aa039b25d02676b9ccba23
parent: b59b790f97dc58a931719524499269f2f3b904f2 (diff)
download: bionic-cm-10.2.0.zip
bionic-cm-10.2.0.tar.gz
bionic-cm-10.2.0.tar.bz2
libc: krait: Re-factor high-performance memcpy for thumb2 (tags: cm-10.2.1, cm-10.2.0)
The majority of libc under bionic is built for thumb2. Refactor the high-performance memcpy used in previous builds for thumb2, including information that can be used for stack unwinding.
Change-Id: Ib5f7ab354f39313758402ec02b0aea27b15d45fa
-rw-r--r-- libc/arch-arm/krait/bionic/memcpy.S 198
-rw-r--r-- libc/arch-arm/krait/bionic/memcpy_base.S 215
2 files changed, 240 insertions, 173 deletions
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 818c3a4..aca96a8 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2013 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,185 +26,37 @@
* SUCH DAMAGE.
*/
-/* Assumes neon instructions and a cache line size of 64 bytes. */
+/* Assumes neon instructions and a cache line size of 32 bytes. */
-#include <machine/cpu-features.h>
#include <machine/asm.h>
/*
- * These can be overridden in:
- * device/<vendor>/<board>/BoardConfig.mk
- * by setting the following:
- * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
- * TARGET_USE_KRAIT_PLD_SET := true
- * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
- * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
- * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
- * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
+ * This code assumes it is running on a processor that supports all arm v7
+ * instructions, that supports neon instructions, and that has a 32 byte
+ * cache line.
*/
-#ifndef PLDOFFS
-#define PLDOFFS (10)
-#endif
-#ifndef PLDTHRESH
-#define PLDTHRESH (PLDOFFS)
-#endif
-#ifndef BBTHRESH
-#define BBTHRESH (4096/64)
-#endif
-#if (PLDOFFS < 1)
-#error Routine does not support offsets less than 1
-#endif
-#if (PLDTHRESH < PLDOFFS)
-#error PLD threshold must be greater than or equal to the PLD offset
-#endif
-#ifndef PLDSIZE
-#define PLDSIZE (64)
-#endif
-#define NOP_OPCODE (0xe320f000)
-
- .text
- .fpu neon
+ .text
+ .syntax unified
+ .fpu neon
+ .thumb
+ .thumb_func
ENTRY(memcpy)
- .save {r0, lr}
- mov r12, r0
- cmp r2, #4
- blt .Lneon_lt4
- cmp r2, #16
- blt .Lneon_lt16
- cmp r2, #32
- blt .Lneon_16
- cmp r2, #64
- blt .Lneon_copy_32_a
- stmfd sp!, {r0}
-
- mov r12, r2, lsr #6
- cmp r12, #PLDTHRESH
- ble .Lneon_copy_64_loop_nopld
-
- stmfd sp!, {r9, r10, lr}
-
- cmp r12, #BBTHRESH
- ble .Lneon_prime_pump
-
- add lr, r0, #0x400
- add r9, r1, #(PLDOFFS*PLDSIZE)
- sub lr, lr, r9
- lsl lr, lr, #21
- lsr lr, lr, #21
- add lr, lr, #(PLDOFFS*PLDSIZE)
- cmp r12, lr, lsr #6
- movle lr, #(PLDOFFS*PLDSIZE)
- ble .Lneon_prime_pump
-
- movgt r9, #(PLDOFFS)
- rsbgts r9, r9, lr, lsr #6
- ble .Lneon_prime_pump
-
- add r10, r1, lr
- bic r10, #0x3F
-
- sub r12, lr, lsr #6
- cmp r9, r12
- suble r12, r12, r9
- movgt r9, r12
- movgt r12, #0
-
- pld [r1, #((PLDOFFS-1)*PLDSIZE)]
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_copy_64_loop_outer_doublepld:
- pld [r1, #((PLDOFFS)*PLDSIZE)]
- vld1.32 {q0, q1}, [r1]!
- vld1.32 {q2, q3}, [r1]!
- ldr r3, [r10]
- subs r9, r9, #1
- vst1.32 {q0, q1}, [r0]!
- vst1.32 {q2, q3}, [r0]!
- add r10, #64
- bne .Lneon_copy_64_loop_outer_doublepld
- cmp r12, #0
- beq .Lneon_pop_before_nopld
-
- cmp r12, #(512*1024/64)
- blt .Lneon_copy_64_loop_outer
-
- .balignl 64, NOP_OPCODE, 8
-.Lneon_copy_64_loop_ddr:
- vld1.32 {q0, q1}, [r1]!
- vld1.32 {q2, q3}, [r1]!
- pld [r10]
- subs r12, r12, #1
- vst1.32 {q0, q1}, [r0]!
- vst1.32 {q2, q3}, [r0]!
- add r10, #64
- bne .Lneon_copy_64_loop_ddr
- b .Lneon_pop_before_nopld
-
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_prime_pump:
- mov lr, #(PLDOFFS*PLDSIZE)
- add r10, r1, #(PLDOFFS*PLDSIZE)
- bic r10, #0x3F
- sub r12, r12, #PLDOFFS
- ldr r3, [r10, #(-1*PLDSIZE)]
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_copy_64_loop_outer:
- vld1.32 {q0, q1}, [r1]!
- vld1.32 {q2, q3}, [r1]!
- ldr r3, [r10]
- subs r12, r12, #1
- vst1.32 {q0, q1}, [r0]!
- vst1.32 {q2, q3}, [r0]!
- add r10, #64
- bne .Lneon_copy_64_loop_outer
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_pop_before_nopld:
- mov r12, lr, lsr #6
- ldmfd sp!, {r9, r10, lr}
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_copy_64_loop_nopld:
- vld1.32 {q8, q9}, [r1]!
- vld1.32 {q10, q11}, [r1]!
- subs r12, r12, #1
- vst1.32 {q8, q9}, [r0]!
- vst1.32 {q10, q11}, [r0]!
- bne .Lneon_copy_64_loop_nopld
- ands r2, r2, #0x3f
- ldmfd sp!, {r12}
- beq .Lneon_exit
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_copy_32_a:
- movs r3, r2, lsl #27
- bcc .Lneon_16
- vld1.32 {q0,q1}, [r1]!
- vst1.32 {q0,q1}, [r0]!
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_16:
- bpl .Lneon_lt16
- vld1.32 {q8}, [r1]!
- vst1.32 {q8}, [r0]!
- ands r2, r2, #0x0f
- beq .Lneon_exit
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_lt16:
- movs r3, r2, lsl #29
- ldrcs r3, [r1], #4
- strcs r3, [r0], #4
- ldrcs r3, [r1], #4
- strcs r3, [r0], #4
- ldrmi r3, [r1], #4
- strmi r3, [r0], #4
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_lt4:
- movs r2, r2, lsl #31
- ldrcsh r3, [r1], #2
- strcsh r3, [r0], #2
- ldrmib r3, [r1]
- strmib r3, [r0]
- .balignl 64, NOP_OPCODE, 4*2
-.Lneon_exit:
- mov r0, r12
- bx lr
+ /*
+ * Entry stub only: it issues a preload and saves {r0, lr}. There is no
+ * return instruction in this stub, so execution continues with whatever
+ * is assembled immediately after END(memcpy).
+ */
+ .cfi_startproc
+ pld [r1, #64] /* preload hint 64 bytes past the address in r1 */
+ stmfd sp!, {r0, lr} /* push the incoming r0 and the return address */
+ .save {r0, lr}
+ .cfi_def_cfa_offset 8 /* accounts for the two words just pushed */
+ .cfi_rel_offset r0, 0 /* r0 is stored at sp + 0 */
+ .cfi_rel_offset lr, 4 /* lr is stored at sp + 4 */
+ .cfi_endproc
END(memcpy)
+#define MEMCPY_BASE __memcpy_base
+#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
+#include "memcpy_base.S"
+
+ .data
+error_string:
+ .string "memcpy buffer overflow"
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
new file mode 100644
index 0000000..e80e738
--- /dev/null
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -0,0 +1,215 @@
+/***************************************************************************
+ Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/* Assumes neon instructions and a cache line size of 64 bytes. */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+/*
+ * These default settings are good for all Krait-based systems
+ * as of this writing, but they can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
+ */
+
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#ifndef BBTHRESH
+#define BBTHRESH (4096/64)
+#endif
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+ .text
+ .fpu neon
+
+/*
+ * MEMCPY_BASE: NEON copy body that the memcpy entry stub runs into.
+ *
+ * On entry: r0 = destination, r1 = source, r2 = byte count, and {r0, lr}
+ * (8 bytes) are already on the stack, which is what the CFI directives at
+ * the top of the function describe.
+ * On exit: r0 is reloaded from the stack and control returns through the
+ * saved lr.
+ * Scratch: r3, r12, lr, q0-q3, q8-q11 and the flags. r9 and r10 are pushed
+ * and popped around the preloading loops.
+ */
+ENTRY(MEMCPY_BASE)
+MEMCPY_BASE_ALIGNED:
+ .cfi_startproc
+ /* NOTE(review): r9/r10 are only on the stack between the push and pop below; confirm this .save list is what the EHABI unwinder should see. */
+ .save {r0, r9, r10, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+ /* Dispatch on the byte count; the small cases jump straight into the tail code. */
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16 /* .Lneon_16 tests N, which this cmp leaves set */
+ cmp r2, #64
+ blt .Lneon_copy_32_a
+
+ /* r12 = number of whole 64-byte chunks. */
+ mov r12, r2, lsr #6
+ cmp r12, #PLDTHRESH
+ ble .Lneon_copy_64_loop_nopld
+
+ push {r9, r10}
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r9, 0
+ .cfi_rel_offset r10, 4
+
+ cmp r12, #BBTHRESH
+ ble .Lneon_prime_pump
+
+ /*
+ * lr = ((r0 + 0x400 - (r1 + PLDOFFS*PLDSIZE)) & 0x7ff) + PLDOFFS*PLDSIZE
+ * The lsl/lsr pair keeps only the low 11 bits of the difference.
+ */
+ add lr, r0, #0x400
+ add r9, r1, #(PLDOFFS*PLDSIZE)
+ sub lr, lr, r9
+ lsl lr, lr, #21
+ lsr lr, lr, #21
+ add lr, lr, #(PLDOFFS*PLDSIZE)
+ cmp r12, lr, lsr #6
+ ble .Lneon_prime_pump
+
+ /* r9 = (lr >> 6) - PLDOFFS; use the simple path if that is not positive. */
+ itt gt
+ movgt r9, #(PLDOFFS)
+ rsbsgt r9, r9, lr, lsr #6
+ ble .Lneon_prime_pump
+
+ /* r10 = 64-byte-aligned address lr bytes ahead of the source. */
+ add r10, r1, lr
+ bic r10, #0x3F
+
+ /* Hold back lr >> 6 chunks for the no-preload loop at the end. */
+ sub r12, r12, lr, lsr #6
+
+ /* Split the rest: r9 chunks for the double-preload loop, r12 for the loops after it. */
+ cmp r9, r12
+ itee le
+ suble r12, r12, r9
+ movgt r9, r12
+ movgt r12, #0
+
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)]
+.Lneon_copy_64_loop_outer_doublepld:
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10] /* loaded value is never used; presumably acts as a far prefetch */
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer_doublepld
+ cmp r12, #0
+ beq .Lneon_pop_before_nopld
+
+ /* 512*1024/64 chunks or more go to the pld-based loop, fewer to the ldr-based one. */
+ cmp r12, #(512*1024/64)
+ blt .Lneon_copy_64_loop_outer
+
+.Lneon_copy_64_loop_ddr:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ pld [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_ddr
+ b .Lneon_pop_before_nopld
+
+.Lneon_prime_pump:
+ /* Fixed look-ahead of PLDOFFS chunks; the last PLDOFFS chunks are left for the no-preload loop. */
+ mov lr, #(PLDOFFS*PLDSIZE)
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F
+ sub r12, r12, #PLDOFFS
+ ldr r3, [r10, #(-1*PLDSIZE)]
+.Lneon_copy_64_loop_outer:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer
+.Lneon_pop_before_nopld:
+ /* r12 = chunks held back (lr >> 6); lr is free to reuse because the return address is on the stack. */
+ mov r12, lr, lsr #6
+ pop {r9, r10}
+ .cfi_restore r9
+ .cfi_restore r10
+ .cfi_adjust_cfa_offset -8
+
+.Lneon_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_64_loop_nopld
+ /* Nothing is popped here: the saved {r0, lr} pair stays on the stack until .Lneon_exit, so the CFA is unchanged. */
+ ands r2, r2, #0x3f
+ beq .Lneon_exit
+.Lneon_copy_32_a:
+ /* lsl #27 moves bit 5 of the count (32) into C and bit 4 (16) into N. */
+ movs r3, r2, lsl #27
+ bcc .Lneon_16
+ vld1.32 {q0,q1}, [r1]!
+ vst1.32 {q0,q1}, [r0]!
+.Lneon_16:
+ bpl .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f
+ beq .Lneon_exit
+.Lneon_lt16:
+ /* lsl #29 moves bit 3 (8) into C and bit 2 (4) into N. */
+ movs r3, r2, lsl #29
+ itttt cs
+ ldrcs r3, [r1], #4
+ strcs r3, [r0], #4
+ ldrcs r3, [r1], #4
+ strcs r3, [r0], #4
+ itt mi
+ ldrmi r3, [r1], #4
+ strmi r3, [r0], #4
+.Lneon_lt4:
+ /* lsl #31 moves bit 1 (2) into C and bit 0 (1) into N. */
+ movs r2, r2, lsl #31
+ itt cs
+ ldrhcs r3, [r1], #2
+ strhcs r3, [r0], #2
+ itt mi
+ ldrbmi r3, [r1]
+ strbmi r3, [r0]
+.Lneon_exit:
+ /* Reload the r0 and lr values that were on the stack at entry, then return. */
+ pop {r0, lr}
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r0
+ .cfi_restore lr
+ bx lr
+ .cfi_endproc
+END(MEMCPY_BASE)
+