summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Brent DeGraaf <bdegraaf@codeaurora.org> 2013-10-02 13:47:11 +0000
committer: Steve Kondik <steve@cyngn.com> 2015-10-29 22:44:26 -0700
commit: f8a907d25a9f319e67fcf005638adb52fa09dd8b (patch)
tree: 03d23d53765fb0370976486166160e25b213525f
parent: 35188756d314056f9a706288a000c7e85c54c2e0 (diff)
download: bionic-f8a907d25a9f319e67fcf005638adb52fa09dd8b.zip
bionic-f8a907d25a9f319e67fcf005638adb52fa09dd8b.tar.gz
bionic-f8a907d25a9f319e67fcf005638adb52fa09dd8b.tar.bz2
[AOSP Master] libc: krait: Use performance version of memcpy

* This commit improves performance for small copies compared to the original CAF version. It also cleans up some functions.

Change-Id: Iaa52635240da8b8746693186b66b69778e833c32
-rw-r--r-- libc/arch-arm/krait/bionic/__strcat_chk.S | 19
-rw-r--r-- libc/arch-arm/krait/bionic/__strcpy_chk.S | 15
-rw-r--r-- libc/arch-arm/krait/bionic/memcpy.S | 17
-rw-r--r-- libc/arch-arm/krait/bionic/memcpy_base.S | 124
4 files changed, 71 insertions, 104 deletions
diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S
index 246f159..1a39c5b 100644
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -40,7 +40,7 @@
ENTRY(__strcat_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
@@ -177,7 +177,7 @@ ENTRY(__strcat_chk)
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
- bhi __strcat_chk_failed
+ bhi .L_strcat_chk_failed
// Set up the registers for the memcpy code.
mov r1, r5
@@ -185,20 +185,17 @@ ENTRY(__strcat_chk)
mov r2, r4
add r0, r0, r3
pop {r4, r5}
-END(__strcat_chk)
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
-#define MEMCPY_BASE __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcat_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
+ // Undo the above cfi directives.
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
-
+.L_strcat_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -208,7 +205,7 @@ error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcat_chk_failed)
+END(__strcat_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S
index db76686..00202f3 100644
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -39,7 +39,7 @@
ENTRY(__strcpy_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
@@ -149,21 +149,14 @@ ENTRY(__strcpy_chk)
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
- bhs __strcpy_chk_failed
+ bhs .L_strcpy_chk_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
-END(__strcpy_chk)
-#define MEMCPY_BASE __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcpy_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
+.L_strcpy_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -173,7 +166,7 @@ error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcpy_chk_failed)
+END(__strcpy_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 9ff46a8..5d27b57 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -45,7 +45,7 @@
ENTRY(__memcpy_chk)
cmp r2, r3
- bhi __memcpy_chk_fail
+ bhi .L_memcpy_chk_fail
// Fall through to memcpy...
END(__memcpy_chk)
@@ -53,19 +53,20 @@ END(__memcpy_chk)
ENTRY(memcpy)
pld [r1, #64]
stmfd sp!, {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
-END(memcpy)
-#define MEMCPY_BASE __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__memcpy_chk_fail)
+ // Undo the cfi directives from above.
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r0
+ .cfi_restore lr
+.L_memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
- .cfi_def_cfa_offset 4
+ .cfi_adjust_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
@@ -77,7 +78,7 @@ error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__memcpy_chk_fail)
+END(memcpy)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 068f2f6..76c5a84 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -30,59 +30,35 @@
#include <machine/cpu-features.h>
#include <machine/asm.h>
-/*
- * These default settings are good for all Krait-based systems
- * as of this writing, but they can be overridden in:
- * device/<vendor>/<board>/BoardConfig.mk
- * by setting the following:
- * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
- * TARGET_USE_KRAIT_PLD_SET := true
- * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
- * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
- * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
- * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
- */
-
-#ifndef PLDOFFS
#define PLDOFFS (10)
-#endif
-#ifndef PLDTHRESH
#define PLDTHRESH (PLDOFFS)
-#endif
-#ifndef BBTHRESH
#define BBTHRESH (4096/64)
-#endif
+#define PLDSIZE (64)
+
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
+
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif
-#ifndef PLDSIZE
-#define PLDSIZE (64)
-#endif
+
.text
.fpu neon
-ENTRY(MEMCPY_BASE)
-MEMCPY_BASE_ALIGNED:
- // .cfi_startproc
- .save {r0, r9, r10, lr}
- // .cfi_def_cfa_offset 8
- //.cfi_rel_offset r0, 0
- //.cfi_rel_offset lr, 4
+.L_memcpy_base:
cmp r2, #4
- blt .Lneon_lt4
+ blt .L_neon_lt4
cmp r2, #16
- blt .Lneon_lt16
+ blt .L_neon_lt16
cmp r2, #32
- blt .Lneon_16
+ blt .L_neon_16
cmp r2, #64
- blt .Lneon_copy_32_a
+ blt .L_neon_copy_32_a
mov r12, r2, lsr #6
cmp r12, #PLDTHRESH
- ble .Lneon_copy_64_loop_nopld
+ ble .L_neon_copy_64_loop_nopld
push {r9, r10}
.cfi_adjust_cfa_offset 8
@@ -90,7 +66,7 @@ MEMCPY_BASE_ALIGNED:
.cfi_rel_offset r10, 4
cmp r12, #BBTHRESH
- ble .Lneon_prime_pump
+ ble .L_neon_prime_pump
add lr, r0, #0x400
add r9, r1, #(PLDOFFS*PLDSIZE)
@@ -99,12 +75,12 @@ MEMCPY_BASE_ALIGNED:
lsr lr, lr, #21
add lr, lr, #(PLDOFFS*PLDSIZE)
cmp r12, lr, lsr #6
- ble .Lneon_prime_pump
+ ble .L_neon_prime_pump
itt gt
movgt r9, #(PLDOFFS)
rsbsgt r9, r9, lr, lsr #6
- ble .Lneon_prime_pump
+ ble .L_neon_prime_pump
add r10, r1, lr
bic r10, #0x3F
@@ -118,7 +94,7 @@ MEMCPY_BASE_ALIGNED:
movgt r12, #0
pld [r1, #((PLDOFFS-1)*PLDSIZE)]
-.Lneon_copy_64_loop_outer_doublepld:
+.L_neon_copy_64_loop_outer_doublepld:
pld [r1, #((PLDOFFS)*PLDSIZE)]
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
@@ -127,14 +103,14 @@ MEMCPY_BASE_ALIGNED:
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
- bne .Lneon_copy_64_loop_outer_doublepld
+ bne .L_neon_copy_64_loop_outer_doublepld
cmp r12, #0
- beq .Lneon_pop_before_nopld
+ beq .L_neon_pop_before_nopld
cmp r12, #(512*1024/64)
- blt .Lneon_copy_64_loop_outer
+ blt .L_neon_copy_64_loop_outer
-.Lneon_copy_64_loop_ddr:
+.L_neon_copy_64_loop_ddr:
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
pld [r10]
@@ -142,16 +118,17 @@ MEMCPY_BASE_ALIGNED:
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
- bne .Lneon_copy_64_loop_ddr
- b .Lneon_pop_before_nopld
+ bne .L_neon_copy_64_loop_ddr
+ b .L_neon_pop_before_nopld
-.Lneon_prime_pump:
+.L_neon_prime_pump:
mov lr, #(PLDOFFS*PLDSIZE)
add r10, r1, #(PLDOFFS*PLDSIZE)
bic r10, #0x3F
sub r12, r12, #PLDOFFS
ldr r3, [r10, #(-1*PLDSIZE)]
-.Lneon_copy_64_loop_outer:
+
+.L_neon_copy_64_loop_outer:
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
ldr r3, [r10]
@@ -159,47 +136,49 @@ MEMCPY_BASE_ALIGNED:
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
- bne .Lneon_copy_64_loop_outer
-.Lneon_pop_before_nopld:
+ bne .L_neon_copy_64_loop_outer
+
+.L_neon_pop_before_nopld:
mov r12, lr, lsr #6
pop {r9, r10}
+ .cfi_adjust_cfa_offset -8
.cfi_restore r9
.cfi_restore r10
- .cfi_adjust_cfa_offset -8
-.Lneon_copy_64_loop_nopld:
+.L_neon_copy_64_loop_nopld:
vld1.32 {q8, q9}, [r1]!
vld1.32 {q10, q11}, [r1]!
subs r12, r12, #1
vst1.32 {q8, q9}, [r0]!
vst1.32 {q10, q11}, [r0]!
- bne .Lneon_copy_64_loop_nopld
+ bne .L_neon_copy_64_loop_nopld
ands r2, r2, #0x3f
- .cfi_restore r0
- .cfi_adjust_cfa_offset -4
- beq .Lneon_exit
-.Lneon_copy_32_a:
+ beq .L_neon_exit
+
+.L_neon_copy_32_a:
movs r3, r2, lsl #27
- bcc .Lneon_16
+ bcc .L_neon_16
vld1.32 {q0,q1}, [r1]!
vst1.32 {q0,q1}, [r0]!
-.Lneon_16:
- bpl .Lneon_lt16
+
+.L_neon_16:
+ bpl .L_neon_lt16
vld1.32 {q8}, [r1]!
vst1.32 {q8}, [r0]!
ands r2, r2, #0x0f
- beq .Lneon_exit
-.Lneon_lt16:
+ beq .L_neon_exit
+
+.L_neon_lt16:
movs r3, r2, lsl #29
- itttt cs
- ldrcs r3, [r1], #4
- strcs r3, [r0], #4
- ldrcs r3, [r1], #4
- strcs r3, [r0], #4
- itt mi
- ldrmi r3, [r1], #4
- strmi r3, [r0], #4
-.Lneon_lt4:
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1:
+ bge .L_neon_lt4
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4:
movs r2, r2, lsl #31
itt cs
ldrhcs r3, [r1], #2
@@ -207,9 +186,6 @@ MEMCPY_BASE_ALIGNED:
itt mi
ldrbmi r3, [r1]
strbmi r3, [r0]
-.Lneon_exit:
- pop {r0, lr}
- bx lr
- //.cfi_endproc
-END(MEMCPY_BASE)
+.L_neon_exit:
+ pop {r0, pc}