summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Wilco Dijkstra <Wilco.Dijkstra@arm.com> 2015-11-06 14:09:00 +0000
committer: Jake Weinstein <xboxlover360@gmail.com> 2015-11-19 01:15:57 +0000
commit: 3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4 (patch)
tree: 3e7811e9a3db9d2b9fa736250254bdd166e13131
parent: 81bc2674dc8abe27ee20ba8bd566f73446234641 (diff)
download: bionic-3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4.zip
download: bionic-3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4.tar.gz
download: bionic-3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4.tar.bz2
libc: AArch64: Tune memcpy

* Further tuning for performance.

Change-Id: Id08eaab885f9743fa7575077924a947c1b88e4ff
-rw-r--r-- libc/arch-arm64/generic/bionic/memcpy_base.S | 57
1 file changed, 33 insertions, 24 deletions
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index 82431a3..f850624 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -71,6 +71,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -93,21 +94,41 @@
well as non-overlapping copies.
*/
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -115,33 +136,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):