diff options
author | Wilco Dijkstra <Wilco.Dijkstra@arm.com> | 2015-11-06 14:09:00 +0000 |
---|---|---|
committer | Jake Weinstein <xboxlover360@gmail.com> | 2015-11-19 01:15:57 +0000 |
commit | 3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4 (patch) | |
tree | 3e7811e9a3db9d2b9fa736250254bdd166e13131 | |
parent | 81bc2674dc8abe27ee20ba8bd566f73446234641 (diff) | |
download | bionic-3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4.zip bionic-3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4.tar.gz bionic-3b22fc9feba8c2c77deaa8ad58bce6d48e617ba4.tar.bz2 |
libc: AArch64: Tune memcpy
* Further tuning for performance.
Change-Id: Id08eaab885f9743fa7575077924a947c1b88e4ff
-rw-r--r-- | libc/arch-arm64/generic/bionic/memcpy_base.S | 57 |
1 files changed, 33 insertions, 24 deletions
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S index 82431a3..f850624 100644 --- a/libc/arch-arm64/generic/bionic/memcpy_base.S +++ b/libc/arch-arm64/generic/bionic/memcpy_base.S @@ -71,6 +71,7 @@ #define A_h x7 #define A_hw w7 #define B_l x8 +#define B_lw w8 #define B_h x9 #define C_l x10 #define C_h x11 @@ -93,21 +94,41 @@ well as non-overlapping copies. */ + prfm PLDL1KEEP, [src] add srcend, src, count add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) cmp count, 96 b.hi L(copy_long) - cmp count, 16 - b.hs L(copy_medium) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 /* Small copies: 0..16 bytes. */ L(copy16): - tbz count, 3, 1f + cmp count, 8 + b.lo 1f ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret + .p2align 4 1: tbz count, 2, 1f ldr A_lw, [src] @@ -115,33 +136,21 @@ L(copy16): str A_lw, [dstin] str A_hw, [dstend, -4] ret - .p2align 4 + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1: cbz count, 2f + lsr tmp1, count, 1 ldrb A_lw, [src] - tbz count, 1, 1f - ldrh A_hw, [srcend, -2] - strh A_hw, [dstend, -2] -1: strb A_lw, [dstin] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] 2: ret .p2align 4 - /* Medium copies: 17..96 bytes. */ -L(copy_medium): - ldp A_l, A_h, [src] - tbnz count, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz count, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 /* Copy 64..96 bytes. Copy 64 bytes from the start and 32 bytes from the end. */ L(copy96): |