diff options
author | David 'Digit' Turner <digit@google.com> | 2009-08-26 21:50:42 +0200 |
---|---|---|
committer | David 'Digit' Turner <digit@google.com> | 2009-09-02 23:21:52 +0200 |
commit | 1bbc56cd227546cb155bb47721cdb717780a3400 (patch) | |
tree | d8fa2782b57382a9f94eb4cd51113c842d67eab7 /libc/arch-arm | |
parent | 898cc98f3d6536f7ae1b38340537edecf9a529f2 (diff) | |
download | bionic-1bbc56cd227546cb155bb47721cdb717780a3400.zip bionic-1bbc56cd227546cb155bb47721cdb717780a3400.tar.gz bionic-1bbc56cd227546cb155bb47721cdb717780a3400.tar.bz2 |
Neon-optimized versions of memcpy.
This optimization comes from the external 0xdroid repository.
Original patch can be found here:
http://gitorious.org/0xdroid/bionic/commit/ebafe41c2c02f8c09a3c1d7746047083df180ac5
Diffstat (limited to 'libc/arch-arm')
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index fcb58cd..97331d3 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -28,6 +28,109 @@
 
 #include <machine/cpu-features.h>
 
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+
+        .text
+        .fpu    neon                    @ enable the NEON vld/vst mnemonics used below
+
+        .global memcpy
+        .type   memcpy, %function
+        .align  4
+
+#define NEON_MAX_PREFETCH_DISTANCE 320  /* cap, in bytes, on the pld offset held in r3 */
+
+memcpy:                                 @ r0 = dst, r1 = src, r2 = byte count, returns with r0 untouched
+        mov     ip, r0                  @ ip = moving store pointer, r0 is not written anywhere below
+        cmp     r2, #16
+        blt     4f                      @ Have less than 16 bytes to copy (NOTE(review): signed test, a count with bit 31 set also lands here)
+
+        @ First ensure 16 byte alignment for the destination buffer
+        tst     r0, #0xF                @ are the low four address bits clear?
+        beq     2f                      @ yes: destination is already 16-byte aligned
+        tst     r0, #1                  @ ip still equals r0 here, odd address -> move one byte
+        ldrneb  r3, [r1], #1
+        strneb  r3, [ip], #1
+        subne   r2, r2, #1
+        tst     ip, #2                  @ address bit 1 set -> move two bytes
+        ldrneb  r3, [r1], #1
+        strneb  r3, [ip], #1
+        ldrneb  r3, [r1], #1
+        strneb  r3, [ip], #1
+        subne   r2, r2, #2
+
+        tst     ip, #4                  @ address bit 2 set -> move four bytes
+        beq     1f
+        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!             @ four consecutive bytes into lane 0 of d0-d3, r1 += 4
+        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!        @ same four bytes out, :32 asserts a 4-byte aligned address
+        sub     r2, r2, #4
+1:
+        tst     ip, #8                  @ address bit 3 set -> move eight bytes
+        beq     2f
+        vld1.8  {d0}, [r1]!
+        vst1.8  {d0}, [ip, :64]!        @ :64 asserts an 8-byte aligned address
+        sub     r2, r2, #8
+2:                                      @ ip is 16-byte aligned from here on
+        subs    r2, r2, #32             @ r2 = remaining - 32, this bias stays in place down to label 3
+        blt     3f                      @ fewer than 32 bytes left
+        mov     r3, #32                 @ r3 = pld offset, starts at 32 bytes
+
+        @ Main copy loop, 32 bytes are processed per iteration.
+        @ ARM instructions are used for doing fine-grained prefetch,
+        @ increasing prefetch distance progressively up to
+        @ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+        vld1.8  {d0-d3}, [r1]!          @ load 32 bytes, r1 += 32
+        cmp     r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)          @ flags are consumed by addle, pld in between leaves them alone
+        pld     [r1, r3]                @ prefetch hint r3 bytes past the read pointer
+        addle   r3, r3, #32             @ widen the offset by 32 until it reaches the cap
+        vst1.8  {d0-d3}, [ip, :128]!    @ store 32 bytes, :128 asserts a 16-byte aligned address
+        sub     r2, r2, #32
+        cmp     r2, r3
+        bge     1b                      @ stay while (remaining - 32) >= pld offset
+        cmp     r2, #0
+        blt     3f                      @ fewer than 32 bytes left, skip the drain loop
+1:      @ Copy the remaining part of the buffer (already prefetched)
+        vld1.8  {d0-d3}, [r1]!
+        subs    r2, r2, #32             @ sets the flags tested by bge, the vst1 in between does not change them
+        vst1.8  {d0-d3}, [ip, :128]!
+        bge     1b                      @ another full 32 bytes remain
+3:      @ Copy up to 31 remaining bytes
+        tst     r2, #16                 @ r2 is negative here, but only multiples of 32 were subtracted since label 2, so bits 4-0 are the leftover count
+        beq     4f
+        vld1.8  {d0, d1}, [r1]!         @ one 16-byte chunk
+        vst1.8  {d0, d1}, [ip, :128]!
+4:
+        @ Use ARM instructions exclusively for the final trailing part
+        @ not fully fitting into full 16 byte aligned block in order
+        @ to avoid "ARM store after NEON store" hazard. Also NEON
+        @ pipeline will be (mostly) flushed by the time when the
+        @ control returns to the caller, making the use of NEON mostly
+        @ transparent (and avoiding hazards in the caller code)
+
+        movs    r3, r2, lsl #29         @ C = bit 3 of r2, N = bit 2, r3 is only scratch afterwards
+        bcc     1f                      @ bit 3 clear -> no 8-byte piece
+        .rept   8                       @ unrolled: 8 single-byte copies
+        ldrcsb  r3, [r1], #1            @ cs always holds on this path, loads and stores leave N intact for bpl
+        strcsb  r3, [ip], #1
+        .endr
+1:
+        bpl     1f                      @ bit 2 clear -> no 4-byte piece
+        .rept   4                       @ unrolled: 4 single-byte copies
+        ldrmib  r3, [r1], #1
+        strmib  r3, [ip], #1
+        .endr
+1:
+        movs    r2, r2, lsl #31         @ C = bit 1 of r2, N = bit 0
+        ldrcsb  r3, [r1], #1            @ bit 1 set -> two bytes
+        strcsb  r3, [ip], #1
+        ldrcsb  r3, [r1], #1
+        strcsb  r3, [ip], #1
+        ldrmib  r3, [r1], #1            @ bit 0 set -> one byte
+        strmib  r3, [ip], #1
+        bx      lr                      @ return, r0 still holds the dst argument
+
+#else   /* __ARM_ARCH__ < 7 */
+
 	.text
 
     .global memcpy
@@ -385,3 +488,5 @@ copy_last_3_and_return:
 		bx			lr
         .fnend
 
+#endif
+ |