Neon-optimized versions of memcpy.

This optimization comes from the external 0xdroid repository. The original patch can be found here: http://gitorious.org/0xdroid/bionic/commit/ebafe41c2c02f8c09a3c1d7746047083df180ac5
author: David 'Digit' Turner <digit@google.com> 2009-08-26 21:50:42 +0200
committer: David 'Digit' Turner <digit@google.com> 2009-09-02 23:21:52 +0200
commit: 1bbc56cd227546cb155bb47721cdb717780a3400 (patch)
tree: d8fa2782b57382a9f94eb4cd51113c842d67eab7 /libc/arch-arm
parent: 898cc98f3d6536f7ae1b38340537edecf9a529f2 (diff)
download: bionic-1bbc56cd227546cb155bb47721cdb717780a3400.zip
bionic-1bbc56cd227546cb155bb47721cdb717780a3400.tar.gz
bionic-1bbc56cd227546cb155bb47721cdb717780a3400.tar.bz2
1 files changed, 105 insertions, 0 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index fcb58cd..97331d3 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -28,6 +28,109 @@
 
 #include <machine/cpu-features.h>
 
+#if defined(__ARM_NEON__)	/* NEON is optional on ARMv7-A, so key on the NEON macro, not on the arch level */
+
+		.text
+		.fpu    neon	@ let the assembler accept the NEON (vld/vst) mnemonics below
+
+		.global memcpy
+		.type memcpy, %function
+		.align 4	@ ARM gas takes a power of two here: 16-byte entry alignment
+
+#define NEON_MAX_PREFETCH_DISTANCE 320
+		@ void *memcpy(void *dst, const void *src, size_t n): in r0=dst, r1=src, r2=n; out r0=dst
+memcpy:		@ clobbers r1-r3, ip, d0-d3 and the flags; r0 is never written
+		mov	ip, r0	@ ip = running destination pointer, so r0 stays intact as the return value
+		cmp	r2, #16	@ NOTE(review): signed test, a length with bit 31 set also takes the short path - confirm callers
+		blt     4f	@ Have less than 16 bytes to copy
+
+		@ First ensure 16 byte alignment for the destination buffer
+		tst	r0, #0xF
+		beq	2f	@ destination is already 16-byte aligned
+		tst	r0, #1
+		ldrneb	r3, [r1], #1	@ odd destination address: copy 1 byte
+		strneb	r3, [ip], #1
+		subne	r2, r2, #1
+		tst	ip, #2
+		ldrneb	r3, [r1], #1	@ destination not 4-byte aligned: copy 2 bytes
+		strneb	r3, [ip], #1
+		ldrneb	r3, [r1], #1
+		strneb	r3, [ip], #1
+		subne	r2, r2, #2
+
+		tst	ip, #4
+		beq	1f
+		vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!	@ 4 bytes, one into lane 0 of each of d0-d3
+		vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
+		sub	r2, r2, #4
+1:
+		tst	ip, #8
+		beq	2f
+		vld1.8	{d0}, [r1]!	@ 8 bytes
+		vst1.8	{d0}, [ip, :64]!
+		sub	r2, r2, #8
+2:
+		subs	r2, r2, #32	@ from here on r2 = bytes left minus 32
+		blt	3f	@ fewer than 32 bytes left
+		mov	r3, #32	@ r3 = current prefetch distance in bytes
+
+		@ Main copy loop, 32 bytes are processed per iteration.
+		@ ARM instructions are used for doing fine-grained prefetch,
+		@ increasing prefetch distance progressively up to
+		@ NEON_MAX_PREFETCH_DISTANCE at runtime
+1:
+		vld1.8	{d0-d3}, [r1]!
+		cmp	r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
+		pld	[r1, r3]	@ pld leaves the flags alone, so addle still sees the cmp above
+		addle	r3, r3, #32	@ grow the distance by 32 until it reaches the maximum
+		vst1.8	{d0-d3}, [ip, :128]!
+		sub	r2, r2, #32
+		cmp	r2, r3
+		bge	1b	@ keep prefetching while r2 is at least the prefetch distance
+		cmp	r2, #0
+		blt	3f	@ fewer than 32 bytes left
+1:		@ Copy the remaining part of the buffer (already prefetched)
+		vld1.8	{d0-d3}, [r1]!
+		subs	r2, r2, #32
+		vst1.8	{d0-d3}, [ip, :128]!
+		bge	1b
+3:		@ Copy up to 31 remaining bytes
+		tst	r2, #16	@ the -32 bias leaves bits 0-4 of the remaining count intact
+		beq	4f
+		vld1.8	{d0, d1}, [r1]!
+		vst1.8	{d0, d1}, [ip, :128]!
+4:
+		@ Use ARM instructions exclusively for the final trailing part
+		@ not fully fitting into full 16 byte aligned block in order
+		@ to avoid "ARM store after NEON store" hazard. Also NEON
+		@ pipeline will be (mostly) flushed by the time when the
+		@ control returns to the caller, making the use of NEON mostly
+		@ transparent (and avoiding hazards in the caller code)
+
+		movs	r3, r2, lsl #29	@ C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes)
+		bcc	1f
+	.rept	8
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+	.endr
+1:
+		bpl	1f	@ N is still valid: byte loads and stores do not touch the flags
+	.rept	4
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+	.endr
+1:
+		movs	r2, r2, lsl #31	@ C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte)
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrcsb	r3, [r1], #1
+		strcsb	r3, [ip], #1
+		ldrmib	r3, [r1], #1
+		strmib	r3, [ip], #1
+		bx	lr
+
+#else	/* !defined(__ARM_NEON__) */
+
 	.text
 
     .global memcpy
@@ -385,3 +488,5 @@ copy_last_3_and_return:
 		bx			lr
         .fnend
 
+#endif
+
author	David 'Digit' Turner <digit@google.com>	2009-08-26 21:50:42 +0200
committer	David 'Digit' Turner <digit@google.com>	2009-09-02 23:21:52 +0200
commit	1bbc56cd227546cb155bb47721cdb717780a3400 (patch)
tree	d8fa2782b57382a9f94eb4cd51113c842d67eab7 /libc/arch-arm
parent	898cc98f3d6536f7ae1b38340537edecf9a529f2 (diff)
download	bionic-1bbc56cd227546cb155bb47721cdb717780a3400.zip bionic-1bbc56cd227546cb155bb47721cdb717780a3400.tar.gz bionic-1bbc56cd227546cb155bb47721cdb717780a3400.tar.bz2