author    Henrik Smiding <henrik.smiding@stericsson.com>  2011-01-17 16:05:41 +0100
committer Steve Kondik <shade@chemlab.org>                2013-02-15 10:02:23 -0800
commit    8c35d7eaeb67ace9a96922f16ba9e491dcde6534 (patch)
tree      da20a762a3b0d75b83c318e0f04324f5e3e5c169
parent    29af685e7a54c3196b69911106603c155bc99f83 (diff)
Add optimized version of memcpy for Cortex A9
Adds new code to the memcpy function, optimized for Cortex A9. Adds a new ARM-only loop for operations where source and destination are aligned.

Copyright (C) ST-Ericsson SA 2010

Modified the NEON implementation to fit the Cortex A9 cache line size, for cores with a 32-byte L2 cache line. Also split the implementation into aligned and unaligned access paths, for cores that allow unaligned memory access with NEON. For fully aligned operations, ARM-only code is used.

Change-Id: I95ebf6164cd6486b12a7e3e98e369db21e7e18d2
Author: Henrik Smiding <henrik.smiding@stericsson.com> for ST-Ericsson.
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
-rw-r--r--  libc/arch-arm/bionic/memcpy.S  |  214
1 file changed, 188 insertions(+), 26 deletions(-)
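
For readers skimming the assembly, the dispatch the new code performs can be summarised in C. This is an illustrative sketch only (choose_path, PATH_ARM_ALIGNED and PATH_NEON are names invented for the example, not part of the commit): buffers whose source and destination addresses agree in their low two bits take the new ARM-only ldm/stm loop, while everything else stays on the NEON path, with the destination aligned to 16 bytes first unless NEON_UNALIGNED_ACCESS is defined.

/* Illustrative sketch of the dispatch logic, not part of the commit.
 * choose_path() mirrors the "cmp r2, #16 / eor r3, r0, r1 / ands r3, r3, #0x3"
 * sequence at the top of the new code.
 */
#include <stddef.h>
#include <stdint.h>

enum copy_path { PATH_NEON, PATH_ARM_ALIGNED };

static enum copy_path choose_path(const void *dst, const void *src, size_t n)
{
    uintptr_t d = (uintptr_t)dst;
    uintptr_t s = (uintptr_t)src;

    if (n < 16)                   /* too small for the alignment preamble */
        return PATH_NEON;         /* falls through to the small-copy tail */
    if (((d ^ s) & 0x3) == 0)     /* source and destination mutually 32-bit aligned */
        return PATH_ARM_ALIGNED;  /* branch to label 11: ARM-only ldm/stm loop */
    return PATH_NEON;             /* NEON path; destination aligned to 16 bytes
                                     first unless NEON_UNALIGNED_ACCESS is set */
}
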
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 80f1bf5..ca6a8be 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -429,28 +429,35 @@ memcpy:
#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
-#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
-#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
#endif
ENTRY(memcpy)
.save {r0, lr}
/* start preloading as early as possible */
- pld [r1, #(CACHE_LINE_SIZE*0)]
+ pld [r1, #(CACHE_LINE_SIZE * 0)]
stmfd sp!, {r0, lr}
- pld [r1, #(CACHE_LINE_SIZE*1)]
+ pld [r1, #(CACHE_LINE_SIZE * 1)]
+/* If Neon supports unaligned access then remove the align code,
+ * unless a size limit has been specified.
+ */
+#ifndef NEON_UNALIGNED_ACCESS
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
+ /* check if buffers are aligned. If so, run arm-only version */
+ eor r3, r0, r1
+ ands r3, r3, #0x3
+ beq 11f
+
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
- beq 0f
+ beq 2f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
@@ -471,10 +478,9 @@ ENTRY(memcpy)
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
-
-0: /* preload immediately the next cache line, which we may need */
- pld [r1, #(CACHE_LINE_SIZE*0)]
- pld [r1, #(CACHE_LINE_SIZE*1)]
+ /* preload immediately the next cache line, which we may need */
+ pld [r1, #(CACHE_LINE_SIZE * 0)]
+ pld [r1, #(CACHE_LINE_SIZE * 1)]
#ifdef HAVE_32_BYTE_CACHE_LINE
/* make sure we have at least 32 bytes to copy */
@@ -500,23 +506,22 @@ ENTRY(memcpy)
subs r2, r2, #64
blo 2f
- /* preload all the cache lines we need.
- * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
- * ideally would would increase the distance in the main loop to
- * avoid the goofy code below. In practice this doesn't seem to make
- * a big difference.
- */
- pld [r1, #(CACHE_LINE_SIZE*2)]
- pld [r1, #(CACHE_LINE_SIZE*3)]
- pld [r1, #(PREFETCH_DISTANCE)]
+ /* preload all the cache lines we need. */
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
- pld [r1, #(PREFETCH_DISTANCE)]
+ vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d4 - d7}, [r1]!
+#ifdef HAVE_32_BYTE_CACHE_LINE
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
+#else
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
+#endif
subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0, :128]!
- vst1.8 {d4 - d7}, [r0, :128]!
+ vst1.8 {d0 - d3}, [r0, :128]!
+ vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
@@ -525,9 +530,9 @@ ENTRY(memcpy)
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
+ vst1.8 {d0 - d3}, [r0, :128]!
bhs 3b
#endif
4: /* less than 32 left */
@@ -537,7 +542,6 @@ ENTRY(memcpy)
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
-
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
@@ -556,6 +560,164 @@ ENTRY(memcpy)
ldmfd sp!, {r0, lr}
bx lr
+
+#else /* NEON_UNALIGNED_ACCESS */
+
+ // Check so divider is at least 16 bytes, needed for alignment code.
+ cmp r2, #16
+ blo 5f
+
+#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
+ /* Check the upper size limit for Neon unaligned memory access in memcpy */
+#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
+ cmp r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
+ blo 3f
+#endif
+ /* check if buffers are aligned. If so, run arm-only version */
+ eor r3, r0, r1
+ ands r3, r3, #0x3
+ beq 11f
+
+ /* align destination to 16 bytes for the write-buffer */
+ rsb r3, r0, #0
+ ands r3, r3, #0xF
+ beq 3f
+
+ /* copy up to 15-bytes (count in r3) */
+ sub r2, r2, r3
+ movs ip, r3, lsl #31
+ ldrmib lr, [r1], #1
+ strmib lr, [r0], #1
+ ldrcsb ip, [r1], #1
+ ldrcsb lr, [r1], #1
+ strcsb ip, [r0], #1
+ strcsb lr, [r0], #1
+ movs ip, r3, lsl #29
+ bge 1f
+ // copies 4 bytes, destination 32-bits aligned
+ vld1.32 {d0[0]}, [r1]!
+ vst1.32 {d0[0]}, [r0, :32]!
+1: bcc 2f
+ // copies 8 bytes, destination 64-bits aligned
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0, :64]!
+2:
+ /* preload immediately the next cache line, which we may need */
+ pld [r1, #(CACHE_LINE_SIZE * 0)]
+ pld [r1, #(CACHE_LINE_SIZE * 1)]
+3:
+#endif
+ /* make sure we have at least 64 bytes to copy */
+ subs r2, r2, #64
+ blo 2f
+
+ /* preload all the cache lines we need */
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
+
+1: /* The main loop copies 64 bytes at a time */
+ vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d4 - d7}, [r1]!
+#ifdef HAVE_32_BYTE_CACHE_LINE
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
+#else
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
+#endif
+ subs r2, r2, #64
+ vst1.8 {d0 - d3}, [r0]!
+ vst1.8 {d4 - d7}, [r0]!
+ bhs 1b
+
+2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
+ add r2, r2, #64
+ subs r2, r2, #32
+ blo 4f
+
+3: /* 32 bytes at a time. These cache lines were already preloaded */
+ vld1.8 {d0 - d3}, [r1]!
+ subs r2, r2, #32
+ vst1.8 {d0 - d3}, [r0]!
+ bhs 3b
+
+4: /* less than 32 left */
+ add r2, r2, #32
+ tst r2, #0x10
+ beq 5f
+ // copies 16 bytes, 128-bits aligned
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [r0]!
+5: /* copy up to 15-bytes (count in r2) */
+ movs ip, r2, lsl #29
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1: bge 2f
+ vld1.32 {d0[0]}, [r1]!
+ vst1.32 {d0[0]}, [r0]!
+2: movs ip, r2, lsl #31
+ ldrmib r3, [r1], #1
+ ldrcsb ip, [r1], #1
+ ldrcsb lr, [r1], #1
+ strmib r3, [r0], #1
+ strcsb ip, [r0], #1
+ strcsb lr, [r0], #1
+
+ ldmfd sp!, {r0, lr}
+ bx lr
+#endif /* NEON_UNALIGNED_ACCESS */
+11:
+ /* Simple arm-only copy loop to handle aligned copy operations */
+ stmfd sp!, {r4, r5, r6, r7, r8}
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+
+ /* Check alignment */
+ rsb r3, r1, #0
+ ands r3, #3
+ beq 2f
+
+ /* align source to 32 bits. We need to insert 2 instructions between
+ * a ldr[b|h] and str[b|h] because byte and half-word instructions
+ * stall 2 cycles.
+ */
+ movs r12, r3, lsl #31
+ sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
+ ldrmib r3, [r1], #1
+ ldrcsb r4, [r1], #1
+ ldrcsb r5, [r1], #1
+ strmib r3, [r0], #1
+ strcsb r4, [r0], #1
+ strcsb r5, [r0], #1
+2:
+ subs r2, #32
+ blt 5f
+ pld [r1, #(CACHE_LINE_SIZE * 3)]
+3: /* Main copy loop, copying 32 bytes at a time */
+ pld [r1, #(CACHE_LINE_SIZE * 4)]
+ ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
+ subs r2, r2, #32
+ stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
+ bge 3b
+5: /* Handle any remaining bytes */
+ adds r2, #32
+ beq 6f
+
+ movs r12, r2, lsl #28
+ ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
+ ldmmiia r1!, {r7, r8} /* 8 bytes */
+ stmcsia r0!, {r3, r4, r5, r6}
+ stmmiia r0!, {r7, r8}
+ movs r12, r2, lsl #30
+ ldrcs r3, [r1], #4 /* 4 bytes */
+ ldrmih r4, [r1], #2 /* 2 bytes */
+ strcs r3, [r0], #4
+ strmih r4, [r0], #2
+ tst r2, #0x1
+ ldrneb r3, [r1] /* last byte */
+ strneb r3, [r0]
+6:
+ ldmfd sp!, {r4, r5, r6, r7, r8}
+ ldmfd sp!, {r0, pc}
END(memcpy)
#endif /* CORTEX_CACHE_LINE_32 */
#endif /* !SCORPION_NEON_OPTIMIZATION */
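
A change like this is usually validated with a small alignment sweep. A hypothetical harness (not part of the commit) that checks every source/destination offset combination and a range of lengths might look like:

/* Hypothetical test harness, not part of the commit: sweeps all 16 source
 * and destination offsets and lengths 0..256, checking that the copied
 * bytes match the source and that the byte just past the copy is untouched. */
#include <assert.h>
#include <string.h>

int main(void)
{
    static unsigned char src[512], dst[512];

    for (size_t i = 0; i < sizeof(src); i++)
        src[i] = (unsigned char)(i * 131u + 7u);

    for (size_t soff = 0; soff < 16; soff++) {
        for (size_t doff = 0; doff < 16; doff++) {
            for (size_t len = 0; len <= 256; len++) {
                memset(dst, 0xAA, sizeof(dst));
                memcpy(dst + doff, src + soff, len);
                assert(memcmp(dst + doff, src + soff, len) == 0);
                assert(dst[doff + len] == 0xAA);
            }
        }
    }
    return 0;
}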