author     Steve Kondik <shade@chemlab.org>  2013-02-16 15:11:05 -0800
committer  Steve Kondik <shade@chemlab.org>  2013-02-16 15:11:05 -0800
commit     44a555de45943f75e2fcb84393366e89a203a783 (patch)
tree       67824ac18d14adf0b59a116ae30c185d8766278c
parent     4732399982e2ec9feb0bdc0129dd33a7875e5c78 (diff)
Revert "Add optimized version of memcpy for Cortex A9"
* Too many weird regressions in this codepath. This reverts commit 8c35d7eaeb67ace9a96922f16ba9e491dcde6534.
-rw-r--r--  libc/arch-arm/bionic/memcpy.S  214
1 file changed, 26 insertions(+), 188 deletions(-)
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index ca6a8be..80f1bf5 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -429,35 +429,28 @@ memcpy:
#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE 32
+#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE 64
+#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
#endif
ENTRY(memcpy)
.save {r0, lr}
/* start preloading as early as possible */
- pld [r1, #(CACHE_LINE_SIZE * 0)]
+ pld [r1, #(CACHE_LINE_SIZE*0)]
stmfd sp!, {r0, lr}
- pld [r1, #(CACHE_LINE_SIZE * 1)]
+ pld [r1, #(CACHE_LINE_SIZE*1)]
-/* If Neon supports unaligned access then remove the align code,
- * unless a size limit has been specified.
- */
-#ifndef NEON_UNALIGNED_ACCESS
/* do we have at least 16-bytes to copy (needed for alignment below) */
cmp r2, #16
blo 5f
- /* check if buffers are aligned. If so, run arm-only version */
- eor r3, r0, r1
- ands r3, r3, #0x3
- beq 11f
-
/* align destination to cache-line for the write-buffer */
rsb r3, r0, #0
ands r3, r3, #0xF
- beq 2f
+ beq 0f
/* copy up to 15-bytes (count in r3) */
sub r2, r2, r3
@@ -478,9 +471,10 @@ ENTRY(memcpy)
vld1.8 {d0}, [r1]!
vst1.8 {d0}, [r0, :64]!
2:
- /* preload immediately the next cache line, which we may need */
- pld [r1, #(CACHE_LINE_SIZE * 0)]
- pld [r1, #(CACHE_LINE_SIZE * 1)]
+
+0: /* preload immediately the next cache line, which we may need */
+ pld [r1, #(CACHE_LINE_SIZE*0)]
+ pld [r1, #(CACHE_LINE_SIZE*1)]
#ifdef HAVE_32_BYTE_CACHE_LINE
/* make sure we have at least 32 bytes to copy */
@@ -506,22 +500,23 @@ ENTRY(memcpy)
subs r2, r2, #64
blo 2f
- /* preload all the cache lines we need. */
- pld [r1, #(CACHE_LINE_SIZE * 2)]
- pld [r1, #(CACHE_LINE_SIZE * 3)]
+ /* preload all the cache lines we need.
+ * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
+ * ideally we would increase the distance in the main loop to
+ * avoid the goofy code below. In practice this doesn't seem to make
+ * a big difference.
+ */
+ pld [r1, #(CACHE_LINE_SIZE*2)]
+ pld [r1, #(CACHE_LINE_SIZE*3)]
+ pld [r1, #(PREFETCH_DISTANCE)]
1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
-#ifdef HAVE_32_BYTE_CACHE_LINE
- pld [r1, #(CACHE_LINE_SIZE * 2)]
- pld [r1, #(CACHE_LINE_SIZE * 3)]
-#else
- pld [r1, #(CACHE_LINE_SIZE * 3)]
-#endif
+ vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d4 - d7}, [r1]!
+ pld [r1, #(PREFETCH_DISTANCE)]
subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0, :128]!
- vst1.8 {d4 - d7}, [r0, :128]!
+ vst1.8 {d0 - d3}, [r0, :128]!
+ vst1.8 {d4 - d7}, [r0, :128]!
bhs 1b
2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
@@ -530,9 +525,9 @@ ENTRY(memcpy)
blo 4f
3: /* 32 bytes at a time. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d0 - d3}, [r1]!
subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
+ vst1.8 {d0 - d3}, [r0, :128]!
bhs 3b
#endif
4: /* less than 32 left */
@@ -542,6 +537,7 @@ ENTRY(memcpy)
// copies 16 bytes, 128-bits aligned
vld1.8 {d0, d1}, [r1]!
vst1.8 {d0, d1}, [r0, :128]!
+
5: /* copy up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
@@ -560,164 +556,6 @@ ENTRY(memcpy)
ldmfd sp!, {r0, lr}
bx lr
-
-#else /* NEON_UNALIGNED_ACCESS */
-
- // Check so divider is at least 16 bytes, needed for alignment code.
- cmp r2, #16
- blo 5f
-
-#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
- /* Check the upper size limit for Neon unaligned memory access in memcpy */
-#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
- cmp r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
- blo 3f
-#endif
- /* check if buffers are aligned. If so, run arm-only version */
- eor r3, r0, r1
- ands r3, r3, #0x3
- beq 11f
-
- /* align destination to 16 bytes for the write-buffer */
- rsb r3, r0, #0
- ands r3, r3, #0xF
- beq 3f
-
- /* copy up to 15-bytes (count in r3) */
- sub r2, r2, r3
- movs ip, r3, lsl #31
- ldrmib lr, [r1], #1
- strmib lr, [r0], #1
- ldrcsb ip, [r1], #1
- ldrcsb lr, [r1], #1
- strcsb ip, [r0], #1
- strcsb lr, [r0], #1
- movs ip, r3, lsl #29
- bge 1f
- // copies 4 bytes, destination 32-bits aligned
- vld1.32 {d0[0]}, [r1]!
- vst1.32 {d0[0]}, [r0, :32]!
-1: bcc 2f
- // copies 8 bytes, destination 64-bits aligned
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0, :64]!
-2:
- /* preload immediately the next cache line, which we may need */
- pld [r1, #(CACHE_LINE_SIZE * 0)]
- pld [r1, #(CACHE_LINE_SIZE * 1)]
-3:
-#endif
- /* make sure we have at least 64 bytes to copy */
- subs r2, r2, #64
- blo 2f
-
- /* preload all the cache lines we need */
- pld [r1, #(CACHE_LINE_SIZE * 2)]
- pld [r1, #(CACHE_LINE_SIZE * 3)]
-
-1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
-#ifdef HAVE_32_BYTE_CACHE_LINE
- pld [r1, #(CACHE_LINE_SIZE * 2)]
- pld [r1, #(CACHE_LINE_SIZE * 3)]
-#else
- pld [r1, #(CACHE_LINE_SIZE * 3)]
-#endif
- subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0]!
- vst1.8 {d4 - d7}, [r0]!
- bhs 1b
-
-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
- add r2, r2, #64
- subs r2, r2, #32
- blo 4f
-
-3: /* 32 bytes at a time. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
- subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0]!
- bhs 3b
-
-4: /* less than 32 left */
- add r2, r2, #32
- tst r2, #0x10
- beq 5f
- // copies 16 bytes, 128-bits aligned
- vld1.8 {d0, d1}, [r1]!
- vst1.8 {d0, d1}, [r0]!
-5: /* copy up to 15-bytes (count in r2) */
- movs ip, r2, lsl #29
- bcc 1f
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0]!
-1: bge 2f
- vld1.32 {d0[0]}, [r1]!
- vst1.32 {d0[0]}, [r0]!
-2: movs ip, r2, lsl #31
- ldrmib r3, [r1], #1
- ldrcsb ip, [r1], #1
- ldrcsb lr, [r1], #1
- strmib r3, [r0], #1
- strcsb ip, [r0], #1
- strcsb lr, [r0], #1
-
- ldmfd sp!, {r0, lr}
- bx lr
-#endif /* NEON_UNALIGNED_ACCESS */
-11:
- /* Simple arm-only copy loop to handle aligned copy operations */
- stmfd sp!, {r4, r5, r6, r7, r8}
- pld [r1, #(CACHE_LINE_SIZE * 2)]
-
- /* Check alignment */
- rsb r3, r1, #0
- ands r3, #3
- beq 2f
-
- /* align source to 32 bits. We need to insert 2 instructions between
- * a ldr[b|h] and str[b|h] because byte and half-word instructions
- * stall 2 cycles.
- */
- movs r12, r3, lsl #31
- sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
- ldrmib r3, [r1], #1
- ldrcsb r4, [r1], #1
- ldrcsb r5, [r1], #1
- strmib r3, [r0], #1
- strcsb r4, [r0], #1
- strcsb r5, [r0], #1
-2:
- subs r2, #32
- blt 5f
- pld [r1, #(CACHE_LINE_SIZE * 3)]
-3: /* Main copy loop, copying 32 bytes at a time */
- pld [r1, #(CACHE_LINE_SIZE * 4)]
- ldmia r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
- subs r2, r2, #32
- stmia r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
- bge 3b
-5: /* Handle any remaining bytes */
- adds r2, #32
- beq 6f
-
- movs r12, r2, lsl #28
- ldmcsia r1!, {r3, r4, r5, r6} /* 16 bytes */
- ldmmiia r1!, {r7, r8} /* 8 bytes */
- stmcsia r0!, {r3, r4, r5, r6}
- stmmiia r0!, {r7, r8}
- movs r12, r2, lsl #30
- ldrcs r3, [r1], #4 /* 4 bytes */
- ldrmih r4, [r1], #2 /* 2 bytes */
- strcs r3, [r0], #4
- strmih r4, [r0], #2
- tst r2, #0x1
- ldrneb r3, [r1] /* last byte */
- strneb r3, [r0]
-6:
- ldmfd sp!, {r4, r5, r6, r7, r8}
- ldmfd sp!, {r0, pc}
END(memcpy)
#endif /* CORTEX_CACHE_LINE_32 */
#endif /* !SCORPION_NEON_OPTIMIZATION */
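
For readers skimming the revert: the version restored above keeps the plain prefetch-ahead structure, where the main loop moves 64 bytes per iteration with NEON loads/stores and issues a pld for a cache line PREFETCH_DISTANCE bytes ahead of the current read pointer. Below is a minimal C sketch of that pattern; it is illustrative only and not the bionic implementation. The block size, the PREFETCH_DISTANCE value, and the use of __builtin_prefetch()/memcpy() are assumptions standing in for the pld instructions and the vld1.8/vst1.8 pairs.

#include <stddef.h>
#include <string.h>

#define SKETCH_CACHE_LINE_SIZE   64
#define SKETCH_PREFETCH_DISTANCE (SKETCH_CACHE_LINE_SIZE * 4)

/* Illustrative sketch: copy 64 bytes per iteration while prefetching ahead,
 * mirroring the structure of the restored assembly loop. memcpy() stands in
 * for the vld1.8/vst1.8 pairs and __builtin_prefetch() for pld. */
void copy64_prefetch_sketch(unsigned char *dst, const unsigned char *src, size_t n)
{
    while (n >= 64) {
        /* hint a cache line PREFETCH_DISTANCE bytes ahead into cache before it is read */
        __builtin_prefetch(src + SKETCH_PREFETCH_DISTANCE);
        memcpy(dst, src, 64);
        dst += 64;
        src += 64;
        n   -= 64;
    }
    /* tail: the assembly finishes the remaining <64 bytes with 32/16/8/4/2/1-byte copies */
    if (n)
        memcpy(dst, src, n);
}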