-rw-r--r--  libc/arch-arm/krait/bionic/memcpy.S  280
1 file changed, 172 insertions(+), 108 deletions(-)
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 0cd4d44..818c3a4 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -26,121 +26,185 @@
* SUCH DAMAGE.
*/
-/* Assumes neon instructions and a cache line size of 32 bytes. */
+/* Assumes neon instructions and a cache line size of 64 bytes. */
#include <machine/cpu-features.h>
#include <machine/asm.h>
/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 32 byte
- * cache line.
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
*/
- .text
- .fpu neon
-
-#define CACHE_LINE_SIZE 32
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#ifndef BBTHRESH
+#define BBTHRESH (4096/64)
+#endif
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
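
All three tunables are counted in 64-byte cache lines: the routine converts the byte count to a block count with a right shift by 6, skips prefetching entirely at or below PLDTHRESH blocks, and only does the boundary bookkeeping above BBTHRESH = 4096/64 = 64 blocks (4 KB). A rough C model of that dispatch, with illustrative names that are not part of the commit:

    /* Illustrative C model of the size dispatch below; not part of the commit. */
    #define PLDOFFS   10          /* prefetch distance, in 64-byte lines */
    #define PLDSIZE   64          /* bytes covered by one pld */
    #define PLDTHRESH PLDOFFS     /* at or below this many blocks: no pld */
    #define BBTHRESH  (4096 / 64) /* 64 blocks == 4 KB */

    static void dispatch(unsigned long len)
    {
        unsigned long blocks = len >> 6;      /* 64-byte blocks, as in lsr #6 */
        if (blocks <= PLDTHRESH) {
            /* .Lneon_copy_64_loop_nopld: plain copy loop, no prefetch */
        } else if (blocks <= BBTHRESH) {
            /* .Lneon_prime_pump: start the pld pipeline, then the main loop */
        } else {
            /* boundary math first, then the double-pld / DDR loop variants */
        }
    }
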
+ .text
+ .fpu neon
ENTRY(memcpy)
- .save {r0, lr}
- /* start preloading as early as possible */
- pld [r1, #(CACHE_LINE_SIZE*0)]
- stmfd sp!, {r0, lr}
- pld [r1, #(CACHE_LINE_SIZE*2)]
-
- /* do we have at least 16-bytes to copy (needed for alignment below) */
- cmp r2, #16
- blo 5f
-
- /* align destination to cache-line for the write-buffer */
- rsb r3, r0, #0
- ands r3, r3, #0xF
- beq 0f
-
- /* copy up to 15-bytes (count in r3) */
- sub r2, r2, r3
- movs ip, r3, lsl #31
- ldrmib lr, [r1], #1
- strmib lr, [r0], #1
- ldrcsb ip, [r1], #1
- ldrcsb lr, [r1], #1
- strcsb ip, [r0], #1
- strcsb lr, [r0], #1
- movs ip, r3, lsl #29
- bge 1f
- // copies 4 bytes, destination 32-bits aligned
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1: bcc 2f
- // copies 8 bytes, destination 64-bits aligned
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0, :64]!
-2:
-
-0: /* preload immediately the next cache line, which we may need */
- pld [r1, #(CACHE_LINE_SIZE*0)]
- pld [r1, #(CACHE_LINE_SIZE*2)]
-
- /* make sure we have at least 64 bytes to copy */
- subs r2, r2, #64
- blo 2f
-
- /* Preload all the cache lines we need.
- * NOTE: The number of pld below depends on CACHE_LINE_SIZE,
- * ideally we would increase the distance in the main loop to
- * avoid the goofy code below. In practice this doesn't seem to make
- * a big difference.
- * NOTE: The value CACHE_LINE_SIZE * 8 was chosen through
- * experimentation.
- */
- pld [r1, #(CACHE_LINE_SIZE*4)]
- pld [r1, #(CACHE_LINE_SIZE*6)]
- pld [r1, #(CACHE_LINE_SIZE*8)]
-
-1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
- pld [r1, #(CACHE_LINE_SIZE*8)]
- subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0, :128]!
- vst1.8 {d4 - d7}, [r0, :128]!
- bhs 1b
-
-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
- add r2, r2, #64
- subs r2, r2, #32
- blo 4f
-
-3: /* 32 bytes at a time. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
- subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
- bhs 3b
-4: /* less than 32 left */
- add r2, r2, #32
- tst r2, #0x10
- beq 5f
- // copies 16 bytes, 128-bits aligned
- vld1.8 {d0, d1}, [r1]!
- vst1.8 {d0, d1}, [r0, :128]!
-
-5: /* copy up to 15-bytes (count in r2) */
- movs ip, r2, lsl #29
- bcc 1f
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0]!
-1: bge 2f
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2: movs ip, r2, lsl #31
- ldrmib r3, [r1], #1
- ldrcsb ip, [r1], #1
- ldrcsb lr, [r1], #1
- strmib r3, [r0], #1
- strcsb ip, [r0], #1
- strcsb lr, [r0], #1
-
- ldmfd sp!, {r0, lr}
- bx lr
+ .save {r0, lr}
+ mov r12, r0 @ stash dst for the return value
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16
+ cmp r2, #64
+ blt .Lneon_copy_32_a
+ stmfd sp!, {r0}
+
+ mov r12, r2, lsr #6 @ copy size in 64-byte blocks
+ cmp r12, #PLDTHRESH
+ ble .Lneon_copy_64_loop_nopld
+
+ stmfd sp!, {r9, r10, lr}
+
+ cmp r12, #BBTHRESH
+ ble .Lneon_prime_pump
+
+ add lr, r0, #0x400 @ dst + 1KB
+ add r9, r1, #(PLDOFFS*PLDSIZE) @ first pld target
+ sub lr, lr, r9
+ lsl lr, lr, #21
+ lsr lr, lr, #21 @ keep low 21 bits: distance mod 2MB
+ add lr, lr, #(PLDOFFS*PLDSIZE)
+ cmp r12, lr, lsr #6
+ movle lr, #(PLDOFFS*PLDSIZE)
+ ble .Lneon_prime_pump
+
+ movgt r9, #(PLDOFFS)
+ rsbgts r9, r9, lr, lsr #6
+ ble .Lneon_prime_pump
+
+ add r10, r1, lr
+ bic r10, #0x3F
+
+ sub r12, r12, lr, lsr #6 @ blocks remaining past the boundary point
+ cmp r9, r12
+ suble r12, r12, r9
+ movgt r9, r12
+ movgt r12, #0
+
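The lsl #21/lsr #21 pair keeps the low 21 bits of the pointer difference, i.e. the distance is taken modulo 2 MB, presumably the aliasing granule this routine was tuned for. A hypothetical C rendering of the whole block, reusing the macros from the sketch near the top (variable names are mine; the accounting matches the suble/movgt sequence above):

    /* Hypothetical model of the boundary math above; not part of the commit.
     * Addresses are modeled as plain integers. */
    static void split_counts(unsigned long src, unsigned long dst,
                             unsigned long blocks /* r12: total 64B blocks */)
    {
        unsigned long pld_start = src + PLDOFFS * PLDSIZE;          /* r9 */
        unsigned long dist = (dst + 0x400 - pld_start) & 0x1FFFFF;  /* mod 2MB */
        dist += PLDOFFS * PLDSIZE;                                  /* lr */
        unsigned long D = dist >> 6;     /* blocks up to the aliased point */
        if (blocks <= D || D <= PLDOFFS)
            return;                      /* take the ordinary prime-pump path */
        unsigned long dbl  = D - PLDOFFS;   /* r9: double-pld iterations */
        unsigned long rest = blocks - D;    /* r12 after the sub */
        if (dbl <= rest) {
            rest -= dbl;                    /* suble r12, r12, r9 */
        } else {
            dbl = rest;                     /* movgt r9, r12 */
            rest = 0;                       /* movgt r12, #0 */
        }
        /* dbl + rest + D == blocks: double-pld, then single-pld, then no-pld */
        (void)dbl; (void)rest;
    }
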
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer_doublepld:
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer_doublepld
+ cmp r12, #0
+ beq .Lneon_pop_before_nopld
+
+ cmp r12, #(512*1024/64)
+ blt .Lneon_copy_64_loop_outer
+
+ .balignl 64, NOP_OPCODE, 8
+.Lneon_copy_64_loop_ddr:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ pld [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_ddr
+ b .Lneon_pop_before_nopld
+
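512*1024/64 is 8192 blocks, so the pld-based .Lneon_copy_64_loop_ddr only runs while at least 512 KB remains; smaller remainders fall back to .Lneon_copy_64_loop_outer, which touches the look-ahead line with an actual ldr instead of a pld. As a one-liner in C (illustrative only):

    /* Illustrative: the DDR-tuned loop is only used for huge remainders. */
    static int use_ddr_loop(unsigned long rest_blocks)
    {
        return rest_blocks >= (512 * 1024) / 64;   /* >= 512 KB still to copy */
    }
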
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_prime_pump:
+ mov lr, #(PLDOFFS*PLDSIZE)
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F
+ sub r12, r12, #PLDOFFS
+ ldr r3, [r10, #(-1*PLDSIZE)]
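
Note the bookkeeping: lr stays at PLDOFFS*PLDSIZE, so when .Lneon_pop_before_nopld later reloads r12 with lr, lsr #6 it gets exactly PLDOFFS, the number of trailing blocks whose lines the pld stream has already touched; those finish in the no-pld loop. A sketch in C, reusing the macros above (illustrative only):

    /* Illustrative model of .Lneon_prime_pump; not part of the commit. */
    static unsigned long prime_pump(unsigned long src, unsigned long blocks)
    {
        unsigned long ahead = (src + PLDOFFS * PLDSIZE) & ~0x3FUL;    /* r10 */
        (void)*(volatile unsigned char *)(ahead - PLDSIZE);  /* warm-up ldr */
        return blocks - PLDOFFS;  /* main-loop count; tail adds PLDOFFS back */
    }
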
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_pop_before_nopld:
+ mov r12, lr, lsr #6 @ trailing blocks already prefetched
+ ldmfd sp!, {r9, r10, lr}
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ ldmfd sp!, {r12} @ pop saved dst for the return value
+ beq .Lneon_exit
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_32_a:
+ movs r3, r2, lsl #27
+ bcc .Lneon_16
+ vld1.32 {q0,q1}, [r1]!
+ vst1.32 {q0,q1}, [r0]!
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_16:
+ bpl .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f
+ beq .Lneon_exit
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt16:
+ movs r3, r2, lsl #29
+ ldrcs r3, [r1], #4
+ strcs r3, [r0], #4
+ ldrcs r3, [r1], #4
+ strcs r3, [r0], #4
+ ldrmi r3, [r1], #4
+ strmi r3, [r0], #4
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt4:
+ movs r2, r2, lsl #31
+ ldrcsh r3, [r1], #2
+ strcsh r3, [r0], #2
+ ldrmib r3, [r1]
+ strmib r3, [r0]
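
The tails all use the same flag trick: movs rX, r2, lsl #k shifts one size bit into the carry flag (the last bit shifted out) and the next into the sign bit, so a ldrcs/ldrmi pair tests two length bits without extra compares; lsl #27 covers 32/16 bytes, lsl #29 covers 8/4, and lsl #31 covers 2/1. The C equivalent (illustrative, not the commit's code):

    /* Illustrative C equivalent of the flag-based tails. */
    #include <string.h>

    static void tail_copy(unsigned char *d, const unsigned char *s,
                          unsigned int len /* < 16 in .Lneon_lt16 */)
    {
        if (len & 8) { memcpy(d, s, 8); d += 8; s += 8; } /* carry, lsl #29 */
        if (len & 4) { memcpy(d, s, 4); d += 4; s += 4; } /* sign,  lsl #29 */
        if (len & 2) { memcpy(d, s, 2); d += 2; s += 2; } /* carry, lsl #31 */
        if (len & 1) { *d = *s; }                         /* sign,  lsl #31 */
    }
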
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_exit:
+ mov r0, r12
+ bx lr
END(memcpy)
+
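For a change like this, a quick host-side sanity sweep across every length bucket and a few misalignments catches most tail and dispatch bugs. An illustrative harness, not part of the commit:

    /* Illustrative memcpy sweep across the 4/16/32/64-byte buckets and the
     * pld thresholds; not part of the commit. */
    #include <assert.h>
    #include <string.h>

    int main(void)
    {
        static unsigned char src[1 << 20], dst[1 << 20];
        static const unsigned long lens[] = { 0, 1, 3, 4, 15, 16, 31, 32, 63,
                                              64, 10 * 64, 4096, 4097,
                                              512 * 1024 };
        for (unsigned i = 0; i < sizeof(src); i++)
            src[i] = (unsigned char)(i * 131u + 7u);
        for (unsigned i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
            for (unsigned mis = 0; mis < 4; mis++) {
                memset(dst, 0, sizeof(dst));
                memcpy(dst + mis, src + mis, lens[i]);
                assert(memcmp(dst + mis, src + mis, lens[i]) == 0);
            }
        return 0;
    }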