summaryrefslogtreecommitdiffstats
path: root/libc/arch-arm/bionic/memcpy.S
diff options
context:
space:
mode:
Diffstat (limited to 'libc/arch-arm/bionic/memcpy.S')
-rw-r--r--libc/arch-arm/bionic/memcpy.S115
1 files changed, 112 insertions, 3 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index ba55996..e92ff5e 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -29,7 +31,114 @@
#include <machine/cpu-features.h>
#if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+ .code 32
+ .align 5
+ .globl memcpy
+ .func
+memcpy:
+ push {r0}
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16
+ cmp r2, #128
+ blt .Lneon_copy_32_a
+ /* Copy blocks of 128-bytes (word-aligned) at a time*/
+ /* Code below is optimized for PLDSIZE=128 only */
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_exit
+ cmp r2, #32
+ blt .Lneon_16
+ nop
+ /* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+ mov r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_copy_32_loop_a
+ ands r2, r2, #0x1f
+ beq .Lneon_exit
+.Lneon_16:
+ subs r2, r2, #16
+ blt .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ beq .Lneon_exit
+.Lneon_lt16:
+ movs r12, r2, lsl #29
+ bcc .Lneon_skip8
+ ldr r3, [r1], #4
+ ldr r12, [r1], #4
+ str r3, [r0], #4
+ str r12, [r0], #4
+.Lneon_skip8:
+ bpl .Lneon_lt4
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+.Lneon_lt4:
+ movs r2, r2, lsl #31
+ bcc .Lneon_lt2
+ ldrh r3, [r1], #2
+ strh r3, [r0], #2
+.Lneon_lt2:
+ bpl .Lneon_exit
+ ldrb r12, [r1]
+ strb r12, [r0]
+.Lneon_exit:
+ pop {r0}
+ bx lr
+ .endfunc
+ .end
+#else /* !SCORPION_NEON_OPTIMIZATION */
.text
.fpu neon
@@ -38,7 +147,7 @@
.align 4
/* a prefetch distance of 4 cache-lines works best experimentally */
-#define CACHE_LINE_SIZE 64
+#define CACHE_LINE_SIZE 32
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
memcpy:
@@ -145,7 +254,7 @@ memcpy:
bx lr
.fnend
-
+#endif /* !SCORPION_NEON_OPTIMIZATION */
#else /* __ARM_ARCH__ < 7 */