summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarshad Bhutada <hbhutada@codeaurora.org>2010-11-12 16:45:27 +0530
committerHarshad Bhutada <hbhutada@codeaurora.org>2010-11-15 11:43:59 +0530
commit6c5c6849ec677d7683d5f34d80e94256ce47e12c (patch)
treed2dbee9f11bac91fd83c03148534ce4766e42e5f
parentd5977d3a4afaeed38c5cd70e41e608145312c7d3 (diff)
downloadbionic-6c5c6849ec677d7683d5f34d80e94256ce47e12c.zip
bionic-6c5c6849ec677d7683d5f34d80e94256ce47e12c.tar.gz
bionic-6c5c6849ec677d7683d5f34d80e94256ce47e12c.tar.bz2
Added memcpy version to be used for 8650A. Q8650DTBCANLYA206020 Q8650DTBCANLYA206015 Q8650DTBCANLYA206010
This will resolve the issue of drop in performance at 1.3 GHz. CRs-fixed:262696 Change-Id: I04df3298914f94ebb5c5859a0131d41b616c6d98
-rw-r--r--libc/Android.mk7
-rw-r--r--libc/arch-arm/bionic/memcpy_8650A.S136
2 files changed, 142 insertions, 1 deletions
diff --git a/libc/Android.mk b/libc/Android.mk
index d2e5e1f..1890332 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -306,7 +306,6 @@ libc_common_src_files += \
arch-arm/bionic/tkill.S \
arch-arm/bionic/memcmp.S \
arch-arm/bionic/memcmp16.S \
- arch-arm/bionic/memcpy.S \
arch-arm/bionic/memset.S \
arch-arm/bionic/setjmp.S \
arch-arm/bionic/sigsetjmp.S \
@@ -315,6 +314,12 @@ libc_common_src_files += \
string/memmove.c.arm \
unistd/socketcalls.c
+# Build the stock ARM memcpy unless the target explicitly asks for the
+# Scorpion (8650A) tuned implementation.
+ifneq ($(TARGET_USES_OPTIMIZED_MEMCPY_FOR_SCORPION),true)
+libc_common_src_files += arch-arm/bionic/memcpy.S
+else
+libc_common_src_files += arch-arm/bionic/memcpy_8650A.S
+endif
+
# These files need to be arm so that gdbserver
# can set breakpoints in them without messing
# up any thumb code.
diff --git a/libc/arch-arm/bionic/memcpy_8650A.S b/libc/arch-arm/bionic/memcpy_8650A.S
new file mode 100644
index 0000000..69b4885
--- /dev/null
+++ b/libc/arch-arm/bionic/memcpy_8650A.S
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/***************************************************************************
+ Neon memcpy: Attempts to do a memcpy with Neon registers if possible.
+
+ C prototype: void *memcpy(void *dest, const void *src, size_t n)
+
+ Inputs:
+    r0 = dest: The destination buffer
+    r1 = src:  The source buffer
+    r2 = n:    The size of the buffer to transfer, in bytes (unsigned)
+ Outputs:
+    r0 = dest (pushed on entry and popped again on exit)
+ Clobbers:
+    r1, r2, r3, r12, q0-q3, q8-q11, condition flags
+
+ Strategy:
+    n >= 128 : 128-byte blocks through q0-q3/q8-q11. While more than
+               PLDOFFS blocks remain, the data PLDOFFS*PLDSIZE bytes
+               ahead of src is preloaded with pld; the last PLDOFFS
+               blocks are copied without preloading.
+    n >= 32  : 32-byte blocks through q0/q1.
+    n >= 16  : one 16-byte block through q8.
+    n <  16  : bits 3..0 of the remaining count select an 8-, 4-, 2- and
+               1-byte copy through core registers.
+
+ n is a size_t, so every size test uses unsigned conditions (blo/bls).
+ Signed conditions (blt/ble) would treat n >= 0x80000000 as negative and
+ silently copy only n & 3 bytes.
+
+ NOTE(review): no alignment checks are made; the ldr/ldrh/str/strh tail
+ presumably relies on unaligned accesses being permitted on the target --
+ confirm.
+***************************************************************************/
+
+#define PLDOFFS (3)     /* For 8650a (set to 6 for 8660) */
+#define PLDSIZE (128)   /* L2 cache line size */
+
+        .code 32
+        .align 5
+        .globl memcpy
+        .type   memcpy, %function           /* export the symbol typed as a function */
+        .func
+
+memcpy:
+        push    {r0}                        /* saved so dest can be returned */
+        cmp     r2, #4
+        blo     neon_lt4                    /* n in 0..3 */
+        cmp     r2, #16
+        blo     neon_lt16                   /* n in 4..15 */
+        cmp     r2, #32
+        blo     neon_16                     /* n in 16..31 */
+        cmp     r2, #128
+        blo     neon_copy_32_a              /* n in 32..127 */
+        /* Copy blocks of 128-bytes (word-aligned) at a time*/
+        /* Code below is optimized for PLDSIZE=128 only */
+        mov     r12, r2, lsr #7             /* r12 = number of 128-byte blocks */
+        cmp     r12, #PLDOFFS
+        bls     neon_copy_128_loop_nopld    /* PLDOFFS blocks or fewer: no preload loop */
+        sub     r12, r12, #PLDOFFS          /* final PLDOFFS blocks use the nopld loop */
+        pld     [r1, #(PLDOFFS-1)*PLDSIZE]
+neon_copy_128_loop_outer:
+        pld     [r1, #(PLDOFFS*PLDSIZE)]
+        vld1.32 {q0, q1}, [r1]!
+        vld1.32 {q2, q3}, [r1]!
+        vld1.32 {q8, q9}, [r1]!
+        vld1.32 {q10, q11}, [r1]!
+        subs    r12, r12, #1                /* flags survive the vst1 group below */
+        vst1.32 {q0, q1}, [r0]!
+        vst1.32 {q2, q3}, [r0]!
+        vst1.32 {q8, q9}, [r0]!
+        vst1.32 {q10, q11}, [r0]!
+        bne     neon_copy_128_loop_outer
+        mov     r12, #PLDOFFS               /* PLDOFFS blocks are still to be copied */
+neon_copy_128_loop_nopld:
+        vld1.32 {q0, q1}, [r1]!
+        vld1.32 {q2, q3}, [r1]!
+        vld1.32 {q8, q9}, [r1]!
+        vld1.32 {q10, q11}, [r1]!
+        subs    r12, r12, #1
+        vst1.32 {q0, q1}, [r0]!
+        vst1.32 {q2, q3}, [r0]!
+        vst1.32 {q8, q9}, [r0]!
+        vst1.32 {q10, q11}, [r0]!
+        bne     neon_copy_128_loop_nopld
+        ands    r2, r2, #0x7f               /* r2 = bytes left after the 128-byte blocks */
+        beq     neon_exit
+        cmp     r2, #32
+        blo     neon_16
+        nop                                 /* kept from the original; purpose not visible here */
+        /* Copy blocks of 32-bytes (word aligned) at a time*/
+neon_copy_32_a:
+        mov     r12, r2, lsr #5             /* r12 = number of 32-byte blocks (>= 1 here) */
+neon_copy_32_loop_a:
+        vld1.32 {q0,q1}, [r1]!
+        subs    r12, r12, #1
+        vst1.32 {q0,q1}, [r0]!
+        bne     neon_copy_32_loop_a
+        ands    r2, r2, #0x1f               /* r2 = bytes left after the 32-byte blocks */
+        beq     neon_exit
+neon_16:
+        subs    r2, r2, #16                 /* borrow (C clear) means fewer than 16 left */
+        blo     neon_lt16                   /* low four bits of r2 are unchanged by the subtract */
+        vld1.32 {q8}, [r1]!
+        vst1.32 {q8}, [r0]!
+        beq     neon_exit                   /* Z still from the subs: exactly 16 were left */
+neon_lt16:
+        movs    r12, r2, lsl #29            /* C = bit 3 of r2, N = bit 2 of r2 */
+        bcc     neon_skip8
+        ldr     r3, [r1], #4                /* bit 3 set: copy 8 bytes */
+        ldr     r12, [r1], #4
+        str     r3, [r0], #4
+        str     r12, [r0], #4
+neon_skip8:
+        bpl     neon_lt4                    /* N from the movs above; ldr/str leave flags alone */
+        ldr     r3, [r1], #4                /* bit 2 set: copy 4 bytes */
+        str     r3, [r0], #4
+neon_lt4:
+        movs    r2, r2, lsl #31             /* C = bit 1 of r2, N = bit 0 of r2 */
+        bcc     neon_lt2
+        ldrh    r3, [r1], #2                /* bit 1 set: copy 2 bytes */
+        strh    r3, [r0], #2
+neon_lt2:
+        bpl     neon_exit
+        ldrb    r12, [r1]                   /* bit 0 set: copy the final byte */
+        strb    r12, [r0]
+neon_exit:
+        pop     {r0}                        /* return the original dest */
+        bx      lr
+        .size   memcpy, . - memcpy
+
+        .endfunc
+        .end