author     Steve Kondik <shade@chemlab.org>   2013-02-16 18:48:17 -0800
committer  Steve Kondik <shade@chemlab.org>   2013-02-16 21:09:14 -0800
commit     e7655287f2e344ace481c11c63eaebb9b56d6a3f (patch)
tree       a97e4819da97bcb456df7c41fe0ad08110f5d061
parent     b24c0fa61e261d1b9e144b51252bcb925948d326 (diff)
libc: Add optimized memcpy for Cortex-A15
* This version is from Newlib and is written/recommended by ARM as the best
  routine for Cortex-A15.

Change-Id: Ie9c2817dabda929ae4efaada5f6d467d53551ba4
-rw-r--r--  libc/Android.mk                    |   9
-rw-r--r--  libc/arch-arm/bionic/memcpy-a15.S  | 419
2 files changed, 427 insertions(+), 1 deletion(-)
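
For orientation before the diff: the new routine aligns the destination to a word, copies 64 bytes per iteration with doubleword loads/stores (LDRD/STRD), and then drains 8-byte, 4-byte, and single-byte tails. The C below is a minimal, purely illustrative sketch of that control flow and is not code from this patch; memcpy_sketch is an invented name, and the fixed-size memcpy() calls merely stand in for the LDRD/STRD pairs (compilers lower them to wide loads/stores).

/* Illustrative only: a portable C sketch of the copy strategy used by
 * memcpy-a15.S.  Not part of this patch. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

void *memcpy_sketch(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    /* Prologue: copy single bytes until dst is word aligned. */
    while (n > 0 && ((uintptr_t)d & 3u) != 0) {
        *d++ = *s++;
        n--;
    }

    /* Main loop: 64 bytes per iteration (the assembly unrolls eight
     * LDRD/STRD pairs here, after further aligning dst to 8 bytes). */
    while (n >= 64) {
        memcpy(d, s, 64);
        d += 64; s += 64; n -= 64;
    }

    /* Tails: doublewords, then one word, then the last 0-3 bytes. */
    while (n >= 8) { memcpy(d, s, 8); d += 8; s += 8; n -= 8; }
    if (n >= 4)    { memcpy(d, s, 4); d += 4; s += 4; n -= 4; }
    while (n > 0)  { *d++ = *s++; n--; }

    return dst;
}

The assembly additionally handles a source that stays misaligned after the destination has been aligned; a sketch of that path follows the diff.
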
diff --git a/libc/Android.mk b/libc/Android.mk
index ffa075f..14889db 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -379,7 +379,6 @@ libc_common_src_files += \
arch-arm/bionic/tgkill.S \
arch-arm/bionic/memcmp.S \
arch-arm/bionic/memcmp16.S \
- arch-arm/bionic/memcpy.S \
arch-arm/bionic/memset.S \
arch-arm/bionic/setjmp.S \
arch-arm/bionic/sigsetjmp.S \
@@ -388,12 +387,20 @@ libc_common_src_files += \
arch-arm/bionic/syscall.S \
string/strncmp.c \
unistd/socketcalls.c
+
ifeq ($(ARCH_ARM_HAVE_ARMV7A),true)
libc_common_src_files += arch-arm/bionic/strlen-armv7.S
else
libc_common_src_files += arch-arm/bionic/strlen.c.arm
endif
+# We have a special memcpy for A15 currently
+ifeq ($(TARGET_ARCH_VARIANT_CPU),cortex-a15)
+libc_common_src_files += arch-arm/bionic/memcpy-a15.S
+else
+libc_common_src_files += arch-arm/bionic/memcpy.S
+endif
+
# Check if we want a neonized version of memmove instead of the
# current ARM version
ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
diff --git a/libc/arch-arm/bionic/memcpy-a15.S b/libc/arch-arm/bionic/memcpy-a15.S
new file mode 100644
index 0000000..ff24803
--- /dev/null
+++ b/libc/arch-arm/bionic/memcpy-a15.S
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2011 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+ /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
+
+ /* Use the version of memcpy implemented using LDRD and STRD.
+ This version is tuned for Cortex-A15.
+ This might not be the best for other ARMv7-A CPUs,
+ but there is no predefine to distinguish between
+ different CPUs in the same architecture,
+ and this version is better than the plain memcpy provided in newlib.
+
+ Therefore, we use this version for all ARMv7-A CPUs. */
+
+ /* To make the same code compile for both ARM and Thumb instruction
+ sets, switch to unified syntax at the beginning of this function.
+ However, by using the same code, we may be missing optimization
+ opportunities. For instance, in LDRD/STRD instructions, the first
+ destination register must be even and the second consecutive in
+ ARM state, but not in Thumb state. */
+
+ .syntax unified
+
+#if defined (__thumb__)
+ .thumb
+ .thumb_func
+#endif
+
+ .global memcpy
+ .type memcpy, %function
+ENTRY(memcpy)
+
+ /* Assumes that n >= 0, and dst, src are valid pointers.
+ If there is at least 8 bytes to copy, use LDRD/STRD.
+ If src and dst are misaligned with different offsets,
+ first copy byte by byte until dst is aligned,
+ and then copy using LDRD/STRD and shift if needed.
+ When less than 8 left, copy a word and then byte by byte. */
+
+ /* Save registers (r0 holds the return value):
+ optimized push {r0, r4, r5, lr}.
+ To try and improve performance, stack layout changed,
+ i.e., not keeping the stack looking like users expect
+ (highest numbered register at highest address). */
+ push {r0, lr}
+ strd r4, r5, [sp, #-8]!
+
+ /* TODO: Add debug frame directives.
+ We don't need exception unwind directives, because the code below
+ does not throw any exceptions and does not call any other functions.
+ Generally, newlib functions like this lack debug information for
+ assembler source. */
+
+ /* Get copying of tiny blocks out of the way first. */
+ /* Is there at least 4 bytes to copy? */
+ subs r2, r2, #4
+ blt copy_less_than_4 /* If n < 4. */
+
+ /* Check word alignment. */
+ ands ip, r0, #3 /* ip = last 2 bits of dst. */
+ bne dst_not_word_aligned /* If dst is not word-aligned. */
+
+ /* Get here if dst is word-aligned. */
+ ands ip, r1, #3 /* ip = last 2 bits of src. */
+ bne src_not_word_aligned /* If src is not word-aligned. */
+word_aligned:
+ /* Get here if source and dst both are word-aligned.
+ The number of bytes remaining to copy is r2+4. */
+
+ /* Is there at least 64 bytes to copy? */
+ subs r2, r2, #60
+ blt copy_less_than_64 /* If r2 + 4 < 64. */
+
+ /* First, align the destination buffer to 8-bytes,
+ to make sure double loads and stores don't cross cache line boundary,
+ as they are then more expensive even if the data is in the cache
+ (require two load/store issue cycles instead of one).
+ If only one of the buffers is not 8-bytes aligned,
+ then it's more important to align dst than src,
+ because there is more penalty for stores
+ than loads that cross cacheline boundary.
+ This check and realignment are only worth doing
+ if there is a lot to copy. */
+
+ /* Get here if dst is word aligned,
+ i.e., the 2 least significant bits are 0.
+ If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
+ then copy 1 word (4 bytes). */
+ ands r3, r0, #4
+ beq 11f /* If dst already two-word aligned. */
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+ subs r2, r2, #4
+ blt copy_less_than_64
+
+11:
+ /* TODO: Align to cacheline (useful for PLD optimization). */
+
+ /* Every loop iteration copies 64 bytes. */
+1:
+ .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
+ ldrd r4, r5, [r1, \offset]
+ strd r4, r5, [r0, \offset]
+ .endr
+
+ add r0, r0, #64
+ add r1, r1, #64
+ subs r2, r2, #64
+ bge 1b /* If there is more to copy. */
+
+copy_less_than_64:
+
+ /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
+ Restore the count if there is more than 7 bytes to copy. */
+ adds r2, r2, #56
+ blt copy_less_than_8
+
+ /* Copy 8 bytes at a time. */
+2:
+ ldrd r4, r5, [r1], #8
+ strd r4, r5, [r0], #8
+ subs r2, r2, #8
+ bge 2b /* If there is more to copy. */
+
+copy_less_than_8:
+
+ /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
+ Check if there is more to copy. */
+ cmn r2, #8
+ beq return /* If r2 + 8 == 0. */
+
+ /* Restore the count if there is more than 3 bytes to copy. */
+ adds r2, r2, #4
+ blt copy_less_than_4
+
+ /* Copy 4 bytes. */
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+
+copy_less_than_4:
+ /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */
+
+ /* Restore the count, check if there is more to copy. */
+ adds r2, r2, #4
+ beq return /* If r2 == 0. */
+
+ /* Get here with r2 in {1,2,3} = {01,10,11}. */
+ /* Logical shift left r2, insert 0s, update flags. */
+ lsls r2, r2, #31
+
+ /* Copy byte by byte.
+ Condition ne means bit 0 of r2 is set, i.e., r2 is 1 or 3.
+ Condition cs means bit 1 of r2 is set,
+ i.e., r2 is 2 or 3. */
+ itt ne
+ ldrbne r3, [r1], #1
+ strbne r3, [r0], #1
+
+ itttt cs
+ ldrbcs r4, [r1], #1
+ ldrbcs r5, [r1]
+ strbcs r4, [r0], #1
+ strbcs r5, [r0]
+
+return:
+ /* Restore registers: optimized pop {r0, r4, r5, pc} */
+ ldrd r4, r5, [sp], #8
+ pop {r0, pc} /* This is the only return point of memcpy. */
+
+#ifndef __ARM_FEATURE_UNALIGNED
+
+ /* The following assembly macro implements misaligned copy in software.
+ Assumes that dst is word aligned, src is at offset "pull" bits from
+ word, push = 32 - pull, and the number of bytes that remain to copy
+ is r2 + 4, r2 >= 0. */
+
+ /* In the code below, r2 is the number of bytes that remain to be
+ written. The number of bytes read is always larger, because we have
+ partial words in the shift queue. */
+
+ .macro miscopy pull push shiftleft shiftright
+
+ /* Align src to the previous word boundary. */
+ bic r1, r1, #3
+
+ /* Initialize the shift queue. */
+ ldr r5, [r1], #4 /* Load a word from source. */
+
+ subs r2, r2, #4
+ blt 6f /* Go to misaligned copy of less than 8 bytes. */
+
+ /* Get here if there is more than 8 bytes to copy.
+ The number of bytes to copy is r2+8, r2 >= 0. */
+
+ /* Save registers: push { r6, r7 }.
+ We need additional registers for LDRD and STRD, because in ARM state
+ the first destination register must be even and the second
+ consecutive. */
+ strd r6, r7, [sp, #-8]!
+
+ subs r2, r2, #56
+ blt 4f /* Go to misaligned copy of less than 64 bytes. */
+
+3:
+ /* Get here if there is more than 64 bytes to copy.
+ The number of bytes to copy is r2+64, r2 >= 0. */
+
+ /* Copy 64 bytes in every iteration.
+ Use a partial word from the shift queue. */
+ .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
+ mov r6, r5, \shiftleft #\pull
+ ldrd r4, r5, [r1, \offset]
+ orr r6, r6, r4, \shiftright #\push
+ mov r7, r4, \shiftleft #\pull
+ orr r7, r7, r5, \shiftright #\push
+ strd r6, r7, [r0, \offset]
+ .endr
+
+ add r1, r1, #64
+ add r0, r0, #64
+ subs r2, r2, #64
+ bge 3b
+
+4:
+ /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
+ and they are misaligned. */
+
+ /* Restore the count if there is more than 7 bytes to copy. */
+ adds r2, r2, #56
+
+ /* If less than 8 bytes to copy,
+ restore registers saved for this loop: optimized poplt { r6, r7 }. */
+ itt lt
+ ldrdlt r6, r7, [sp], #8
+ blt 6f /* Go to misaligned copy of less than 8 bytes. */
+
+5:
+ /* Copy 8 bytes at a time.
+ Use a partial word from the shift queue. */
+ mov r6, r5, \shiftleft #\pull
+ ldrd r4, r5, [r1], #8
+ orr r6, r6, r4, \shiftright #\push
+ mov r7, r4, \shiftleft #\pull
+ orr r7, r7, r5, \shiftright #\push
+ strd r6, r7, [r0], #8
+
+ subs r2, r2, #8
+ bge 5b /* If there is more to copy. */
+
+ /* Restore registers saved for this loop: optimized pop { r6, r7 }. */
+ ldrd r6, r7, [sp], #8
+
+6:
+ /* Get here if there is less than 8 bytes to copy (-8 <= r2 < 0)
+ and they are misaligned. */
+
+ /* Check if there is more to copy. */
+ cmn r2, #8
+ beq return
+
+ /* Check if there is less than 4 bytes to copy. */
+ cmn r2, #4
+
+ itt lt
+ /* Restore src offset from word-align. */
+ sublt r1, r1, #(\push / 8)
+ blt copy_less_than_4
+
+ /* Use a partial word from the shift queue. */
+ mov r3, r5, \shiftleft #\pull
+ /* Load a word from src, but without writeback
+ (this word is not fully written to dst). */
+ ldr r5, [r1]
+
+ /* Restore src offset from word-align. */
+ add r1, r1, #(\pull / 8)
+
+ /* Shift bytes to create one dst word and store it. */
+ orr r3, r3, r5, \shiftright #\push
+ str r3, [r0], #4
+
+ /* Use single byte copying of the remaining bytes. */
+ b copy_less_than_4
+
+ .endm
+
+#endif /* not __ARM_FEATURE_UNALIGNED */
+
+dst_not_word_aligned:
+
+ /* Get here when dst is not aligned and ip has the last 2 bits of dst,
+ i.e., ip is the offset of dst from word.
+ The number of bytes that remains to copy is r2 + 4,
+ i.e., there are at least 4 bytes to copy.
+ Write a partial word (0 to 3 bytes), such that dst becomes
+ word-aligned. */
+
+ /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
+ then there are (4 - ip) bytes to fill up to align dst to the next
+ word. */
+ rsb ip, ip, #4 /* ip = #4 - ip. */
+ cmp ip, #2
+
+ /* Copy byte by byte with conditionals. */
+ itt gt
+ ldrbgt r3, [r1], #1
+ strbgt r3, [r0], #1
+
+ itt ge
+ ldrbge r4, [r1], #1
+ strbge r4, [r0], #1
+
+ ldrb lr, [r1], #1
+ strb lr, [r0], #1
+
+ /* Update the count.
+ ip holds the number of bytes we have just copied. */
+ subs r2, r2, ip /* r2 = r2 - ip. */
+ blt copy_less_than_4 /* If r2 < ip. */
+
+ /* Get here if there are more than 4 bytes to copy.
+ Check if src is aligned. If beforehand src and dst were not word
+ aligned but congruent (same offset), then now they are both
+ word-aligned, and we can copy the rest efficiently (without
+ shifting). */
+ ands ip, r1, #3 /* ip = last 2 bits of src. */
+ beq word_aligned /* If r1 is word-aligned. */
+
+src_not_word_aligned:
+ /* Get here when src is not word-aligned, but dst is word-aligned.
+ The number of bytes that remains to copy is r2+4. */
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* Copy word by word using LDR when alignment can be done in hardware,
+ i.e., SCTLR.A is clear, so unaligned access is supported in LDR and STR. */
+ subs r2, r2, #60
+ blt 8f
+
+7:
+ /* Copy 64 bytes in every loop iteration. */
+ .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
+ ldr r3, [r1, \offset]
+ str r3, [r0, \offset]
+ .endr
+
+ add r0, r0, #64
+ add r1, r1, #64
+ subs r2, r2, #64
+ bge 7b
+
+8:
+ /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
+ Check if there is more than 3 bytes to copy. */
+ adds r2, r2, #60
+ blt copy_less_than_4
+
+9:
+ /* Get here if there is less than 64 but at least 4 bytes to copy,
+ where the number of bytes to copy is r2+4. */
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+ subs r2, r2, #4
+ bge 9b
+
+ b copy_less_than_4
+
+#else /* not __ARM_FEATURE_UNALIGNED */
+
+ /* ip has last 2 bits of src,
+ i.e., ip is the offset of src from word, and ip > 0.
+ Compute shifts needed to copy from src to dst. */
+ cmp ip, #2
+ beq miscopy_16_16 /* If ip == 2. */
+ bge miscopy_24_8 /* If ip == 3. */
+
+ /* Get here if ip == 1. */
+
+ /* Endian independent macros for shifting bytes within registers. */
+
+#ifndef __ARMEB__
+miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
+miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
+miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
+#else /* not __ARMEB__ */
+miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
+miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
+miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
+#endif /* not __ARMEB__ */
+
+#endif /* not __ARM_FEATURE_UNALIGNED */
+END(memcpy)
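
A note on the miscopy macro above: when __ARM_FEATURE_UNALIGNED is not defined, the routine never issues an unaligned word access. Instead it keeps the leftover bytes of the previous aligned source load in a "shift queue" register and assembles each destination word from two aligned loads with a pair of shifts. The C below is a little-endian sketch of that idea only; miscopy_sketch, its parameters, and the whole-word-only length handling are assumptions made for illustration (the real code also drains sub-word tails, swaps the shift directions for big-endian, and unrolls the loop 64 bytes at a time).

/* Illustrative only: the shift-queue technique behind the miscopy macro,
 * little-endian, copying exactly nwords words (nwords >= 1).
 * Not part of this patch. */
#include <stddef.h>
#include <stdint.h>

void miscopy_sketch(uint32_t *dst, const unsigned char *src, size_t nwords)
{
    unsigned off  = (unsigned)((uintptr_t)src & 3u); /* src offset: 1, 2 or 3 */
    unsigned pull = 8u * off;                        /* bits already consumed */
    unsigned push = 32u - pull;                      /* bits from next word   */
    const uint32_t *s = (const uint32_t *)((uintptr_t)src & ~(uintptr_t)3u);

    uint32_t queue = *s++;          /* prime the shift queue (aligned load)  */
    while (nwords--) {
        uint32_t next = *s++;       /* next aligned source word              */
        *dst++ = (queue >> pull) | (next << push);
        queue = next;               /* its leftover bytes feed the next word */
    }
}

Every load here touches only aligned words that overlap the source range, which is what keeps the technique free of alignment faults on cores without hardware unaligned access. The shift directions (lsr for pull, lsl for push) match the little-endian miscopy_* instantiations at the end of the file; the __ARMEB__ branch simply swaps them.
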