diff options
author | Henrik Smiding <henrik.smiding@stericsson.com> | 2010-11-05 15:07:53 +0100 |
---|---|---|
committer | Steve Kondik <shade@chemlab.org> | 2013-02-15 10:02:22 -0800 |
commit | 656c4c9712856b4882df762da68890fff96cc9d5 (patch) | |
tree | 382c4405c0079002240fafda64180be641662e88 | |
parent | 1cdc6cc3548b0cb20ca212cab92a14d0cfb5d8a7 (diff) | |
download | bionic-656c4c9712856b4882df762da68890fff96cc9d5.zip bionic-656c4c9712856b4882df762da68890fff96cc9d5.tar.gz bionic-656c4c9712856b4882df762da68890fff96cc9d5.tar.bz2 |
Add optimized version of memset for Cortex A9
Adds new code to function memset, optimized for Cortex A9.
Copyright (C) ST-Ericsson SA 2010
Added NEON implementation.
Author: Henrik Smiding <henrik.smiding@stericsson.com> for ST-Ericsson.
Change-Id: Id3c87767953439269040e15bd30a27aba709aef6
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
-rw-r--r-- | libc/arch-arm/bionic/memset.S | 98 |
1 file changed, 94 insertions, 4 deletions
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S index c386e7e..a0da108 100644 --- a/libc/arch-arm/bionic/memset.S +++ b/libc/arch-arm/bionic/memset.S @@ -28,6 +28,7 @@ * SUCH DAMAGE. */ +#include <machine/cpu-features.h> #include <machine/asm.h> #if( defined(SCORPION_NEON_OPTIMIZATION) || defined(CORTEX_CACHE_LINE_32)) @@ -119,16 +120,105 @@ memset: * * memset() returns its first argument. */ - + +#if defined(__ARM_NEON__) + .fpu neon +#endif + ENTRY(bzero) mov r2, r1 mov r1, #0 END(bzero) ENTRY(memset) +#if defined(__ARM_NEON__) + +#ifdef NEON_MEMSET_DIVIDER + cmp r2, #NEON_MEMSET_DIVIDER + bhi 11f +#endif + .save {r0} + stmfd sp!, {r0} + + vdup.8 q0, r1 + +#ifndef NEON_UNALIGNED_ACCESS + /* do we have at least 16-bytes to write (needed for alignment below) */ + cmp r2, #16 + blo 3f + + /* align destination to 16 bytes for the write-buffer */ + rsb r3, r0, #0 + ands r3, r3, #0xF + beq 2f + + /* write up to 15-bytes (count in r3) */ + sub r2, r2, r3 + movs ip, r3, lsl #31 + strmib r1, [r0], #1 + strcsb r1, [r0], #1 + strcsb r1, [r0], #1 + movs ip, r3, lsl #29 + bge 1f + + // writes 4 bytes, 32-bits aligned + vst1.32 {d0[0]}, [r0, :32]! +1: bcc 2f + + // writes 8 bytes, 64-bits aligned + vst1.8 {d0}, [r0, :64]! +2: +#endif + /* make sure we have at least 32 bytes to write */ + subs r2, r2, #32 + blo 2f + vmov q1, q0 + +1: /* The main loop writes 32 bytes at a time */ + subs r2, r2, #32 +#ifndef NEON_UNALIGNED_ACCESS + vst1.8 {d0 - d3}, [r0, :128]! +#else + vst1.8 {d0 - d3}, [r0]! +#endif + bhs 1b + +2: /* less than 32 left */ + add r2, r2, #32 + tst r2, #0x10 + beq 3f + + // writes 16 bytes, 128-bits aligned +#ifndef NEON_UNALIGNED_ACCESS + vst1.8 {d0, d1}, [r0, :128]! +#else + vst1.8 {d0, d1}, [r0]! +#endif +3: /* write up to 15-bytes (count in r2) */ + movs ip, r2, lsl #29 + bcc 1f + vst1.8 {d0}, [r0]! +1: bge 2f + vst1.32 {d0[0]}, [r0]! 
+2: movs ip, r2, lsl #31 + strmib r1, [r0], #1 + strcsb r1, [r0], #1 + strcsb r1, [r0], #1 + ldmfd sp!, {r0} + bx lr +11: +#endif + + /* + * Optimized memset() for ARM. + * + * memset() returns its first argument. + */ + /* compute the offset to align the destination * offset = (4-(src&3))&3 = -src & 3 */ + .save {r0, r4-r7, lr} stmfd sp!, {r0, r4-r7, lr} rsb r3, r0, #0 @@ -156,7 +246,7 @@ ENTRY(memset) mov r5, r1 mov r6, r1 mov r7, r1 - + rsb r3, r0, #0 ands r3, r3, #0x1C beq 3f @@ -164,7 +254,7 @@ ENTRY(memset) andhi r3, r2, #0x1C sub r2, r2, r3 - /* conditionnaly writes 0 to 7 words (length in r3) */ + /* conditionally writes 0 to 7 words (length in r3) */ movs r3, r3, lsl #28 stmcsia r0!, {r1, lr} stmcsia r0!, {r1, lr} @@ -181,7 +271,7 @@ ENTRY(memset) bhs 1b 2: add r2, r2, #32 - /* conditionnaly stores 0 to 31 bytes */ + /* conditionally stores 0 to 31 bytes */ movs r2, r2, lsl #28 stmcsia r0!, {r1,r3,r12,lr} stmmiia r0!, {r1, lr} |