diff options
author | Xin Qi <xqi@codeaurora.org> | 2014-08-15 11:12:32 -0700 |
---|---|---|
committer | Linux Build Service Account <lnxbuild@localhost> | 2015-10-06 03:29:28 -0600 |
commit | a766545d561e092df9fd768bb06fec72e632c43c (patch) | |
tree | 7379870ae0984605d79c65aa6873ef46537db68f /libc/arch-arm/krait/bionic/memmove.S | |
parent | 2d9285e5c9b2aa94fa6d2c2b24756230ca72bdaa (diff) | |
download | bionic-a766545d561e092df9fd768bb06fec72e632c43c.zip bionic-a766545d561e092df9fd768bb06fec72e632c43c.tar.gz bionic-a766545d561e092df9fd768bb06fec72e632c43c.tar.bz2 |
Performance: krait: Implement optimized versions of memmove
Code has been refactored to thumb2 for consistency with the rest
of bionic libc, as well as performance and correctness.
Change-Id: I5f738ef3eb12ece6b55285f1588eab3d4bbbe27d
Diffstat (limited to 'libc/arch-arm/krait/bionic/memmove.S')
-rw-r--r-- | libc/arch-arm/krait/bionic/memmove.S | 219 |
1 files changed, 219 insertions, 0 deletions
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S new file mode 100644 index 0000000..aea7315 --- /dev/null +++ b/libc/arch-arm/krait/bionic/memmove.S @@ -0,0 +1,219 @@ +/*************************************************************************** + Copyright (c) 2009-2014 The Linux Foundation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of The Linux Foundation nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ + +/*************************************************************************** + * Neon memmove: Attempts to do a memmove with Neon registers if possible, + * Inputs: + * dest: The destination buffer + * src: The source buffer + * n: The size of the buffer to transfer + * Outputs: + * + ***************************************************************************/ + +#include <private/bionic_asm.h> +#include <private/libc_events.h> +/* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + */ +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#if (PLDOFFS < 5) +#error Routine does not support offsets less than 5 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif + + .text + .syntax unified + .fpu neon + .thumb + .thumb_func + +//ENTRY(bcopy) +// //.cfi_startproc +// mov r12, r0 +// mov r0, r1 +// mov r1, r12 +// // Fall through to memmove +// //.cfi_endproc +//END(bcopy) + +ENTRY(memmove) +_memmove_words: + //.cfi_startproc + .save {r0, lr} + cmp r2, #0 + it ne + subsne r12, r0, r1 // Warning: do not combine these "it" blocks + it eq + bxeq lr +// memmove only if r1 < r0 < r1+r2 + cmp r0, r1 + itt ge + addge r12, r1, r2 + cmpge r12, r0 + it le + ble memcpy + cmp r2, #4 + it le + ble .Lneon_b2f_smallcopy_loop + push {r0, lr} + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #64 + it ge + bge .Lneon_b2f_copy_64 + cmp r2, #32 + it ge + bge .Lneon_b2f_copy_32 + cmp r2, #8 + it ge + bge .Lneon_b2f_copy_8 + b .Lneon_b2f_copy_1 +.Lneon_b2f_copy_64: + mov r12, r2, lsr #6 + add r0, r0, #32 + add r1, r1, #32 + cmp r12, #PLDTHRESH + it le + ble .Lneon_b2f_copy_64_loop_nopld + sub r12, #PLDOFFS + sub lr, r1, #(PLDOFFS)*PLDSIZE +.Lneon_b2f_copy_64_loop_outer: + pld [lr] + sub r1, r1, #96 + sub r0, r0, #96 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + sub lr, lr, #64 + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0] + it ne + bne .Lneon_b2f_copy_64_loop_outer + mov r12, #PLDOFFS +.Lneon_b2f_copy_64_loop_nopld: + sub r1, r1, #96 + sub r0, r0, #96 + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1] + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0] + it ne + bne .Lneon_b2f_copy_64_loop_nopld + ands r2, r2, #0x3f + it eq + beq .Lneon_memmove_done + sub r1, r1, #32 + sub r0, r0, #32 + cmp r2, #32 + it lt + blt .Lneon_b2f_copy_8 +.Lneon_b2f_copy_32: + sub r1, r1, #32 + sub r0, r0, #32 + vld1.32 {q0, q1}, [r1] + vst1.32 {q0, q1}, [r0] + ands r2, r2, #0x1f + it eq + beq .Lneon_memmove_done +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + it eq + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + it ne + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_1: + movs r12, r2, lsl #29 + itttt mi + submi r1, r1, #4 + submi r0, r0, #4 + ldrmi r3, [r1] + strmi r3, [r0] + movs r2, r2, lsl #31 + itttt cs + subcs r1, r1, #2 + subcs r0, r0, #2 + ldrhcs r3, [r1] + strhcs r3, [r0] + itttt mi + submi r1, r1, #1 + submi r0, r0, #1 + ldrbmi r12, [r1] + strbmi r12, [r0] +.Lneon_memmove_done: + pop {r0, pc} +.Lneon_b2f_smallcopy_loop: + // 4 bytes or less + add r1, r1, r2 + add r0, r0, r2 + movs r12, r2, lsl #29 + itttt mi + submi r1, r1, #4 + submi r0, r0, #4 + ldrmi r3, [r1] + strmi r3, [r0] + movs r2, r2, lsl #31 + itttt cs + subcs r1, r1, #2 + subcs r0, r0, #2 + ldrhcs r3, [r1] + strhcs r3, [r0] + itttt mi + submi r1, r1, #1 + submi r0, r0, #1 + ldrbmi r12, [r1] + strbmi r12, [r0] + bx lr +// .cfi_endproc +END(memmove) + |