diff options
author | Brent DeGraaf <bdegraaf@codeaurora.org> | 2013-10-02 09:47:11 -0400 |
---|---|---|
committer | Steve Kondik <shade@chemlab.org> | 2013-11-13 05:41:33 -0800 |
commit | a46374c5f2a039662c7cb32f4d1e8f5e6a483a2d (patch) | |
tree | deae94faf2e2c383b0aa039b25d02676b9ccba23 | |
parent | b59b790f97dc58a931719524499269f2f3b904f2 (diff) | |
download | bionic-cm-10.2.0.zip bionic-cm-10.2.0.tar.gz bionic-cm-10.2.0.tar.bz2 |
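The majority of libc under bionic is built for Thumb-2. Refactor the
high-performance Krait memcpy used in previous builds so that it
assembles as Thumb-2 (unified syntax with explicit IT blocks), and add
unwind information (.save and .cfi_* directives) that can be used for
stack unwinding.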
Change-Id: Ib5f7ab354f39313758402ec02b0aea27b15d45fa
-rw-r--r-- | libc/arch-arm/krait/bionic/memcpy.S | 198 | ||||
-rw-r--r-- | libc/arch-arm/krait/bionic/memcpy_base.S | 215 |
2 files changed, 240 insertions, 173 deletions
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S index 818c3a4..aca96a8 100644 --- a/libc/arch-arm/krait/bionic/memcpy.S +++ b/libc/arch-arm/krait/bionic/memcpy.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 The Android Open Source Project + * Copyright (C) 2013 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,185 +26,37 @@ * SUCH DAMAGE. */ -/* Assumes neon instructions and a cache line size of 64 bytes. */ +/* Assumes neon instructions and a cache line size of 32 bytes. */ -#include <machine/cpu-features.h> #include <machine/asm.h> /* - * These can be overridden in: - * device/<vendor>/<board>/BoardConfig.mk - * by setting the following: - * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true - * TARGET_USE_KRAIT_PLD_SET := true - * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> - * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> - * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> - * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold> + * This code assumes it is running on a processor that supports all arm v7 + * instructions, that supports neon instructions, and that has a 32 byte + * cache line. 
*/ -#ifndef PLDOFFS -#define PLDOFFS (10) -#endif -#ifndef PLDTHRESH -#define PLDTHRESH (PLDOFFS) -#endif -#ifndef BBTHRESH -#define BBTHRESH (4096/64) -#endif -#if (PLDOFFS < 1) -#error Routine does not support offsets less than 1 -#endif -#if (PLDTHRESH < PLDOFFS) -#error PLD threshold must be greater than or equal to the PLD offset -#endif -#ifndef PLDSIZE -#define PLDSIZE (64) -#endif -#define NOP_OPCODE (0xe320f000) - - .text - .fpu neon + .text + .syntax unified + .fpu neon + .thumb + .thumb_func ENTRY(memcpy) - .save {r0, lr} - mov r12, r0 - cmp r2, #4 - blt .Lneon_lt4 - cmp r2, #16 - blt .Lneon_lt16 - cmp r2, #32 - blt .Lneon_16 - cmp r2, #64 - blt .Lneon_copy_32_a - stmfd sp!, {r0} - - mov r12, r2, lsr #6 - cmp r12, #PLDTHRESH - ble .Lneon_copy_64_loop_nopld - - stmfd sp!, {r9, r10, lr} - - cmp r12, #BBTHRESH - ble .Lneon_prime_pump - - add lr, r0, #0x400 - add r9, r1, #(PLDOFFS*PLDSIZE) - sub lr, lr, r9 - lsl lr, lr, #21 - lsr lr, lr, #21 - add lr, lr, #(PLDOFFS*PLDSIZE) - cmp r12, lr, lsr #6 - movle lr, #(PLDOFFS*PLDSIZE) - ble .Lneon_prime_pump - - movgt r9, #(PLDOFFS) - rsbgts r9, r9, lr, lsr #6 - ble .Lneon_prime_pump - - add r10, r1, lr - bic r10, #0x3F - - sub r12, lr, lsr #6 - cmp r9, r12 - suble r12, r12, r9 - movgt r9, r12 - movgt r12, #0 - - pld [r1, #((PLDOFFS-1)*PLDSIZE)] - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_copy_64_loop_outer_doublepld: - pld [r1, #((PLDOFFS)*PLDSIZE)] - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - ldr r3, [r10] - subs r9, r9, #1 - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0]! - add r10, #64 - bne .Lneon_copy_64_loop_outer_doublepld - cmp r12, #0 - beq .Lneon_pop_before_nopld - - cmp r12, #(512*1024/64) - blt .Lneon_copy_64_loop_outer - - .balignl 64, NOP_OPCODE, 8 -.Lneon_copy_64_loop_ddr: - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - pld [r10] - subs r12, r12, #1 - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0]! 
- add r10, #64 - bne .Lneon_copy_64_loop_ddr - b .Lneon_pop_before_nopld - - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_prime_pump: - mov lr, #(PLDOFFS*PLDSIZE) - add r10, r1, #(PLDOFFS*PLDSIZE) - bic r10, #0x3F - sub r12, r12, #PLDOFFS - ldr r3, [r10, #(-1*PLDSIZE)] - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_copy_64_loop_outer: - vld1.32 {q0, q1}, [r1]! - vld1.32 {q2, q3}, [r1]! - ldr r3, [r10] - subs r12, r12, #1 - vst1.32 {q0, q1}, [r0]! - vst1.32 {q2, q3}, [r0]! - add r10, #64 - bne .Lneon_copy_64_loop_outer - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_pop_before_nopld: - mov r12, lr, lsr #6 - ldmfd sp!, {r9, r10, lr} - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_copy_64_loop_nopld: - vld1.32 {q8, q9}, [r1]! - vld1.32 {q10, q11}, [r1]! - subs r12, r12, #1 - vst1.32 {q8, q9}, [r0]! - vst1.32 {q10, q11}, [r0]! - bne .Lneon_copy_64_loop_nopld - ands r2, r2, #0x3f - ldmfd sp!, {r12} - beq .Lneon_exit - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_copy_32_a: - movs r3, r2, lsl #27 - bcc .Lneon_16 - vld1.32 {q0,q1}, [r1]! - vst1.32 {q0,q1}, [r0]! - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_16: - bpl .Lneon_lt16 - vld1.32 {q8}, [r1]! - vst1.32 {q8}, [r0]! 
- ands r2, r2, #0x0f - beq .Lneon_exit - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_lt16: - movs r3, r2, lsl #29 - ldrcs r3, [r1], #4 - strcs r3, [r0], #4 - ldrcs r3, [r1], #4 - strcs r3, [r0], #4 - ldrmi r3, [r1], #4 - strmi r3, [r0], #4 - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_lt4: - movs r2, r2, lsl #31 - ldrcsh r3, [r1], #2 - strcsh r3, [r0], #2 - ldrmib r3, [r1] - strmib r3, [r0] - .balignl 64, NOP_OPCODE, 4*2 -.Lneon_exit: - mov r0, r12 - bx lr + .cfi_startproc + pld [r1, #64] + stmfd sp!, {r0, lr} + .save {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + .cfi_endproc END(memcpy) +#define MEMCPY_BASE __memcpy_base +#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned +#include "memcpy_base.S" + + .data +error_string: + .string "memcpy buffer overflow" diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S new file mode 100644 index 0000000..e80e738 --- /dev/null +++ b/libc/arch-arm/krait/bionic/memcpy_base.S @@ -0,0 +1,215 @@ +/*************************************************************************** + Copyright (c) 2009-2013 The Linux Foundation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of The Linux Foundation nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ + +/* Assumes neon instructions and a cache line size of 64 bytes. */ + +#include <machine/cpu-features.h> +#include <machine/asm.h> + +/* + * These default settings are good for all Krait-based systems + * as of this writing, but they can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold> + */ + +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#ifndef BBTHRESH +#define BBTHRESH (4096/64) +#endif +#if (PLDOFFS < 1) +#error Routine does not support offsets less than 1 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif + .text + .fpu neon + +ENTRY(MEMCPY_BASE) +MEMCPY_BASE_ALIGNED: + .cfi_startproc + .save {r0, r9, r10, lr} + .cfi_def_cfa_offset 8 + 
.cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + cmp r2, #4 + blt .Lneon_lt4 + cmp r2, #16 + blt .Lneon_lt16 + cmp r2, #32 + blt .Lneon_16 + cmp r2, #64 + blt .Lneon_copy_32_a + + mov r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .Lneon_copy_64_loop_nopld + + push {r9, r10} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r9, 0 + .cfi_rel_offset r10, 4 + + cmp r12, #BBTHRESH + ble .Lneon_prime_pump + + add lr, r0, #0x400 + add r9, r1, #(PLDOFFS*PLDSIZE) + sub lr, lr, r9 + lsl lr, lr, #21 + lsr lr, lr, #21 + add lr, lr, #(PLDOFFS*PLDSIZE) + cmp r12, lr, lsr #6 + ble .Lneon_prime_pump + + itt gt + movgt r9, #(PLDOFFS) + rsbsgt r9, r9, lr, lsr #6 + ble .Lneon_prime_pump + + add r10, r1, lr + bic r10, #0x3F + + sub r12, r12, lr, lsr #6 + + cmp r9, r12 + itee le + suble r12, r12, r9 + movgt r9, r12 + movgt r12, #0 + + pld [r1, #((PLDOFFS-1)*PLDSIZE)] +.Lneon_copy_64_loop_outer_doublepld: + pld [r1, #((PLDOFFS)*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r9, r9, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .Lneon_copy_64_loop_outer_doublepld + cmp r12, #0 + beq .Lneon_pop_before_nopld + + cmp r12, #(512*1024/64) + blt .Lneon_copy_64_loop_outer + +.Lneon_copy_64_loop_ddr: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + pld [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .Lneon_copy_64_loop_ddr + b .Lneon_pop_before_nopld + +.Lneon_prime_pump: + mov lr, #(PLDOFFS*PLDSIZE) + add r10, r1, #(PLDOFFS*PLDSIZE) + bic r10, #0x3F + sub r12, r12, #PLDOFFS + ldr r3, [r10, #(-1*PLDSIZE)] +.Lneon_copy_64_loop_outer: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! 
+ add r10, #64 + bne .Lneon_copy_64_loop_outer +.Lneon_pop_before_nopld: + mov r12, lr, lsr #6 + pop {r9, r10} + .cfi_restore r9 + .cfi_restore r10 + .cfi_adjust_cfa_offset -8 + +.Lneon_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_64_loop_nopld + ands r2, r2, #0x3f + .cfi_restore r0 + .cfi_adjust_cfa_offset -4 + beq .Lneon_exit +.Lneon_copy_32_a: + movs r3, r2, lsl #27 + bcc .Lneon_16 + vld1.32 {q0,q1}, [r1]! + vst1.32 {q0,q1}, [r0]! +.Lneon_16: + bpl .Lneon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! + ands r2, r2, #0x0f + beq .Lneon_exit +.Lneon_lt16: + movs r3, r2, lsl #29 + itttt cs + ldrcs r3, [r1], #4 + strcs r3, [r0], #4 + ldrcs r3, [r1], #4 + strcs r3, [r0], #4 + itt mi + ldrmi r3, [r1], #4 + strmi r3, [r0], #4 +.Lneon_lt4: + movs r2, r2, lsl #31 + itt cs + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + itt mi + ldrbmi r3, [r1] + strbmi r3, [r0] +.Lneon_exit: + pop {r0, lr} + bx lr + .cfi_endproc +END(MEMCPY_BASE) + |