diff options
author | xkonni <konstantin.koslowski@gmail.com> | 2012-06-21 03:07:53 +0200 |
---|---|---|
committer | Ricardo Cerqueira <cyanogenmod@cerqueira.org> | 2012-07-10 20:52:27 +0100 |
commit | 00b8d2378513ab558a8442c20b5b0a73e54c4710 (patch) | |
tree | acd461581a151ca9f55749d299fcee470b83626a | |
parent | a39464c0509f5fd75531ff1e468bb90b4cc51495 (diff) | |
download | bionic-00b8d2378513ab558a8442c20b5b0a73e54c4710.zip bionic-00b8d2378513ab558a8442c20b5b0a73e54c4710.tar.gz bionic-00b8d2378513ab558a8442c20b5b0a73e54c4710.tar.bz2 |
msm8960: Improve performance of memcpy, memmove, bcopy and memmove_words (codeaurora)
Taken from the following commits to the codeaurora platform/bionic tree:
https://www.codeaurora.org/gitweb/quic/la/?p=platform/bionic.git;a=commit;h=a5333c8fbeb5190e3a8dc9f99af66eb50b01462c
https://www.codeaurora.org/gitweb/quic/la/?p=platform/bionic.git;a=commit;h=6077a9577667fc9999312a2c6daf4d3c77bdf294
Uses the following variables in BoardConfig.mk:
TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
TARGET_USE_KRAIT_PLD_SET := true
TARGET_KRAIT_BIONIC_PLDOFFS := 10
TARGET_KRAIT_BIONIC_PLDTHRESH := 10
TARGET_KRAIT_BIONIC_BBTHRESH := 64
TARGET_KRAIT_BIONIC_PLDSIZE := 64
Change-Id: Iee66f7698dc301507a012e27c91141f3f6925dcb
-rw-r--r-- | libc/Android.mk | 24 | ||||
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 157 | ||||
-rw-r--r-- | libc/arch-arm/bionic/memmove.S | 174 |
3 files changed, 348 insertions, 7 deletions
diff --git a/libc/Android.mk b/libc/Android.mk index e3e7a17..f1b6072 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -269,7 +269,6 @@ libc_common_src_files := \ bionic/libc_init_common.c \ bionic/logd_write.c \ bionic/md5.c \ - bionic/memmove_words.c \ bionic/pututline.c \ bionic/realpath.c \ bionic/sched_getaffinity.c \ @@ -381,11 +380,18 @@ endif # current ARM version ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true) libc_common_src_files += \ + arch-arm/bionic/memmove.S \ + bionic/memmove_words.c +else + ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true) + libc_common_src_files += \ arch-arm/bionic/memmove.S -else # Non-Scorpion-based ARM -libc_common_src_files += \ + else # Other ARM + libc_common_src_files += \ string/bcopy.c \ - string/memmove.c.arm + string/memmove.c.arm \ + bionic/memmove_words.c + endif # !TARGET_USE_KRAIT_BIONIC_OPTIMIZATION endif # !TARGET_USE_SCORPION_BIONIC_OPTIMIZATION # These files need to be arm so that gdbserver @@ -509,6 +515,16 @@ ifeq ($(TARGET_ARCH),arm) ifeq ($(TARGET_HAVE_TEGRA_ERRATA_657451),true) libc_common_cflags += -DHAVE_TEGRA_ERRATA_657451 endif + # Add in defines to activate KRAIT_NEON_OPTIMIZATION + ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true) + libc_common_cflags += -DKRAIT_NEON_OPTIMIZATION + ifeq ($(TARGET_USE_KRAIT_PLD_SET),true) + libc_common_cflags += -DPLDOFFS=$(TARGET_KRAIT_BIONIC_PLDOFFS) + libc_common_cflags += -DPLDTHRESH=$(TARGET_KRAIT_BIONIC_PLDTHRESH) + libc_common_cflags += -DPLDSIZE=$(TARGET_KRAIT_BIONIC_PLDSIZE) + libc_common_cflags += -DBBTHRESH=$(TARGET_KRAIT_BIONIC_BBTHRESH) + endif + endif ifeq ($(TARGET_CORTEX_CACHE_LINE_32),true) libc_common_cflags += -DCORTEX_CACHE_LINE_32 endif diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S index 4a9aac8..e26620b 100644 --- a/libc/arch-arm/bionic/memcpy.S +++ b/libc/arch-arm/bionic/memcpy.S @@ -32,7 +32,162 @@ #include <machine/asm.h> #if defined(__ARM_NEON__) -#if defined(SCORPION_NEON_OPTIMIZATION) +#if 
defined(KRAIT_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold> + */ +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#ifndef BBTHRESH +#define BBTHRESH (4096/64) +#endif +#if (PLDOFFS < 1) +#error Routine does not support offsets less than 1 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif +#define NOP_OPCODE (0xe320f000) + + .text + .fpu neon + .global memcpy + .type memcpy, %function + .align 5 +memcpy: + stmfd sp!, {r0, r9, r10, lr} + cmp r2, #4 + blt .Lneon_lt4 + cmp r2, #16 + blt .Lneon_lt16 + cmp r2, #32 + blt .Lneon_16 + cmp r2, #64 + blt .Lneon_copy_32_a + + mov r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .Lneon_copy_64_loop_nopld + + cmp r12, #BBTHRESH + ble .Lneon_prime_pump + + add lr, r0, #0x400 + add r9, r1, #(PLDOFFS*PLDSIZE) + sub lr, lr, r9 + lsl lr, lr, #21 + lsr lr, lr, #21 + add lr, lr, #(PLDOFFS*PLDSIZE) + cmp r12, lr, lsr #6 + movle lr, #(PLDOFFS*PLDSIZE) + + movgt r9, #(PLDOFFS) + rsbgts r9, r9, lr, lsr #6 + ble .Lneon_prime_pump + + add r10, r1, lr + bic r10, #0x3F + + sub r12, lr, lsr #6 + cmp r9, r12 + suble r12, r12, r9 + movgt r9, r12 + movgt r12, #0 + + pld [r1, #((PLDOFFS-1)*PLDSIZE)] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_64_loop_outer_doublepld: + pld [r1, #((PLDOFFS)*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r9, r9, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! 
+ add r10, #64 + bne .Lneon_copy_64_loop_outer_doublepld + cmp r12, #0 + bne .Lneon_copy_64_loop_outer + mov r12, lr, lsr #6 + b .Lneon_copy_64_loop_nopld + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_prime_pump: + mov lr, #(PLDOFFS*PLDSIZE) + add r10, r1, #(PLDOFFS*PLDSIZE) + bic r10, #0x3F + sub r12, r12, #PLDOFFS + pld [r10, #(-1*PLDSIZE)] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_64_loop_outer: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .Lneon_copy_64_loop_outer + mov r12, lr, lsr #6 + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_64_loop_nopld + ands r2, r2, #0x3f + beq .Lneon_exit + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_copy_32_a: + movs r12, r2, lsl #27 + bcc .Lneon_16 + vld1.32 {q0,q1}, [r1]! + vst1.32 {q0,q1}, [r0]! + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_16: + bpl .Lneon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! 
+ ands r2, r2, #0x0f + beq .Lneon_exit + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_lt16: + movs r12, r2, lsl #29 + ldrcs r3, [r1], #4 + ldrcs r12, [r1], #4 + strcs r3, [r0], #4 + strcs r12, [r0], #4 + ldrmi r3, [r1], #4 + strmi r3, [r0], #4 + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_lt4: + movs r2, r2, lsl #31 + ldrcsh r3, [r1], #2 + strcsh r3, [r0], #2 + ldrmib r12, [r1] + strmib r12, [r0] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_exit: + ldmfd sp!, {r0, r9, r10, lr} + bx lr + .end +#elif defined(SCORPION_NEON_OPTIMIZATION) /* * These can be overridden in: * device/<vendor>/<board>/BoardConfig.mk diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S index 1234195..a792680 100644 --- a/libc/arch-arm/bionic/memmove.S +++ b/libc/arch-arm/bionic/memmove.S @@ -1,5 +1,5 @@ /*************************************************************************** - Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved. + Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -37,7 +37,177 @@ #include <machine/cpu-features.h> -#if defined(SCORPION_NEON_OPTIMIZATION) +#if defined(KRAIT_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + */ +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#if (PLDOFFS < 5) +#error Routine does not support offsets less than 5 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif +#define NOP_OPCODE (0xe320f000) + + .code 
32 + .align 5 + .global memmove + .type memmove, %function + + .global _memmove_words + .type _memmove_words, %function + + .global bcopy + .type bcopy, %function + +bcopy: + mov r12, r0 + mov r0, r1 + mov r1, r12 + .balignl 64, NOP_OPCODE, 4*2 +memmove: +_memmove_words: +.Lneon_memmove_cmf: + subs r12, r0, r1 + bxeq lr + cmphi r2, r12 + bls memcpy /* Use memcpy for non-overlapping areas */ + + push {r0} + +.Lneon_back_to_front_copy: + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #4 + bgt .Lneon_b2f_gt4 + cmp r2, #0 +.Lneon_b2f_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + b .Lneon_b2f_smallcopy_loop +.Lneon_b2f_gt4: + sub r3, r0, r1 + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #64 + bge .Lneon_b2f_copy_64 + cmp r12, #32 + bge .Lneon_b2f_copy_32 + cmp r12, #8 + bge .Lneon_b2f_copy_8 + cmp r12, #4 + bge .Lneon_b2f_copy_4 + b .Lneon_b2f_copy_1 +.Lneon_b2f_copy_64: + sub r1, r1, #64 /* Predecrement */ + sub r0, r0, #64 + movs r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .Lneon_b2f_copy_64_loop_nopld + sub r12, #PLDOFFS + pld [r1, #-(PLDOFFS-5)*PLDSIZE] + pld [r1, #-(PLDOFFS-4)*PLDSIZE] + pld [r1, #-(PLDOFFS-3)*PLDSIZE] + pld [r1, #-(PLDOFFS-2)*PLDSIZE] + pld [r1, #-(PLDOFFS-1)*PLDSIZE] + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_64_loop_outer: + pld [r1, #-(PLDOFFS)*PLDSIZE] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + sub r1, r1, #96 /* Post-fixup and predecrement */ + vst1.32 {q2, q3}, [r0] + sub r0, r0, #96 + bne .Lneon_b2f_copy_64_loop_outer + mov r12, #PLDOFFS + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1] + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! 
+ sub r1, r1, #96 /* Post-fixup and predecrement */ + vst1.32 {q10, q11}, [r0] + sub r0, r0, #96 + bne .Lneon_b2f_copy_64_loop_nopld + ands r2, r2, #0x3f + beq .Lneon_memmove_done + add r1, r1, #64 /* Post-fixup */ + add r0, r0, #64 + cmp r2, #32 + blt .Lneon_b2f_copy_finish +.Lneon_b2f_copy_32: + mov r12, r2, lsr #5 +.Lneon_b2f_copy_32_loop: + sub r1, r1, #32 /* Predecrement */ + sub r0, r0, #32 + vld1.32 {q0,q1}, [r1] + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0] + bne .Lneon_b2f_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_b2f_copy_finish: +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + beq .Lneon_b2f_copy_4 + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 /* Predecrement */ + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_4: + movs r12, r2, lsr #0x2 + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_4_loop: + ldr r3, [r1, #-4]! + subs r12, r12, #1 + str r3, [r0, #-4]! + bne .Lneon_b2f_copy_4_loop + ands r2, r2, #0x3 +.Lneon_b2f_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done + .balignl 64, NOP_OPCODE, 4*2 +.Lneon_b2f_copy_1_loop: + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + bne .Lneon_b2f_copy_1_loop + +.Lneon_memmove_done: + pop {r0} + bx lr + + .end + +#elif defined(SCORPION_NEON_OPTIMIZATION) /* * These can be overridden in: * device/<vendor>/<board>/BoardConfig.mk |