summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorxkonni <konstantin.koslowski@gmail.com>2012-06-21 03:07:53 +0200
committerRicardo Cerqueira <cyanogenmod@cerqueira.org>2012-07-10 20:52:27 +0100
commit00b8d2378513ab558a8442c20b5b0a73e54c4710 (patch)
treeacd461581a151ca9f55749d299fcee470b83626a
parenta39464c0509f5fd75531ff1e468bb90b4cc51495 (diff)
downloadbionic-00b8d2378513ab558a8442c20b5b0a73e54c4710.zip
bionic-00b8d2378513ab558a8442c20b5b0a73e54c4710.tar.gz
bionic-00b8d2378513ab558a8442c20b5b0a73e54c4710.tar.bz2
msm8960: Improve performance of memcpy, memmove, bcopy and memmove_words (codeaurora)
Taken from the following commits to codeaurora https://www.codeaurora.org/gitweb/quic/la/?p=platform/bionic.git;a=commit;h=a5333c8fbeb5190e3a8dc9f99af66eb50b01462c https://www.codeaurora.org/gitweb/quic/la/?p=platform/bionic.git;a=commit;h=6077a9577667fc9999312a2c6daf4d3c77bdf294 Uses following variables in BoardConfig.mk TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true TARGET_USE_KRAIT_PLD_SET := true TARGET_KRAIT_BIONIC_PLDOFFS := 10 TARGET_KRAIT_BIONIC_PLDTHRESH := 10 TARGET_KRAIT_BIONIC_BBTHRESH := 64 TARGET_KRAIT_BIONIC_PLDSIZE := 64 Change-Id: Iee66f7698dc301507a012e27c91141f3f6925dcb
-rw-r--r--libc/Android.mk24
-rw-r--r--libc/arch-arm/bionic/memcpy.S157
-rw-r--r--libc/arch-arm/bionic/memmove.S174
3 files changed, 348 insertions, 7 deletions
diff --git a/libc/Android.mk b/libc/Android.mk
index e3e7a17..f1b6072 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -269,7 +269,6 @@ libc_common_src_files := \
bionic/libc_init_common.c \
bionic/logd_write.c \
bionic/md5.c \
- bionic/memmove_words.c \
bionic/pututline.c \
bionic/realpath.c \
bionic/sched_getaffinity.c \
@@ -381,11 +380,18 @@ endif
# current ARM version
ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
libc_common_src_files += \
+ arch-arm/bionic/memmove.S \
+ bionic/memmove_words.c
+else
+ ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true)
+ libc_common_src_files += \
arch-arm/bionic/memmove.S
-else # Non-Scorpion-based ARM
-libc_common_src_files += \
+ else # Other ARM
+ libc_common_src_files += \
string/bcopy.c \
- string/memmove.c.arm
+ string/memmove.c.arm \
+ bionic/memmove_words.c
+ endif # !TARGET_USE_KRAIT_BIONIC_OPTIMIZATION
endif # !TARGET_USE_SCORPION_BIONIC_OPTIMIZATION
# These files need to be arm so that gdbserver
@@ -509,6 +515,16 @@ ifeq ($(TARGET_ARCH),arm)
ifeq ($(TARGET_HAVE_TEGRA_ERRATA_657451),true)
libc_common_cflags += -DHAVE_TEGRA_ERRATA_657451
endif
+ # Add in defines to activate KRAIT_NEON_OPTIMIZATION
+ ifeq ($(TARGET_USE_KRAIT_BIONIC_OPTIMIZATION),true)
+ libc_common_cflags += -DKRAIT_NEON_OPTIMIZATION
+ ifeq ($(TARGET_USE_KRAIT_PLD_SET),true)
+ libc_common_cflags += -DPLDOFFS=$(TARGET_KRAIT_BIONIC_PLDOFFS)
+ libc_common_cflags += -DPLDTHRESH=$(TARGET_KRAIT_BIONIC_PLDTHRESH)
+ libc_common_cflags += -DPLDSIZE=$(TARGET_KRAIT_BIONIC_PLDSIZE)
+ libc_common_cflags += -DBBTHRESH=$(TARGET_KRAIT_BIONIC_BBTHRESH)
+ endif
+ endif
ifeq ($(TARGET_CORTEX_CACHE_LINE_32),true)
libc_common_cflags += -DCORTEX_CACHE_LINE_32
endif
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index 4a9aac8..e26620b 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -32,7 +32,162 @@
#include <machine/asm.h>
#if defined(__ARM_NEON__)
-#if defined(SCORPION_NEON_OPTIMIZATION)
+#if defined(KRAIT_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#ifndef BBTHRESH
+#define BBTHRESH (4096/64)
+#endif
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
+ .text
+ .fpu neon
+ .global memcpy
+ .type memcpy, %function
+ .align 5
+memcpy:
+ stmfd sp!, {r0, r9, r10, lr}
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16
+ cmp r2, #64
+ blt .Lneon_copy_32_a
+
+ mov r12, r2, lsr #6
+ cmp r12, #PLDTHRESH
+ ble .Lneon_copy_64_loop_nopld
+
+ cmp r12, #BBTHRESH
+ ble .Lneon_prime_pump
+
+ add lr, r0, #0x400
+ add r9, r1, #(PLDOFFS*PLDSIZE)
+ sub lr, lr, r9
+ lsl lr, lr, #21
+ lsr lr, lr, #21
+ add lr, lr, #(PLDOFFS*PLDSIZE)
+ cmp r12, lr, lsr #6
+ movle lr, #(PLDOFFS*PLDSIZE)
+
+ movgt r9, #(PLDOFFS)
+ rsbgts r9, r9, lr, lsr #6
+ ble .Lneon_prime_pump
+
+ add r10, r1, lr
+ bic r10, #0x3F
+
+ sub r12, lr, lsr #6
+ cmp r9, r12
+ suble r12, r12, r9
+ movgt r9, r12
+ movgt r12, #0
+
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer_doublepld:
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer_doublepld
+ cmp r12, #0
+ bne .Lneon_copy_64_loop_outer
+ mov r12, lr, lsr #6
+ b .Lneon_copy_64_loop_nopld
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_prime_pump:
+ mov lr, #(PLDOFFS*PLDSIZE)
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F
+ sub r12, r12, #PLDOFFS
+ pld [r10, #(-1*PLDSIZE)]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_outer:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .Lneon_copy_64_loop_outer
+ mov r12, lr, lsr #6
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ beq .Lneon_exit
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_copy_32_a:
+ movs r12, r2, lsl #27
+ bcc .Lneon_16
+ vld1.32 {q0,q1}, [r1]!
+ vst1.32 {q0,q1}, [r0]!
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_16:
+ bpl .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f
+ beq .Lneon_exit
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt16:
+ movs r12, r2, lsl #29
+ ldrcs r3, [r1], #4
+ ldrcs r12, [r1], #4
+ strcs r3, [r0], #4
+ strcs r12, [r0], #4
+ ldrmi r3, [r1], #4
+ strmi r3, [r0], #4
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_lt4:
+ movs r2, r2, lsl #31
+ ldrcsh r3, [r1], #2
+ strcsh r3, [r0], #2
+ ldrmib r12, [r1]
+ strmib r12, [r0]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_exit:
+ ldmfd sp!, {r0, r9, r10, lr}
+ bx lr
+ .end
+#elif defined(SCORPION_NEON_OPTIMIZATION)
/*
* These can be overridden in:
* device/<vendor>/<board>/BoardConfig.mk
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
index 1234195..a792680 100644
--- a/libc/arch-arm/bionic/memmove.S
+++ b/libc/arch-arm/bionic/memmove.S
@@ -1,5 +1,5 @@
/***************************************************************************
- Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved.
+ Copyright (c) 2009-2012 Code Aurora Forum. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
@@ -37,7 +37,177 @@
#include <machine/cpu-features.h>
-#if defined(SCORPION_NEON_OPTIMIZATION)
+#if defined(KRAIT_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+#define NOP_OPCODE (0xe320f000)
+
+ .code 32
+ .align 5
+ .global memmove
+ .type memmove, %function
+
+ .global _memmove_words
+ .type _memmove_words, %function
+
+ .global bcopy
+ .type bcopy, %function
+
+bcopy:
+ mov r12, r0
+ mov r0, r1
+ mov r1, r12
+ .balignl 64, NOP_OPCODE, 4*2
+memmove:
+_memmove_words:
+.Lneon_memmove_cmf:
+ subs r12, r0, r1
+ bxeq lr
+ cmphi r2, r12
+ bls memcpy /* Use memcpy for non-overlapping areas */
+
+ push {r0}
+
+.Lneon_back_to_front_copy:
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #4
+ bgt .Lneon_b2f_gt4
+ cmp r2, #0
+.Lneon_b2f_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ b .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+ sub r3, r0, r1
+ cmp r2, r3
+ movle r12, r2
+ movgt r12, r3
+ cmp r12, #64
+ bge .Lneon_b2f_copy_64
+ cmp r12, #32
+ bge .Lneon_b2f_copy_32
+ cmp r12, #8
+ bge .Lneon_b2f_copy_8
+ cmp r12, #4
+ bge .Lneon_b2f_copy_4
+ b .Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+ sub r1, r1, #64 /* Predecrement */
+ sub r0, r0, #64
+ movs r12, r2, lsr #6
+ cmp r12, #PLDTHRESH
+ ble .Lneon_b2f_copy_64_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #-(PLDOFFS-5)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-4)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-3)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-2)*PLDSIZE]
+ pld [r1, #-(PLDOFFS-1)*PLDSIZE]
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_outer:
+ pld [r1, #-(PLDOFFS)*PLDSIZE]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ sub r1, r1, #96 /* Post-fixup and predecrement */
+ vst1.32 {q2, q3}, [r0]
+ sub r0, r0, #96
+ bne .Lneon_b2f_copy_64_loop_outer
+ mov r12, #PLDOFFS
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ sub r1, r1, #96 /* Post-fixup and predecrement */
+ vst1.32 {q10, q11}, [r0]
+ sub r0, r0, #96
+ bne .Lneon_b2f_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ beq .Lneon_memmove_done
+ add r1, r1, #64 /* Post-fixup */
+ add r0, r0, #64
+ cmp r2, #32
+ blt .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+ sub r1, r1, #32 /* Predecrement */
+ sub r0, r0, #32
+ vld1.32 {q0,q1}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]
+ bne .Lneon_b2f_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ beq .Lneon_b2f_copy_4
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8 /* Predecrement */
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+ movs r12, r2, lsr #0x2
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+ ldr r3, [r1, #-4]!
+ subs r12, r12, #1
+ str r3, [r0, #-4]!
+ bne .Lneon_b2f_copy_4_loop
+ ands r2, r2, #0x3
+.Lneon_b2f_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+ .balignl 64, NOP_OPCODE, 4*2
+.Lneon_b2f_copy_1_loop:
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ bne .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+ pop {r0}
+ bx lr
+
+ .end
+
+#elif defined(SCORPION_NEON_OPTIMIZATION)
/*
* These can be overridden in:
* device/<vendor>/<board>/BoardConfig.mk