author | Pat Galizia <pgalizia@codeaurora.org> | 2011-01-25 16:51:08 -0500 |
---|---|---|
committer | Pat Galizia <pgalizia@codeaurora.org> | 2011-02-25 14:58:19 -0500 |
commit | 77e312b6535456180f3b77a9d998a58d192ac07f (patch) | |
tree | af045105f8ba3aa57df75483654624f71ef53560 /libc/arch-arm/bionic/memcpy.S | |
parent | eed8d08b6897f8be61a8093db2b7f0e34fa23abd (diff) | |
download | bionic-77e312b6535456180f3b77a9d998a58d192ac07f.zip bionic-77e312b6535456180f3b77a9d998a58d192ac07f.tar.gz bionic-77e312b6535456180f3b77a9d998a58d192ac07f.tar.bz2 |
Modify Android mem* routines with CodeAurora versions.
Update the memcpy, memmove, and memset routines to use the
versions from CodeAurora when specified in the bionic/Android.mk
file (actually activated in the BoardConfig.mk file under
device/<vendor>/<board>). With this change, the CodeAurora mem* routines
are used only on the msm8660, while other platforms continue to use the
current Android mem* routines.
Future platforms can modify the makefile to use the CodeAurora-based
mem* routines as desired. This has the benefit of making the CodeAurora-
based routines opt-in instead of opt-out.
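As a concrete illustration of the opt-in, a board that wants the CodeAurora
routines might add the following to its device/<vendor>/<board>/BoardConfig.mk.
The variable name is taken from the comment block added to memcpy.S in this
patch; the surrounding file layout, and the idea that this single line is all
a board needs, are assumptions rather than something this diff shows.

```make
# Hypothetical BoardConfig.mk fragment (sketch): opt a board in to the
# CodeAurora-based bionic mem* routines. The variable name comes from the
# comment added to memcpy.S; everything else about this file is assumed,
# not part of the patch.
TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
```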
Also, PLDSIZE and PLDOFFS can be specified in the BoardConfig.mk as well,
so other platforms with different PLD tunings can use the same code
without modifying the source file itself.
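Below is a sketch of how a board might override the PLD tuning and how
bionic/Android.mk could forward those values to the assembler as preprocessor
defines. The define names (SCORPION_NEON_OPTIMIZATION, PLDOFFS, PLDSIZE) and
the TARGET_* variables appear in the patched memcpy.S, but the makefile
conditional, the libc_common_cflags variable, and the example tuning values
are assumptions for illustration only.

```make
# BoardConfig.mk (sketch): hypothetical PLD tuning for a board whose L2
# prefetch behavior differs from the msm8660 defaults (PLDOFFS=6,
# PLDSIZE=128). The offset value 8 is an arbitrary example.
TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
TARGET_USE_SCORPION_PLD_SET := true
TARGET_SCORPION_BIONIC_PLDOFFS := 8
TARGET_SCORPION_BIONIC_PLDSIZE := 128

# bionic/Android.mk (sketch): turn the board settings into the defines
# consumed by memcpy.S. The real conditional may be structured differently;
# this only illustrates the intended flow.
ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
  libc_common_cflags += -DSCORPION_NEON_OPTIMIZATION
  ifeq ($(TARGET_USE_SCORPION_PLD_SET),true)
    libc_common_cflags += -DPLDOFFS=$(TARGET_SCORPION_BIONIC_PLDOFFS) \
                          -DPLDSIZE=$(TARGET_SCORPION_BIONIC_PLDSIZE)
  endif
endif
```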
Tests with FileCycler-0.3 showed a slight 1.1% improvement from these
routines on an 8660v2, based on the average of three FileCycler runs with
and without the patch. Since the min/max score ranges of the two
configurations did not overlap and the average improved, these
modifications are worth considering for upstreaming.
Change-Id: I5607fe6f116f8fd2b2bd5ade151778eb0d5a3663
Diffstat (limited to 'libc/arch-arm/bionic/memcpy.S')
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 124 |
1 files changed, 111 insertions, 13 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index b8d1007..e92ff5e 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -29,7 +31,114 @@
 #include <machine/cpu-features.h>

 #if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+	 * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+	.code 32
+	.align 5
+	.globl memcpy
+	.func
+memcpy:
+	push		{r0}
+	cmp		r2, #4
+	blt		.Lneon_lt4
+	cmp		r2, #16
+	blt		.Lneon_lt16
+	cmp		r2, #32
+	blt		.Lneon_16
+	cmp		r2, #128
+	blt		.Lneon_copy_32_a
+	/* Copy blocks of 128-bytes (word-aligned) at a time*/
+	/* Code below is optimized for PLDSIZE=128 only */
+	mov		r12, r2, lsr #7
+	cmp		r12, #PLDOFFS
+	ble		.Lneon_copy_128_loop_nopld
+	sub		r12, #PLDOFFS
+	pld		[r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+	pld		[r1, #(PLDOFFS*PLDSIZE)]
+	vld1.32		{q0, q1}, [r1]!
+	vld1.32		{q2, q3}, [r1]!
+	vld1.32		{q8, q9}, [r1]!
+	vld1.32		{q10, q11}, [r1]!
+	subs		r12, r12, #1
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q2, q3}, [r0]!
+	vst1.32		{q8, q9}, [r0]!
+	vst1.32		{q10, q11}, [r0]!
+	bne		.Lneon_copy_128_loop_outer
+	mov		r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+	vld1.32		{q0, q1}, [r1]!
+	vld1.32		{q2, q3}, [r1]!
+	vld1.32		{q8, q9}, [r1]!
+	vld1.32		{q10, q11}, [r1]!
+	subs		r12, r12, #1
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q2, q3}, [r0]!
+	vst1.32		{q8, q9}, [r0]!
+	vst1.32		{q10, q11}, [r0]!
+	bne		.Lneon_copy_128_loop_nopld
+	ands		r2, r2, #0x7f
+	beq		.Lneon_exit
+	cmp		r2, #32
+	blt		.Lneon_16
+	nop
+	/* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+	mov		r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+	vld1.32		{q0,q1}, [r1]!
+	subs		r12, r12, #1
+	vst1.32		{q0,q1}, [r0]!
+	bne		.Lneon_copy_32_loop_a
+	ands		r2, r2, #0x1f
+	beq		.Lneon_exit
+.Lneon_16:
+	subs		r2, r2, #16
+	blt		.Lneon_lt16
+	vld1.32		{q8}, [r1]!
+	vst1.32		{q8}, [r0]!
+	beq		.Lneon_exit
+.Lneon_lt16:
+	movs		r12, r2, lsl #29
+	bcc		.Lneon_skip8
+	ldr		r3, [r1], #4
+	ldr		r12, [r1], #4
+	str		r3, [r0], #4
+	str		r12, [r0], #4
+.Lneon_skip8:
+	bpl		.Lneon_lt4
+	ldr		r3, [r1], #4
+	str		r3, [r0], #4
+.Lneon_lt4:
+	movs		r2, r2, lsl #31
+	bcc		.Lneon_lt2
+	ldrh		r3, [r1], #2
+	strh		r3, [r0], #2
+.Lneon_lt2:
+	bpl		.Lneon_exit
+	ldrb		r12, [r1]
+	strb		r12, [r0]
+.Lneon_exit:
+	pop		{r0}
+	bx		lr
+	.endfunc
+	.end
+#else /* !SCORPION_NEON_OPTIMIZATION */
         .text
         .fpu    neon

@@ -145,7 +254,7 @@ memcpy:
         bx      lr
 .fnend

-
+#endif /* !SCORPION_NEON_OPTIMIZATION */

 #else   /* __ARM_ARCH__ < 7 */

@@ -260,31 +369,20 @@ cached_aligned32:
          *
          */

-#if __ARM_ARCH__ == 5
         // Align the preload register to a cache-line because the cpu does
         // "critical word first" (the first word requested is loaded first).
         bic     r12, r1, #0x1F
         add     r12, r12, #64
-#endif

 1:      ldmia   r1!, { r4-r11 }
-
-#if __ARM_ARCH__ == 5
         PLD     (r12, #64)
-#else
-        PLD     (r1, #64)
-#endif
-
         subs    r2, r2, #32
-#if __ARM_ARCH__ == 5
         // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
         // for ARM9 preload will not be safely guarded by the preceding subs.
         // When it is safely guarded the only possibility to have SIGSEGV here
         // is because the caller overstates the length.
         ldrhi   r3, [r12], #32      /* cheap ARM9 preload */
-#endif
-
         stmia   r0!, { r4-r11 }
         bhs     1b