From f585de254efe9ecada3299ecba1a21ccb60dacef Mon Sep 17 00:00:00 2001 From: Steve Kondik Date: Thu, 7 Feb 2013 12:34:01 -0800 Subject: Revert "memcmp: prefetch optimizing for ARM Cortex-A8/A9" This reverts commit 579dba196255d3f2db7664cfba2db4cb86f59aa9. --- libc/arch-arm/bionic/memcmp.S | 81 +++++++------------------------------------ 1 file changed, 12 insertions(+), 69 deletions(-) (limited to 'libc/arch-arm') diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S index 781c4f8..c872a51 100644 --- a/libc/arch-arm/bionic/memcmp.S +++ b/libc/arch-arm/bionic/memcmp.S @@ -1,6 +1,5 @@ /* - * Copyright (C) 2008, 2011 The Android Open Source Project - * Copyright (C) 2010 ST-Ericsson SA + * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,71 +30,43 @@ #include /* - * Optimized memcmp() for ARM9 and Cortex-A9 + * Optimized memcmp() for ARM9. + * This would not be optimal on XScale or ARM11, where more prefetching + * and use of PLD will be needed. 
+ * The 2 major optimizations here are + * (1) The main loop compares 16 bytes at a time + * (2) The loads are scheduled in a way they won't stall */ -#if __ARM_ARCH__ >= 7 -#define __ARM_CORTEX - -#if defined(CORTEX_CACHE_LINE_32) -#define CACHE_LINE_SIZE 32 -#else -#define CACHE_LINE_SIZE 64 -#endif - -#endif /* __ARM_ARCH__ */ - ENTRY(memcmp) -#if defined(__ARM_CORTEX) - pld [r0, #(CACHE_LINE_SIZE * 0)] - pld [r0, #(CACHE_LINE_SIZE * 1)] -#else PLD (r0, #0) PLD (r1, #0) -#endif /* take of the case where length is 0 or the buffers are the same */ cmp r0, r1 -#if !defined(__ARM_CORTEX) cmpne r2, #0 -#endif moveq r0, #0 bxeq lr -#if defined(__ARM_CORTEX) - pld [r1, #(CACHE_LINE_SIZE * 0)] - pld [r1, #(CACHE_LINE_SIZE * 1)] - - /* make sure we have at least 8+4 bytes, this simplify things below - * and avoid some overhead for small blocks - */ - cmp r2, #(8+4) - bmi 10f -#endif /* __ARM_CORTEX */ - .save {r4, lr} /* save registers */ stmfd sp!, {r4, lr} - -#if !defined(__ARM_CORTEX) + PLD (r0, #32) PLD (r1, #32) -#endif /* since r0 hold the result, move the first source * pointer somewhere else */ mov r4, r0 - -#if !defined(__ARM_CORTEX) + /* make sure we have at least 8+4 bytes, this simplify things below * and avoid some overhead for small blocks */ cmp r2, #(8+4) bmi 8f -#endif - /* align first pointer to word boundary * offset = -src & 3 */ @@ -132,14 +103,8 @@ ENTRY(memcmp) subs r2, r2, #(32 + 4) bmi 1f -0: -#if defined(__ARM_CORTEX) - pld [r4, #(CACHE_LINE_SIZE * 2)] - pld [r1, #(CACHE_LINE_SIZE * 2)] -#else - PLD (r4, #64) +0: PLD (r4, #64) PLD (r1, #64) -#endif ldr r0, [r4], #4 ldr lr, [r1, #4]! 
eors r0, r0, ip @@ -205,22 +170,6 @@ ENTRY(memcmp) 9: /* restore registers and return */ ldmfd sp!, {r4, lr} bx lr - -#if defined(__ARM_CORTEX) -10: /* process less than 12 bytes */ - cmp r2, #0 - moveq r0, #0 - bxeq lr - mov r3, r0 -11: - ldrb r0, [r3], #1 - ldrb ip, [r1], #1 - subs r0, ip - bxne lr - subs r2, r2, #1 - bne 11b - bx lr -#endif /* __ARM_CORTEX */ END(memcmp) @@ -243,14 +192,8 @@ END(memcmp) bic r1, r1, #3 ldr lr, [r1], #4 -6: -#if defined(__ARM_CORTEX) - pld [r1, #(CACHE_LINE_SIZE * 2)] - pld [r4, #(CACHE_LINE_SIZE * 2)] -#else - PLD (r1, #64) +6: PLD (r1, #64) PLD (r4, #64) -#endif mov ip, lr, lsr #16 ldr lr, [r1], #4 ldr r0, [r4], #4 -- cgit v1.1