summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libc/arch-arm/bionic/memcmp.S81
1 files changed, 12 insertions, 69 deletions
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
index 781c4f8..c872a51 100644
--- a/libc/arch-arm/bionic/memcmp.S
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -1,6 +1,5 @@
/*
- * Copyright (C) 2008, 2011 The Android Open Source Project
- * Copyright (C) 2010 ST-Ericsson SA
+ * Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -31,71 +30,43 @@
#include <machine/asm.h>
/*
- * Optimized memcmp() for ARM9 and Cortex-A9
+ * Optimized memcmp() for ARM9.
+ * This would not be optimal on XScale or ARM11, where more prefetching
+ * and use of PLD will be needed.
+ * The 2 major optimizations here are
+ * (1) The main loop compares 16 bytes at a time
+ * (2) The loads are scheduled in a way they won't stall
*/
-#if __ARM_ARCH__ >= 7
-#define __ARM_CORTEX
-
-#if defined(CORTEX_CACHE_LINE_32)
-#define CACHE_LINE_SIZE 32
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-
-#endif /* __ARM_ARCH__ */
-
ENTRY(memcmp)
-#if defined(__ARM_CORTEX)
- pld [r0, #(CACHE_LINE_SIZE * 0)]
- pld [r0, #(CACHE_LINE_SIZE * 1)]
-#else
PLD (r0, #0)
PLD (r1, #0)
-#endif
/* take of the case where length is 0 or the buffers are the same */
cmp r0, r1
-#if !defined(__ARM_CORTEX)
cmpne r2, #0
-#endif
moveq r0, #0
bxeq lr
-#if defined(__ARM_CORTEX)
- pld [r1, #(CACHE_LINE_SIZE * 0)]
- pld [r1, #(CACHE_LINE_SIZE * 1)]
-
- /* make sure we have at least 8+4 bytes, this simplify things below
- * and avoid some overhead for small blocks
- */
- cmp r2, #(8+4)
- bmi 10f
-#endif /* __ARM_CORTEX */
-
.save {r4, lr}
/* save registers */
stmfd sp!, {r4, lr}
-
-#if !defined(__ARM_CORTEX)
+
PLD (r0, #32)
PLD (r1, #32)
-#endif
/* since r0 hold the result, move the first source
* pointer somewhere else
*/
mov r4, r0
-
-#if !defined(__ARM_CORTEX)
+
/* make sure we have at least 8+4 bytes, this simplify things below
* and avoid some overhead for small blocks
*/
cmp r2, #(8+4)
bmi 8f
-#endif
-
+
/* align first pointer to word boundary
* offset = -src & 3
*/
@@ -132,14 +103,8 @@ ENTRY(memcmp)
subs r2, r2, #(32 + 4)
bmi 1f
-0:
-#if defined(__ARM_CORTEX)
- pld [r4, #(CACHE_LINE_SIZE * 2)]
- pld [r1, #(CACHE_LINE_SIZE * 2)]
-#else
- PLD (r4, #64)
+0: PLD (r4, #64)
PLD (r1, #64)
-#endif
ldr r0, [r4], #4
ldr lr, [r1, #4]!
eors r0, r0, ip
@@ -205,22 +170,6 @@ ENTRY(memcmp)
9: /* restore registers and return */
ldmfd sp!, {r4, lr}
bx lr
-
-#if defined(__ARM_CORTEX)
-10: /* process less than 12 bytes */
- cmp r2, #0
- moveq r0, #0
- bxeq lr
- mov r3, r0
-11:
- ldrb r0, [r3], #1
- ldrb ip, [r1], #1
- subs r0, ip
- bxne lr
- subs r2, r2, #1
- bne 11b
- bx lr
-#endif /* __ARM_CORTEX */
END(memcmp)
@@ -243,14 +192,8 @@ END(memcmp)
bic r1, r1, #3
ldr lr, [r1], #4
-6:
-#if defined(__ARM_CORTEX)
- pld [r1, #(CACHE_LINE_SIZE * 2)]
- pld [r4, #(CACHE_LINE_SIZE * 2)]
-#else
- PLD (r1, #64)
+6: PLD (r1, #64)
PLD (r4, #64)
-#endif
mov ip, lr, lsr #16
ldr lr, [r1], #4
ldr r0, [r4], #4