Diffstat (limited to 'libc/arch-arm/bionic/memcmp.S')
-rw-r--r--  libc/arch-arm/bionic/memcmp.S  285
1 file changed, 285 insertions(+), 0 deletions(-)
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
new file mode 100644
index 0000000..f45b56b
--- /dev/null
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -0,0 +1,285 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+
+ .text
+
+ .global memcmp
+ .type memcmp, %function
+ .align 4
+
+/*
+ * Optimized memcmp() for ARM9.
+ * This would not be optimal on XScale or ARM11, where more prefetching
+ * and use of PLD would be needed.
+ * The 2 major optimizations here are:
+ * (1) The main loop compares 32 bytes at a time
+ * (2) The loads are scheduled so that they don't stall
+ */
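+
+/* On entry (AAPCS): r0 = first buffer, r1 = second buffer, r2 = byte count.
+ * Returns in r0 the difference between the first pair of mismatching bytes
+ * (loaded zero-extended), or 0 if the buffers compare equal.
+ */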
+
+memcmp:
+ PLD (r0, #0)
+ PLD (r1, #0)
+
+ /* take care of the case where the length is 0 or the buffers are the same */
+ cmp r0, r1
+ cmpne r2, #0
+ moveq r0, #0
+ bxeq lr
+
+ /* save registers */
+ stmfd sp!, {r4, lr}
+
+ PLD (r0, #32)
+ PLD (r1, #32)
+
+ /* since r0 holds the result, move the first source
+ * pointer somewhere else
+ */
+
+ mov r4, r0
+
+ /* make sure we have at least 8+4 bytes; this simplifies things below
+ * and avoids some overhead for small blocks
+ */
+ cmp r2, #(8+4)
+ bmi 8f
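+ /* (alignment below consumes at most 3 bytes, leaving at least 9 for the word loops) */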
+
+ /* align first pointer to word boundary
+ * offset = -src & 3
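+ * e.g. if src % 4 == 2, the offset is (-src) & 3 = 2, and the two
+ * leading bytes are compared by the byte loop below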
+ */
+ rsb r3, r4, #0
+ ands r3, r3, #3
+ beq 0f
+
+ /* align first pointer */
+ sub r2, r2, r3
+1: ldrb r0, [r4], #1
+ ldrb ip, [r1], #1
+ subs r0, r0, ip
+ bne 9f
+ subs r3, r3, #1
+ bne 1b
+
+
+0: /* here the first pointer is aligned, and we have at least 4 bytes
+ * to process.
+ */
+
+ /* see if the pointers are congruent (same alignment modulo 4) */
+ eor r0, r4, r1
+ ands r0, r0, #3
+ bne 5f
+
+ /* congruent case, 32 bytes per iteration
+ * We need to make sure there are at least 32+4 bytes left
+ * because we effectively read ahead one word, and we could
+ * read past the buffer (and segfault) if we're not careful.
+ */
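+
+ /* rhs words alternate between ip and lr, so every eor operand was
+ * loaded at least two instructions earlier and the ARM9 load-use
+ * interlock is never hit
+ */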
+
+ ldr ip, [r1]
+ subs r2, r2, #(32 + 4)
+ bmi 1f
+
+0: PLD (r4, #64)
+ PLD (r1, #64)
+ ldr r0, [r4], #4
+ ldr lr, [r1, #4]!
+ eors r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ ldreq r0, [r4], #4
+ ldreq lr, [r1, #4]!
+ eoreqs r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ ldreq r0, [r4], #4
+ ldreq lr, [r1, #4]!
+ eoreqs r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ ldreq r0, [r4], #4
+ ldreq lr, [r1, #4]!
+ eoreqs r0, r0, ip
+ ldreq r0, [r4], #4
+ ldreq ip, [r1, #4]!
+ eoreqs r0, r0, lr
+ bne 2f
+ subs r2, r2, #32
+ bhs 0b
+
+ /* do we have at least 4 bytes left? */
+1: adds r2, r2, #(32 - 4 + 4)
+ bmi 4f
+
+ /* finish off 4 bytes at a time */
+3: ldr r0, [r4], #4
+ ldr ip, [r1], #4
+ eors r0, r0, ip
+ bne 2f
+ subs r2, r2, #4
+ bhs 3b
+
+ /* are we done? */
+4: adds r2, r2, #4
+ moveq r0, #0
+ beq 9f
+
+ /* finish off the remaining bytes */
+ b 8f
+
+2: /* the last 4 bytes compared differ: rewind both pointers and redo them byte by byte */
+ sub r4, r4, #4
+ sub r1, r1, #4
+ mov r2, #4
+
+ /* process the last few bytes */
+8: ldrb r0, [r4], #1
+ ldrb ip, [r1], #1
+ // stall: subs uses ip in the cycle right after its load
+ subs r0, r0, ip
+ bne 9f
+ subs r2, r2, #1
+ bne 8b
+
+9: /* restore registers and return */
+ ldmfd sp!, {r4, lr}
+ bx lr
+
+
+
+
+
+5: /*************** non-congruent case ***************/
+ and r0, r1, #3
+ cmp r0, #2
+ bne 4f
+
+ /* here, offset is 2 (16-bit aligned, special-cased) */
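+ /* with the rhs word-aligned below, each unaligned rhs word is rebuilt
+ * from two aligned loads as (prev >> 16) | (next << 16), assuming
+ * little-endian byte order
+ */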
+
+ /* make sure we have at least 16 bytes to process */
+ subs r2, r2, #16
+ addmi r2, r2, #16
+ bmi 8b
+
+ /* align the unaligned pointer */
+ bic r1, r1, #3
+ ldr lr, [r1], #4
+
+6: PLD (r1, #64)
+ PLD (r4, #64)
+ mov ip, lr, lsr #16
+ ldr lr, [r1], #4
+ ldr r0, [r4], #4
+ orr ip, ip, lr, lsl #16
+ eors r0, r0, ip
+ moveq ip, lr, lsr #16
+ ldreq lr, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, lr, lsl #16
+ eoreqs r0, r0, ip
+ moveq ip, lr, lsr #16
+ ldreq lr, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, lr, lsl #16
+ eoreqs r0, r0, ip
+ moveq ip, lr, lsr #16
+ ldreq lr, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, lr, lsl #16
+ eoreqs r0, r0, ip
+ bne 7f
+ subs r2, r2, #16
+ bhs 6b
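+ /* r1 ran 2 bytes past the rhs position (aligned base plus one-word
+ * read-ahead); back it up for the byte loop
+ */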
+ sub r1, r1, #2
+ /* are we done? */
+ adds r2, r2, #16
+ moveq r0, #0
+ beq 9b
+ /* finish off the remaining bytes */
+ b 8b
+
+7: /* fix up the two pointers and fall through... */
+ sub r1, r1, #(4+2)
+ sub r4, r4, #4
+ mov r2, #4
+ b 8b
+
+
+4: /*************** offset is 1 or 3 (less optimized) ***************/
+
+ stmfd sp!, {r5, r6, r7}
+
+ // r5 = right-shift amount (offset * 8)
+ // r6 = left-shift amount (32 - offset * 8)
+ // r7 = rhs word carried between iterations
+
+ mov r5, r0, lsl #3 /* r5 = right shift */
+ rsb r6, r5, #32 /* r6 = left shift */
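+
+ /* an unaligned rhs word is then (prev >> r5) | (next << r6), e.g. for
+ * offset 1: r5 = 8, r6 = 24 (little-endian byte order assumed)
+ */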
+
+ /* align the unaligned pointer */
+ bic r1, r1, #3
+ ldr r7, [r1], #4
+ sub r2, r2, #8
+
+6: mov ip, r7, lsr r5
+ ldr r7, [r1], #4
+ ldr r0, [r4], #4
+ orr ip, ip, r7, lsl r6
+ eors r0, r0, ip
+ moveq ip, r7, lsr r5
+ ldreq r7, [r1], #4
+ ldreq r0, [r4], #4
+ orreq ip, ip, r7, lsl r6
+ eoreqs r0, r0, ip
+ bne 7f
+ subs r2, r2, #8
+ bhs 6b
+
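+ /* r1 ran (4 - offset) bytes past the rhs position; r6 >> 3 == 4 - offset */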
+ sub r1, r1, r6, lsr #3
+ ldmfd sp!, {r5, r6, r7}
+
+ /* are we done? */
+ adds r2, r2, #8
+ moveq r0, #0
+ beq 9b
+
+ /* finish off the remaining bytes */
+ b 8b
+
+7: /* fix up the two pointers and fall through... */
+ sub r1, r1, #4
+ sub r1, r1, r6, lsr #3
+ sub r4, r4, #4
+ mov r2, #4
+ ldmfd sp!, {r5, r6, r7}
+ b 8b
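
For reference, a minimal C sketch of the observable behavior the routine above
implements (not part of the commit; ref_memcmp is an illustrative name — the
word-at-a-time fast paths must produce the same result as this byte loop):

#include <stddef.h>

/* Illustrative only: same observable semantics as the assembly above. */
int ref_memcmp(const void *lhs, const void *rhs, size_t n)
{
    const unsigned char *a = lhs;
    const unsigned char *b = rhs;

    /* mirrors the early exit: identical pointers or n == 0 compare equal */
    if (a == b || n == 0)
        return 0;

    while (n--) {
        int d = (int)*a++ - (int)*b++;  /* difference of unsigned bytes */
        if (d != 0)
            return d;
    }
    return 0;
}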