4 files changed, 573 insertions, 15 deletions
diff --git a/libc/Android.mk b/libc/Android.mk
index 9e6bdfb..dcd7ba4 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -355,12 +355,21 @@ libc_common_src_files += \
 	arch-arm/bionic/sigsetjmp.S \
 	arch-arm/bionic/strlen.c.arm \
 	arch-arm/bionic/syscall.S \
-	string/memmove.c.arm \
-	string/bcopy.c \
 	string/strcmp.c \
 	string/strncmp.c \
 	unistd/socketcalls.c
 
+# Scorpion targets take memmove and bcopy from the NEON assembly in
+# arch-arm/bionic/memmove.S; every other ARM target keeps the C versions
+ifneq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+libc_common_src_files += \
+	string/bcopy.c \
+	string/memmove.c.arm
+else # Scorpion-based ARM
+libc_common_src_files += \
+	arch-arm/bionic/memmove.S
+endif # TARGET_USE_SCORPION_BIONIC_OPTIMIZATION
+
 # These files need to be arm so that gdbserver
 # can set breakpoints in them without messing
 # up any thumb code.
@@ -494,6 +503,14 @@ ifeq ($(TARGET_ARCH),arm)
   ifeq ($(ARCH_ARM_HAVE_TLS_REGISTER),true)
     libc_common_cflags += -DHAVE_ARM_TLS_REGISTER
   endif
+  # Scorpion builds select the NEON string routines; PLD tuning is optional
+  ifeq ($(TARGET_USE_SCORPION_BIONIC_OPTIMIZATION),true)
+    libc_common_cflags += -DSCORPION_NEON_OPTIMIZATION
+    ifeq ($(TARGET_USE_SCORPION_PLD_SET),true)
+      libc_common_cflags += -DPLDOFFS=$(TARGET_SCORPION_BIONIC_PLDOFFS) \
+                            -DPLDSIZE=$(TARGET_SCORPION_BIONIC_PLDSIZE)
+    endif
+  endif
 else # !arm
   ifeq ($(TARGET_ARCH),x86)
     libc_crt_target_cflags := -m32
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index b8d1007..e92ff5e 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -29,7 +31,114 @@
 #include <machine/cpu-features.h>
 
 #if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+         * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+        .code 32
+        .align 5
+        .globl memcpy
+        .func
+memcpy:				/* r0 = dst, r1 = src, r2 = byte count */
+	push            {r0}		/* original dst is popped back into r0 on exit */
+	cmp             r2, #4
+	blt             .Lneon_lt4	/* r2 < 4: halfword/byte tail only */
+	cmp             r2, #16
+	blt             .Lneon_lt16	/* r2 < 16: word/halfword/byte tail */
+	cmp             r2, #32
+	blt             .Lneon_16	/* r2 < 32: one 16-byte copy plus tail */
+	cmp              r2, #128
+	blt              .Lneon_copy_32_a	/* r2 < 128: 32-byte loop plus tail */
+	/* Copy blocks of 128-bytes (word-aligned) at a time*/
+	/* Code below is optimized for PLDSIZE=128 only */
+	mov             r12, r2, lsr #7	/* r12 = number of whole 128-byte blocks */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_copy_128_loop_nopld	/* PLDOFFS blocks or fewer: skip the preloading loop */
+	sub             r12, #PLDOFFS	/* the final PLDOFFS blocks are left for the no-pld loop */
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]	/* preload PLDOFFS*PLDSIZE bytes ahead of src */
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1	/* flags consumed by the bne below; vst1 leaves them alone */
+	vst1.32	        {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_outer
+	mov             r12, #PLDOFFS	/* blocks still to copy, now without preloading */
+.Lneon_copy_128_loop_nopld:
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* r2 = bytes left after the 128-byte blocks */
+	beq             .Lneon_exit
+	cmp             r2, #32
+	blt             .Lneon_16
+	nop				/* no-op before falling into the 32-byte loop */
+	/* Copy blocks of 32-bytes (word aligned) at a time*/
+.Lneon_copy_32_a:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks (non-zero: r2 >= 32 here) */
+.Lneon_copy_32_loop_a:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_copy_32_loop_a
+	ands            r2, r2, #0x1f	/* r2 = bytes left after the 32-byte blocks */
+	beq             .Lneon_exit
+.Lneon_16:
+	subs            r2, r2, #16	/* low 4 bits of r2 are unchanged, so the tail tests below still work */
+	blt             .Lneon_lt16	/* fewer than 16 bytes were left */
+	vld1.32         {q8}, [r1]!
+	vst1.32         {q8}, [r0]!
+	beq             .Lneon_exit	/* flags still from the subs: exactly 16 bytes were left */
+.Lneon_lt16:
+	movs            r12, r2, lsl #29	/* C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes) */
+	bcc             .Lneon_skip8
+	ldr             r3, [r1], #4
+	ldr             r12, [r1], #4	/* r12 is reused as data; the flags are already set */
+	str             r3, [r0], #4
+	str             r12, [r0], #4
+.Lneon_skip8:
+	bpl             .Lneon_lt4	/* N clear: no 4-byte piece */
+	ldr             r3, [r1], #4
+	str             r3, [r0], #4
+.Lneon_lt4:
+	movs            r2, r2, lsl #31	/* C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte) */
+	bcc             .Lneon_lt2
+	ldrh            r3, [r1], #2
+	strh            r3, [r0], #2
+.Lneon_lt2:
+	bpl             .Lneon_exit	/* N clear: no final byte */
+	ldrb            r12, [r1]
+	strb            r12, [r0]
+.Lneon_exit:
+	pop             {r0}		/* return value: the dst pointer saved on entry */
+	bx              lr
+	.endfunc
+	.end
+#else /* !SCORPION_NEON_OPTIMIZATION */
         .text
         .fpu    neon
 
@@ -145,7 +254,7 @@ memcpy:
         bx          lr
         .fnend
 
-
+#endif  /* !SCORPION_NEON_OPTIMIZATION */
 #else   /* __ARM_ARCH__ < 7 */
 
 
@@ -260,31 +369,20 @@ cached_aligned32:
          *
          */
 
-#if __ARM_ARCH__ == 5
         // Align the preload register to a cache-line because the cpu does
         // "critical word first" (the first word requested is loaded first).
         bic         r12, r1, #0x1F
         add         r12, r12, #64
-#endif
 
 1:      ldmia       r1!, { r4-r11 }
-
-#if __ARM_ARCH__ == 5
         PLD         (r12, #64)
-#else
-        PLD         (r1, #64)
-#endif
-
         subs        r2, r2, #32
 
-#if __ARM_ARCH__ == 5
         // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
         // for ARM9 preload will not be safely guarded by the preceding subs.
         // When it is safely guarded the only possibility to have SIGSEGV here
         // is because the caller overstates the length.
         ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
-#endif
-
         stmia       r0!, { r4-r11 }
 		bhs         1b
 
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 0000000..d68b142
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S
@@ -0,0 +1,356 @@
+/***************************************************************************
+ Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of Code Aurora nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+  ***************************************************************************/
+
+/***************************************************************************
+ *  Neon memmove: Attempts to do a memmove with Neon registers if possible.
+ *     Inputs:
+ *        dest: The destination buffer
+ *        src: The source buffer
+ *        n: The number of bytes to transfer
+ *     Outputs:
+ *        The original dest pointer (saved on entry, restored into r0)
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	/*
+	 * These can be overridden in:
+	 *   device/<vendor>/<board>/BoardConfig.mk
+         * by setting the following:
+	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+	 *   TARGET_USE_SCORPION_PLD_SET := true
+	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+	 */
+#ifndef PLDOFFS
+#define PLDOFFS	(6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE	(128)	/* L2 cache line size */
+#endif
+
+	.code 32
+	.align 5
+	.global memmove
+	.type memmove, %function
+
+	.global bcopy
+	.type bcopy, %function
+
+bcopy:				/* bcopy(src, dst, n): swap r0/r1, then fall through into memmove(dst, src, n) */
+	mov		r12, r0		/* r12 = src (must be saved before r0 is overwritten) */
+	mov		r0, r1		/* r0  = dst */
+	mov		r1, r12		/* r1  = src */
+memmove:			/* r0 = dest, r1 = src, r2 = n; returns dest */
+	push            {r0}		/* keep dest so it can be returned in r0 */
+
+	/*
+	 * The requirements for memmove state that the function should
+	 * operate as if data were being copied from the source to a
+	 * buffer, then to the destination.  This is to allow a user
+	 * to copy data from a source and target that overlap.
+	 *
+	 * The copy direction therefore depends on the buffer order:
+	 * dest below src copies front-to-back, dest above src copies
+	 * back-to-front, and dest == src needs no copy at all.
+	 * Addresses are unsigned, so the branches use LO/HI; signed
+	 * LT/GT would pick the wrong direction when the two pointers
+	 * lie on opposite sides of 0x80000000.
+	 */
+.Lneon_memmove_cmf:
+	cmp             r0, r1
+	blo             .Lneon_front_to_back_copy
+	bhi             .Lneon_back_to_front_copy
+	b               .Lneon_memmove_done
+
+	/* #############################################################
+	 * Front to Back copy
+	 */
+.Lneon_front_to_back_copy:
+	/*
+	 * For small copies (4 bytes or fewer), just do a quick byte
+	 * copy.  We can do this for front-to-back copies, aligned or
+	 * unaligned, since we're only doing 1 byte at a time...
+	 */
+	cmp             r2, #4
+	bgt             .Lneon_f2b_gt4
+	cmp             r2, #0		/* sets Z for the first pass of the loop below */
+.Lneon_f2b_smallcopy_loop:
+	beq             .Lneon_memmove_done	/* Z comes from the cmp above, then from the subs */
+	ldrb            r12, [r1], #1
+	subs            r2, r2, #1
+	strb            r12, [r0], #1
+	b               .Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+	/* The window size (src - dest) is in r3. */
+	sub             r3, r1, r0
+	/* #############################################################
+	 * Front to Back copy: choose the starting transfer width
+	 */
+	/*
+	 * Note that we can't just route based on the size in r2.  If that's
+	 * larger than the overlap window in r3, we could potentially
+	 * (and likely!) destroy data we're copying.
+	 */
+	cmp             r2, r3
+	movle           r12, r2		/* r12 = min(r2, r3), using a signed compare */
+	movgt           r12, r3
+	cmp             r12, #256
+	bge             .Lneon_f2b_copy_128
+	cmp             r12, #64
+	bge             .Lneon_f2b_copy_32
+	cmp             r12, #16
+	bge             .Lneon_f2b_copy_16
+	cmp             r12, #8
+	bge             .Lneon_f2b_copy_8
+	cmp             r12, #4
+	bge             .Lneon_f2b_copy_4
+	b               .Lneon_f2b_copy_1
+	nop				/* not reached: follows an unconditional branch */
+.Lneon_f2b_copy_128:
+	mov             r12, r2, lsr #7	/* r12 = number of whole 128-byte blocks */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_f2b_copy_128_loop_nopld	/* PLDOFFS blocks or fewer: skip the preloading loop */
+	sub             r12, #PLDOFFS	/* the final PLDOFFS blocks are left for the no-pld loop */
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]	/* preload PLDOFFS*PLDSIZE bytes ahead of src */
+	vld1.32         {q0,q1}, [r1]!
+	vld1.32         {q2,q3}, [r1]!
+	vld1.32         {q8,q9}, [r1]!
+	vld1.32         {q10,q11}, [r1]!
+	subs            r12, r12, #1	/* flags consumed by the bne below; vst1 leaves them alone */
+	vst1.32         {q0,q1}, [r0]!
+	vst1.32         {q2,q3}, [r0]!
+	vst1.32         {q8,q9}, [r0]!
+	vst1.32         {q10,q11}, [r0]!
+	bne             .Lneon_f2b_copy_128_loop_outer
+	mov             r12, #PLDOFFS	/* blocks still to copy, now without preloading */
+.Lneon_f2b_copy_128_loop_nopld:
+	vld1.32         {q0,q1}, [r1]!
+	vld1.32         {q2,q3}, [r1]!
+	vld1.32         {q8,q9}, [r1]!
+	vld1.32         {q10,q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	vst1.32         {q2,q3}, [r0]!
+	vst1.32         {q8,q9}, [r0]!
+	vst1.32         {q10,q11}, [r0]!
+	bne             .Lneon_f2b_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* r2 = bytes left after the 128-byte blocks */
+	beq             .Lneon_memmove_done
+	cmp             r2, #32
+	bge             .Lneon_f2b_copy_32
+	b               .Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks */
+.Lneon_f2b_copy_32_loop:
+	vld1.32         {q0,q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]!
+	bne             .Lneon_f2b_copy_32_loop
+	ands            r2, r2, #0x1f	/* r2 = bytes left after the 32-byte blocks */
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+	movs            r12, r2, lsr #4	/* r12 = number of 16-byte blocks; Z set if none */
+	beq             .Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+	vld1.32         {q0}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0}, [r0]!
+	bne             .Lneon_f2b_copy_16_loop
+	ands            r2, r2, #0xf	/* r2 = bytes left after the 16-byte blocks */
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_8:
+	movs            r12, r2, lsr #3	/* r12 = number of 8-byte blocks; Z set if none */
+	beq             .Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+	vld1.32         {d0}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {d0}, [r0]!
+	bne             .Lneon_f2b_copy_8_loop
+	ands            r2, r2, #0x7	/* r2 = bytes left after the 8-byte blocks */
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_4:
+	movs            r12, r2, lsr #2	/* r12 = number of 4-byte words; Z set if none */
+	beq             .Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+	ldr             r3, [r1], #4	/* r3 (the window) is no longer needed and is reused as data */
+	subs            r12, r12, #1
+	str             r3, [r0], #4
+	bne             .Lneon_f2b_copy_4_loop
+	ands            r2, r2, #0x3	/* r2 = bytes left after the words */
+	nop
+.Lneon_f2b_copy_1:
+	cmp             r2, #0
+	beq             .Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+	ldrb            r12, [r1], #1
+	subs            r2, r2, #1
+	strb            r12, [r0], #1
+	bne             .Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+	b               .Lneon_memmove_done
+
+	/* #############################################################
+	 * Back to Front copy
+	 */
+.Lneon_back_to_front_copy:
+	/*
+	 * Here, we'll want to shift to the end of the buffers.  This
+	 * actually points us one past where we need to go, but since
+	 * we'll pre-decrement throughout, this will be fine.
+	 */
+	add             r0, r0, r2
+	add             r1, r1, r2
+	cmp             r2, #4
+	bgt             .Lneon_b2f_gt4
+	cmp             r2, #0		/* sets Z for the first pass of the loop below */
+.Lneon_b2f_smallcopy_loop:
+	beq             .Lneon_memmove_done	/* Z comes from the cmp above, then from the subs */
+	ldrb            r12, [r1, #-1]!
+	subs            r2, r2, #1
+	strb            r12, [r0, #-1]!
+	b               .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+	/*
+	 * The overlap window size (dest - src) is in r3; the minimum of
+	 * the window and the copy size is placed in r12 below.
+	 */
+	sub             r3, r0, r1
+	/*
+	 * #############################################################
+	 * Back to Front copy: choose the starting transfer width
+	 */
+	cmp             r2, r3
+	movle           r12, r2		/* r12 = min(r2, r3), using a signed compare */
+	movgt           r12, r3
+	cmp             r12, #256
+	bge             .Lneon_b2f_copy_128
+	cmp             r12, #64
+	bge             .Lneon_b2f_copy_32
+	cmp             r12, #8
+	bge             .Lneon_b2f_copy_8
+	cmp             r12, #4
+	bge             .Lneon_b2f_copy_4
+	b               .Lneon_b2f_copy_1
+	nop				/* not reached: follows an unconditional branch */
+.Lneon_b2f_copy_128:
+	movs            r12, r2, lsr #7	/* r12 = number of whole 128-byte blocks */
+	cmp             r12, #PLDOFFS
+	ble             .Lneon_b2f_copy_128_loop_nopld	/* PLDOFFS blocks or fewer: skip the preloading loop */
+	sub             r12, #PLDOFFS	/* the final PLDOFFS blocks are left for the no-pld loop */
+	pld             [r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+	pld             [r1, #-(PLDOFFS*PLDSIZE)]	/* preload PLDOFFS*PLDSIZE bytes below src */
+	sub             r1, r1, #128	/* step back to the start of this 128-byte block */
+	sub             r0, r0, #128
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1	/* flags consumed by the bne below; vst1 and sub leave them alone */
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	sub             r1, r1, #128	/* undo the 128 bytes of post-increment */
+	sub             r0, r0, #128
+	bne             .Lneon_b2f_copy_128_loop_outer
+	mov             r12, #PLDOFFS	/* blocks still to copy, now without preloading */
+.Lneon_b2f_copy_128_loop_nopld:
+	sub             r1, r1, #128	/* step back to the start of this 128-byte block */
+	sub             r0, r0, #128
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	sub             r1, r1, #128	/* undo the 128 bytes of post-increment */
+	sub             r0, r0, #128
+	bne             .Lneon_b2f_copy_128_loop_nopld
+	ands            r2, r2, #0x7f	/* r2 = bytes left after the 128-byte blocks */
+	beq             .Lneon_memmove_done
+	cmp             r2, #32
+	bge             .Lneon_b2f_copy_32
+	b               .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+	mov             r12, r2, lsr #5	/* r12 = number of 32-byte blocks */
+.Lneon_b2f_copy_32_loop:
+	sub             r1, r1, #32
+	sub             r0, r0, #32
+	vld1.32         {q0,q1}, [r1]	/* no writeback: the pointers were already moved down */
+	subs            r12, r12, #1
+	vst1.32         {q0,q1}, [r0]
+	bne             .Lneon_b2f_copy_32_loop
+	ands            r2, r2, #0x1f	/* r2 = bytes left after the 32-byte blocks */
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+	movs            r12, r2, lsr #0x3	/* r12 = number of 8-byte blocks; Z set if none */
+	beq             .Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+	sub             r1, r1, #8
+	sub             r0, r0, #8
+	vld1.32         {d0}, [r1]
+	subs            r12, r12, #1
+	vst1.32         {d0}, [r0]
+	bne             .Lneon_b2f_copy_8_loop
+	ands            r2, r2, #0x7	/* r2 = bytes left after the 8-byte blocks */
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+	movs            r12, r2, lsr #0x2	/* r12 = number of 4-byte words; Z set if none */
+	beq             .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+	ldr             r3, [r1, #-4]!	/* r3 (the window) is no longer needed and is reused as data */
+	subs            r12, r12, #1
+	str             r3, [r0, #-4]!
+	bne             .Lneon_b2f_copy_4_loop
+	ands            r2, r2, #0x3	/* r2 = bytes left after the words */
+	nop
+.Lneon_b2f_copy_1:
+	cmp             r2, #0
+	beq             .Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+	ldrb            r12, [r1, #-1]!
+	subs            r2, r2, #1
+	strb            r12, [r0, #-1]!
+	bne             .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+	pop             {r0}		/* return value: the dest pointer saved on entry */
+	bx              lr
+
+	.end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S
index 93abe15..8ecd80c 100644
--- a/libc/arch-arm/bionic/memset.S
+++ b/libc/arch-arm/bionic/memset.S
@@ -2,6 +2,8 @@
  * Copyright (C) 2008 The Android Open Source Project
  * All rights reserved.
  *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -25,6 +27,90 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+	.code 32
+	.align 8
+	.global memset
+	.type memset, %function
+
+	.global bzero
+	.type bzero, %function
+
+bzero:				/* r0 = buffer, r1 = length; rearranged into memset(r0, 0, length) */
+	mov             r2, r1		/* r2 = length */
+	mov             r1, #0		/* r1 = fill value 0; falls through into memset */
+memset:				/* r0 = buffer, r1 = fill value, r2 = length */
+	push            {r0}		/* original buffer pointer is popped back into r0 on exit */
+
+	cmp             r2, #6
+	bgt             .Lmemset_gt6
+	cmp             r2, #0
+	beq             .Lmemset_smallcopy_done
+.Lmemset_smallcopy_loop:
+	strb            r1, [r0], #1	/* strb stores only the low byte of r1 */
+	subs            r2, r2, #1
+	bne             .Lmemset_smallcopy_loop
+.Lmemset_smallcopy_done:
+	pop             {r0}
+	bx              lr
+
+.Lmemset_gt6:
+	vdup.8		q0, r1		/* q0 = low byte of r1 replicated into all 16 bytes */
+	vmov		r1, s0		/* r1 = that byte replicated into all 4 bytes, for the word stores */
+
+	/*
+	 * Decide where to route for the maximum copy sizes.
+	 */
+	cmp             r2, #4
+	blt             .Lmemset_lt4	/* never taken on this path: r2 > 6 here */
+	cmp             r2, #16
+	blt             .Lmemset_lt16
+	vmov            q1, q0		/* q0/q1 together give a 32-byte fill pattern */
+	cmp             r2, #128
+	blt             .Lmemset_32
+.Lmemset_128:
+	mov             r12, r2, lsr #7	/* r12 = number of whole 128-byte blocks */
+.Lmemset_128_loop:
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q0, q1}, [r0]!
+	subs            r12, r12, #1
+	bne             .Lmemset_128_loop
+	ands            r2, r2, #0x7f	/* r2 = bytes left after the 128-byte blocks */
+	beq             .Lmemset_end
+.Lmemset_32:
+	movs             r12, r2, lsr #5	/* r12 = number of 32-byte blocks; Z set if none */
+	beq              .Lmemset_lt32
+.Lmemset_32_loop:
+	subs            r12, r12, #1	/* flags consumed by the bne below; vst1 leaves them alone */
+	vst1.32         {q0, q1}, [r0]!
+	bne             .Lmemset_32_loop
+	ands            r2, r2, #0x1f	/* r2 = bytes left after the 32-byte blocks */
+	beq             .Lmemset_end
+.Lmemset_lt32:
+	cmp             r2, #16
+	blt             .Lmemset_lt16
+	vst1.64         {q0}, [r0]!
+	subs            r2, r2, #16	/* r2 = bytes left after the 16-byte store */
+	beq             .Lmemset_end
+.Lmemset_lt16:
+	movs            r12, r2, lsl #29	/* C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes) */
+	strcs           r1, [r0], #4	/* two word stores cover the 8-byte piece */
+	strcs           r1, [r0], #4
+	strmi           r1, [r0], #4	/* one word store covers the 4-byte piece */
+.Lmemset_lt4:
+	movs            r2, r2, lsl #31	/* C = bit 1 of r2 (2 bytes), N = bit 0 of r2 (1 byte) */
+	strcsh          r1, [r0], #2
+	strmib          r1, [r0]
+.Lmemset_end:
+	pop             {r0}		/* return value: the buffer pointer saved on entry */
+	bx		lr
+
+	.end
+#else   /* !SCORPION_NEON_OPTIMIZATION */
 	.text
 
     .global memset
@@ -115,3 +201,4 @@ memset:
         bx          lr
         .fnend
     
+#endif  /* SCORPION_NEON_OPTIMIZATION */