60 files changed, 3330 insertions, 1681 deletions
diff --git a/libc/Android.mk b/libc/Android.mk
index f0c5e9f..f7f2adc 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -611,6 +611,10 @@ ifneq ($(BOARD_MALLOC_ALIGNMENT),)
   libc_common_cflags += -DMALLOC_ALIGNMENT=$(BOARD_MALLOC_ALIGNMENT)
 endif
 
+ifeq ($(BOARD_USES_LEGACY_MMAP),true)
+  libc_common_cflags += -DLEGACY_MMAP
+endif
+
 # Define some common conlyflags
 libc_common_conlyflags := \
     -std=gnu99
@@ -1394,6 +1398,9 @@ LOCAL_SRC_FILES_arm += \
 LOCAL_ADDRESS_SANITIZER := false
 LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
 
+# Allow devices to provide additional symbols
+LOCAL_WHOLE_STATIC_LIBRARIES += $(BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS)
+
 include $(BUILD_SHARED_LIBRARY)
 
 
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index d72a160..c2b80c5 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -20,7 +20,6 @@ libc_freebsd_src_files_arm += \
     upstream-freebsd/lib/libc/string/wmemmove.c \
 
 libc_openbsd_src_files_arm += \
-    upstream-openbsd/lib/libc/string/memchr.c \
     upstream-openbsd/lib/libc/string/memrchr.c \
     upstream-openbsd/lib/libc/string/stpncpy.c \
     upstream-openbsd/lib/libc/string/strlcat.c \
@@ -52,7 +51,7 @@ ifeq ($(strip $(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)),)
 endif
 cpu_variant_mk := $(LOCAL_PATH)/arch-arm/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT).mk
 ifeq ($(wildcard $(cpu_variant_mk)),)
-$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
+$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, scorpion, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
 endif
 include $(cpu_variant_mk)
 libc_common_additional_dependencies += $(cpu_variant_mk)
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
index a2e9c22..3692f04 100644
--- a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,191 +26,7 @@
  * SUCH DAMAGE.
  */
 
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
 
-    .syntax unified
-
-    .thumb
-    .thumb_func
-
-// Get the length of src string, then get the source of the dst string.
-// Check that the two lengths together don't exceed the threshold, then
-// do a memcpy of the data.
-ENTRY(__strcat_chk)
-    pld     [r0, #0]
-    push    {r0, lr}
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
-    push    {r4, r5}
-    .cfi_adjust_cfa_offset 8
-    .cfi_rel_offset r4, 0
-    .cfi_rel_offset r5, 4
-
-    mov     lr, r2
-
-    // Save the dst register to r5
-    mov     r5, r0
-
-    // Zero out r4
-    eor     r4, r4, r4
-
-    // r1 contains the address of the string to count.
-.L_strlen_start:
-    mov     r0, r1
-    ands    r3, r1, #7
-    beq     .L_mainloop
-
-    // Align to a double word (64 bits).
-    rsb     r3, r3, #8
-    lsls    ip, r3, #31
-    beq     .L_align_to_32
-
-    ldrb    r2, [r1], #1
-    cbz     r2, .L_update_count_and_finish
-
-.L_align_to_32:
-    bcc     .L_align_to_64
-    ands    ip, r3, #2
-    beq     .L_align_to_64
-
-    ldrb    r2, [r1], #1
-    cbz     r2, .L_update_count_and_finish
-    ldrb    r2, [r1], #1
-    cbz     r2, .L_update_count_and_finish
-
-.L_align_to_64:
-    tst     r3, #4
-    beq     .L_mainloop
-    ldr     r3, [r1], #4
-
-    sub     ip, r3, #0x01010101
-    bic     ip, ip, r3
-    ands    ip, ip, #0x80808080
-    bne     .L_zero_in_second_register
-
-    .p2align 2
-.L_mainloop:
-    ldrd    r2, r3, [r1], #8
-
-    pld     [r1, #64]
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .L_zero_in_first_register
-
-    sub     ip, r3, #0x01010101
-    bic     ip, ip, r3
-    ands    ip, ip, #0x80808080
-    bne     .L_zero_in_second_register
-    b       .L_mainloop
-
-.L_update_count_and_finish:
-    sub     r3, r1, r0
-    sub     r3, r3, #1
-    b       .L_finish
-
-.L_zero_in_first_register:
-    sub     r3, r1, r0
-    lsls    r2, ip, #17
-    bne     .L_sub8_and_finish
-    bcs     .L_sub7_and_finish
-    lsls    ip, ip, #1
-    bne     .L_sub6_and_finish
-
-    sub     r3, r3, #5
-    b       .L_finish
-
-.L_sub8_and_finish:
-    sub     r3, r3, #8
-    b       .L_finish
-
-.L_sub7_and_finish:
-    sub     r3, r3, #7
-    b       .L_finish
-
-.L_sub6_and_finish:
-    sub     r3, r3, #6
-    b       .L_finish
-
-.L_zero_in_second_register:
-    sub     r3, r1, r0
-    lsls    r2, ip, #17
-    bne     .L_sub4_and_finish
-    bcs     .L_sub3_and_finish
-    lsls    ip, ip, #1
-    bne     .L_sub2_and_finish
-
-    sub     r3, r3, #1
-    b       .L_finish
-
-.L_sub4_and_finish:
-    sub     r3, r3, #4
-    b       .L_finish
-
-.L_sub3_and_finish:
-    sub     r3, r3, #3
-    b       .L_finish
-
-.L_sub2_and_finish:
-    sub     r3, r3, #2
-
-.L_finish:
-    cmp     r4, #0
-    bne     .L_strlen_done
-
-    // Time to get the dst string length.
-    mov     r1, r5
-
-    // Save the original source address to r5.
-    mov     r5, r0
-
-    // Save the current length (adding 1 for the terminator).
-    add     r4, r3, #1
-    b       .L_strlen_start
-
-    // r0 holds the pointer to the dst string.
-    // r3 holds the dst string length.
-    // r4 holds the src string length + 1.
-.L_strlen_done:
-    add     r2, r3, r4
-    cmp     r2, lr
-    bhi     __strcat_chk_failed
-
-    // Set up the registers for the memcpy code.
-    mov     r1, r5
-    pld     [r1, #64]
-    mov     r2, r4
-    add     r0, r0, r3
-    pop     {r4, r5}
-END(__strcat_chk)
-
-#define MEMCPY_BASE         __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
-
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__strcat_chk_failed)
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
-    .cfi_adjust_cfa_offset 8
-    .cfi_rel_offset r4, 0
-    .cfi_rel_offset r5, 4
-
-    ldr     r0, error_message
-    ldr     r1, error_code
-1:
-    add     r0, pc
-    bl      __fortify_chk_fail
-error_code:
-    .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
-error_message:
-    .word   error_string-(1b+4)
-END(__strcat_chk_failed)
-
-    .data
-error_string:
-    .string "strcat: prevented write past end of buffer"
+#include "__strcat_chk_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S
new file mode 100644
index 0000000..de66967
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+    .syntax unified
+
+    .thumb
+    .thumb_func
+
+// Get the length of src string, then get the source of the dst string.
+// Check that the two lengths together don't exceed the threshold, then
+// do a memcpy of the data.
+ENTRY(__strcat_chk)
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+    push    {r4, r5}
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+
+    mov     lr, r2
+
+    // Save the dst register to r5
+    mov     r5, r0
+
+    // Zero out r4
+    eor     r4, r4, r4
+
+    // r1 contains the address of the string to count.
+.L_strlen_start:
+    mov     r0, r1
+    ands    r3, r1, #7
+    beq     .L_mainloop
+
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+    ands    ip, r3, #2
+    beq     .L_align_to_64
+
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+    ldrb    r2, [r1], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r3, [r1], #4
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+
+    .p2align 2
+.L_mainloop:
+    ldrd    r2, r3, [r1], #8
+
+    pld     [r1, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_update_count_and_finish:
+    sub     r3, r1, r0
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_zero_in_first_register:
+    sub     r3, r1, r0
+    lsls    r2, ip, #17
+    bne     .L_sub8_and_finish
+    bcs     .L_sub7_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub6_and_finish
+
+    sub     r3, r3, #5
+    b       .L_finish
+
+.L_sub8_and_finish:
+    sub     r3, r3, #8
+    b       .L_finish
+
+.L_sub7_and_finish:
+    sub     r3, r3, #7
+    b       .L_finish
+
+.L_sub6_and_finish:
+    sub     r3, r3, #6
+    b       .L_finish
+
+.L_zero_in_second_register:
+    sub     r3, r1, r0
+    lsls    r2, ip, #17
+    bne     .L_sub4_and_finish
+    bcs     .L_sub3_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub2_and_finish
+
+    sub     r3, r3, #1
+    b       .L_finish
+
+.L_sub4_and_finish:
+    sub     r3, r3, #4
+    b       .L_finish
+
+.L_sub3_and_finish:
+    sub     r3, r3, #3
+    b       .L_finish
+
+.L_sub2_and_finish:
+    sub     r3, r3, #2
+
+.L_finish:
+    cmp     r4, #0
+    bne     .L_strlen_done
+
+    // Time to get the dst string length.
+    mov     r1, r5
+
+    // Save the original source address to r5.
+    mov     r5, r0
+
+    // Save the current length (adding 1 for the terminator).
+    add     r4, r3, #1
+    b       .L_strlen_start
+
+    // r0 holds the pointer to the dst string.
+    // r3 holds the dst string length.
+    // r4 holds the src string length + 1.
+.L_strlen_done:
+    add     r2, r3, r4
+    cmp     r2, lr
+    bhi     .L_strcat_chk_failed
+
+    // Set up the registers for the memcpy code.
+    mov     r1, r5
+    pld     [r1, #64]
+    mov     r2, r4
+    add     r0, r0, r3
+    pop     {r4, r5}
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore r4
+    .cfi_restore r5
+
+#include MEMCPY_BASE
+
+    // Undo the above cfi directives
+    .cfi_adjust_cfa_offset 8
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+.L_strcat_chk_failed:
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+END(__strcat_chk)
+
+    .data
+error_string:
+    .string "strcat: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
index db76686..d8cb3d9 100644
--- a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,155 +26,7 @@
  * SUCH DAMAGE.
  */
 
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
 
-    .syntax unified
-
-    .thumb
-    .thumb_func
-
-// Get the length of the source string first, then do a memcpy of the data
-// instead of a strcpy.
-ENTRY(__strcpy_chk)
-    pld     [r0, #0]
-    push    {r0, lr}
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
-
-    mov     lr, r2
-    mov     r0, r1
-
-    ands    r3, r1, #7
-    beq     .L_mainloop
-
-    // Align to a double word (64 bits).
-    rsb     r3, r3, #8
-    lsls    ip, r3, #31
-    beq     .L_align_to_32
-
-    ldrb    r2, [r0], #1
-    cbz     r2, .L_update_count_and_finish
-
-.L_align_to_32:
-    bcc     .L_align_to_64
-    ands    ip, r3, #2
-    beq     .L_align_to_64
-
-    ldrb    r2, [r0], #1
-    cbz     r2, .L_update_count_and_finish
-    ldrb    r2, [r0], #1
-    cbz     r2, .L_update_count_and_finish
-
-.L_align_to_64:
-    tst     r3, #4
-    beq     .L_mainloop
-    ldr     r3, [r0], #4
-
-    sub     ip, r3, #0x01010101
-    bic     ip, ip, r3
-    ands    ip, ip, #0x80808080
-    bne     .L_zero_in_second_register
-
-    .p2align 2
-.L_mainloop:
-    ldrd    r2, r3, [r0], #8
-
-    pld     [r0, #64]
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .L_zero_in_first_register
-
-    sub     ip, r3, #0x01010101
-    bic     ip, ip, r3
-    ands    ip, ip, #0x80808080
-    bne     .L_zero_in_second_register
-    b       .L_mainloop
-
-.L_update_count_and_finish:
-    sub     r3, r0, r1
-    sub     r3, r3, #1
-    b       .L_check_size
-
-.L_zero_in_first_register:
-    sub     r3, r0, r1
-    lsls    r2, ip, #17
-    bne     .L_sub8_and_finish
-    bcs     .L_sub7_and_finish
-    lsls    ip, ip, #1
-    bne     .L_sub6_and_finish
-
-    sub     r3, r3, #5
-    b       .L_check_size
-
-.L_sub8_and_finish:
-    sub     r3, r3, #8
-    b       .L_check_size
-
-.L_sub7_and_finish:
-    sub     r3, r3, #7
-    b       .L_check_size
-
-.L_sub6_and_finish:
-    sub     r3, r3, #6
-    b       .L_check_size
-
-.L_zero_in_second_register:
-    sub     r3, r0, r1
-    lsls    r2, ip, #17
-    bne     .L_sub4_and_finish
-    bcs     .L_sub3_and_finish
-    lsls    ip, ip, #1
-    bne     .L_sub2_and_finish
-
-    sub     r3, r3, #1
-    b       .L_check_size
-
-.L_sub4_and_finish:
-    sub     r3, r3, #4
-    b       .L_check_size
-
-.L_sub3_and_finish:
-    sub     r3, r3, #3
-    b       .L_check_size
-
-.L_sub2_and_finish:
-    sub     r3, r3, #2
-
-.L_check_size:
-    pld     [r1, #0]
-    pld     [r1, #64]
-    ldr     r0, [sp]
-    cmp     r3, lr
-    bhs     __strcpy_chk_failed
-
-    // Add 1 for copy length to get the string terminator.
-    add     r2, r3, #1
-END(__strcpy_chk)
-
-#define MEMCPY_BASE         __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__strcpy_chk_failed)
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
-
-    ldr     r0, error_message
-    ldr     r1, error_code
-1:
-    add     r0, pc
-    bl      __fortify_chk_fail
-error_code:
-    .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
-error_message:
-    .word   error_string-(1b+4)
-END(__strcpy_chk_failed)
-
-    .data
-error_string:
-    .string "strcpy: prevented write past end of buffer"
+#include "__strcpy_chk_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S
new file mode 100644
index 0000000..69ebcb4
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+    .syntax unified
+
+    .thumb
+    .thumb_func
+
+// Get the length of the source string first, then do a memcpy of the data
+// instead of a strcpy.
+ENTRY(__strcpy_chk)
+    pld     [r0, #0]
+    push    {r0, lr}
+    .cfi_def_cfa_offset 8
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset lr, 4
+
+    mov     lr, r2
+    mov     r0, r1
+
+    ands    r3, r1, #7
+    beq     .L_mainloop
+
+    // Align to a double word (64 bits).
+    rsb     r3, r3, #8
+    lsls    ip, r3, #31
+    beq     .L_align_to_32
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_32:
+    bcc     .L_align_to_64
+    ands    ip, r3, #2
+    beq     .L_align_to_64
+
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+    ldrb    r2, [r0], #1
+    cbz     r2, .L_update_count_and_finish
+
+.L_align_to_64:
+    tst     r3, #4
+    beq     .L_mainloop
+    ldr     r3, [r0], #4
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+
+    .p2align 2
+.L_mainloop:
+    ldrd    r2, r3, [r0], #8
+
+    pld     [r0, #64]
+
+    sub     ip, r2, #0x01010101
+    bic     ip, ip, r2
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_first_register
+
+    sub     ip, r3, #0x01010101
+    bic     ip, ip, r3
+    ands    ip, ip, #0x80808080
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
+
+.L_update_count_and_finish:
+    sub     r3, r0, r1
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_zero_in_first_register:
+    sub     r3, r0, r1
+    lsls    r2, ip, #17
+    bne     .L_sub8_and_finish
+    bcs     .L_sub7_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub6_and_finish
+
+    sub     r3, r3, #5
+    b       .L_check_size
+
+.L_sub8_and_finish:
+    sub     r3, r3, #8
+    b       .L_check_size
+
+.L_sub7_and_finish:
+    sub     r3, r3, #7
+    b       .L_check_size
+
+.L_sub6_and_finish:
+    sub     r3, r3, #6
+    b       .L_check_size
+
+.L_zero_in_second_register:
+    sub     r3, r0, r1
+    lsls    r2, ip, #17
+    bne     .L_sub4_and_finish
+    bcs     .L_sub3_and_finish
+    lsls    ip, ip, #1
+    bne     .L_sub2_and_finish
+
+    sub     r3, r3, #1
+    b       .L_check_size
+
+.L_sub4_and_finish:
+    sub     r3, r3, #4
+    b       .L_check_size
+
+.L_sub3_and_finish:
+    sub     r3, r3, #3
+    b       .L_check_size
+
+.L_sub2_and_finish:
+    sub     r3, r3, #2
+
+.L_check_size:
+    pld     [r1, #0]
+    pld     [r1, #64]
+    ldr     r0, [sp]
+    cmp     r3, lr
+    bhs     .L_strcpy_chk_failed
+
+    // Add 1 for copy length to get the string terminator.
+    add     r2, r3, #1
+
+#include MEMCPY_BASE
+
+.L_strcpy_chk_failed:
+    ldr     r0, error_message
+    ldr     r1, error_code
+1:
+    add     r0, pc
+    bl      __fortify_chk_fail
+error_code:
+    .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
+error_message:
+    .word   error_string-(1b+4)
+END(__strcpy_chk)
+
+    .data
+error_string:
+    .string "strcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S
index 410b663..537f3de 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,79 +25,8 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
-/*
- * Copyright (c) 2013 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-// Prototype: void *memcpy (void *dst, const void *src, size_t count).
-
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
-
-        .text
-        .syntax unified
-        .fpu    neon
-
-ENTRY(__memcpy_chk)
-        cmp     r2, r3
-        bhi     __memcpy_chk_fail
-
-        // Fall through to memcpy...
-END(__memcpy_chk)
-
-ENTRY(memcpy)
-        pld     [r1, #64]
-        push    {r0, lr}
-        .cfi_def_cfa_offset 8
-        .cfi_rel_offset r0, 0
-        .cfi_rel_offset lr, 4
-END(memcpy)
-
-#define MEMCPY_BASE         __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__memcpy_chk_fail)
-        // Preserve lr for backtrace.
-        push    {lr}
-        .cfi_def_cfa_offset 4
-        .cfi_rel_offset lr, 0
 
-        ldr     r0, error_message
-        ldr     r1, error_code
-1:
-        add     r0, pc
-        bl      __fortify_chk_fail
-error_code:
-        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
-error_message:
-        .word   error_string-(1b+8)
-END(__memcpy_chk_fail)
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
 
-        .data
-error_string:
-        .string "memcpy: prevented write past end of buffer"
+#include "memcpy_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
index 2a73852..aac737d 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
@@ -53,11 +53,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-ENTRY_PRIVATE(MEMCPY_BASE)
-        .cfi_def_cfa_offset 8
-        .cfi_rel_offset r0, 0
-        .cfi_rel_offset lr, 4
-
+.L_memcpy_base:
         // Assumes that n >= 0, and dst, src are valid pointers.
         // For any sizes less than 832 use the neon code that doesn't
         // care about the src alignment. This avoids any checks
@@ -168,12 +164,6 @@ ENTRY_PRIVATE(MEMCPY_BASE)
         eor     r3, r0, r1
         ands    r3, r3, #0x3
         bne     .L_copy_unknown_alignment
-END(MEMCPY_BASE)
-
-ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
-        .cfi_def_cfa_offset 8
-        .cfi_rel_offset r0, 0
-        .cfi_rel_offset lr, 4
 
         // To try and improve performance, stack layout changed,
         // i.e., not keeping the stack looking like users expect
@@ -185,7 +175,7 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
         strd    r6, r7, [sp, #-8]!
         .cfi_adjust_cfa_offset 8
         .cfi_rel_offset r6, 0
-        .cfi_rel_offset r7, 0
+        .cfi_rel_offset r7, 4
         strd    r8, r9, [sp, #-8]!
         .cfi_adjust_cfa_offset 8
         .cfi_rel_offset r8, 0
@@ -291,10 +281,28 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
 
         // Restore registers: optimized pop {r0, pc}
         ldrd    r8, r9, [sp], #8
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore r8
+        .cfi_restore r9
         ldrd    r6, r7, [sp], #8
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore r6
+        .cfi_restore r7
         ldrd    r4, r5, [sp], #8
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore r4
+        .cfi_restore r5
         pop     {r0, pc}
 
+        // Put the cfi directives back for the below instructions.
+        .cfi_adjust_cfa_offset 24
+        .cfi_rel_offset r4, 0
+        .cfi_rel_offset r5, 4
+        .cfi_rel_offset r6, 8
+        .cfi_rel_offset r7, 12
+        .cfi_rel_offset r8, 16
+        .cfi_rel_offset r9, 20
+
 .L_dst_not_word_aligned:
         // Align dst to word.
         rsb     ip, ip, #4
@@ -315,4 +323,12 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
 
         // Src is guaranteed to be at least word aligned by this point.
         b       .L_word_aligned
-END(MEMCPY_BASE_ALIGNED)
+
+        // Undo any cfi directives from above.
+        .cfi_adjust_cfa_offset -24
+        .cfi_restore r4
+        .cfi_restore r5
+        .cfi_restore r6
+        .cfi_restore r7
+        .cfi_restore r8
+        .cfi_restore r9
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_common.S b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S
new file mode 100644
index 0000000..464fb46
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+        .text
+        .syntax unified
+        .fpu    neon
+
+ENTRY(__memcpy_chk)
+        cmp     r2, r3
+        bhi     .L_memcpy_chk_fail
+
+        // Fall through to memcpy...
+END(__memcpy_chk)
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+ENTRY(memcpy)
+        pld     [r1, #64]
+        push    {r0, lr}
+        .cfi_def_cfa_offset 8
+        .cfi_rel_offset r0, 0
+        .cfi_rel_offset lr, 4
+
+#include MEMCPY_BASE
+
+        // Undo the cfi instructions from above.
+        .cfi_def_cfa_offset 0
+        .cfi_restore r0
+        .cfi_restore lr
+.L_memcpy_chk_fail:
+        // Preserve lr for backtrace.
+        push    {lr}
+        .cfi_adjust_cfa_offset 4
+        .cfi_rel_offset lr, 0
+
+        ldr     r0, error_message
+        ldr     r1, error_code
+1:
+        add     r0, pc
+        bl      __fortify_chk_fail
+error_code:
+        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+        .word   error_string-(1b+8)
+END(memcpy)
+
+        .data
+error_string:
+        .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/strcat.S b/libc/arch-arm/cortex-a15/bionic/strcat.S
index b95be94..157cc9f 100644
--- a/libc/arch-arm/cortex-a15/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a15/bionic/strcat.S
@@ -70,7 +70,7 @@
 
     .macro m_scan_byte
     ldrb    r3, [r0]
-    cbz     r3, strcat_r0_scan_done
+    cbz     r3, .L_strcat_r0_scan_done
     add     r0, #1
     .endm // m_scan_byte
 
@@ -84,10 +84,10 @@ ENTRY(strcat)
     // Quick check to see if src is empty.
     ldrb    r2, [r1]
     pld     [r1, #0]
-    cbnz    r2, strcat_continue
+    cbnz    r2, .L_strcat_continue
     bx      lr
 
-strcat_continue:
+.L_strcat_continue:
     // To speed up really small dst strings, unroll checking the first 4 bytes.
     m_push
     m_scan_byte
@@ -96,95 +96,102 @@ strcat_continue:
     m_scan_byte
 
     ands    r3, r0, #7
-    beq     strcat_mainloop
+    beq     .L_strcat_mainloop
 
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     strcat_align_to_32
+    beq     .L_strcat_align_to_32
 
     ldrb    r5, [r0]
-    cbz     r5, strcat_r0_scan_done
+    cbz     r5, .L_strcat_r0_scan_done
     add     r0, r0, #1
 
-strcat_align_to_32:
-    bcc     strcat_align_to_64
+.L_strcat_align_to_32:
+    bcc     .L_strcat_align_to_64
 
     ldrb    r2, [r0]
-    cbz     r2, strcat_r0_scan_done
+    cbz     r2, .L_strcat_r0_scan_done
     add     r0, r0, #1
     ldrb    r4, [r0]
-    cbz     r4, strcat_r0_scan_done
+    cbz     r4, .L_strcat_r0_scan_done
     add     r0, r0, #1
 
-strcat_align_to_64:
+.L_strcat_align_to_64:
     tst     r3, #4
-    beq     strcat_mainloop
+    beq     .L_strcat_mainloop
     ldr     r3, [r0], #4
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_second_register
-    b       strcat_mainloop
+    bne     .L_strcat_zero_in_second_register
+    b       .L_strcat_mainloop
 
-strcat_r0_scan_done:
+.L_strcat_r0_scan_done:
     // For short copies, hard-code checking the first 8 bytes since this
     // new code doesn't win until after about 8 bytes.
-    m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
-
-strcpy_finish:
+    m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r5, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish
+    m_copy_byte reg=r5, cmd=cbnz, label=.L_strcpy_continue
+
+.L_strcpy_finish:
     m_pop
 
-strcpy_continue:
+.L_strcpy_continue:
     ands    r3, r0, #7
-    beq     strcpy_check_src_align
+    beq     .L_strcpy_check_src_align
 
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     strcpy_align_to_32
+    beq     .L_strcpy_align_to_32
 
     ldrb    r2, [r1], #1
     strb    r2, [r0], #1
-    cbz     r2, strcpy_complete
+    cbz     r2, .L_strcpy_complete
 
-strcpy_align_to_32:
-    bcc     strcpy_align_to_64
+.L_strcpy_align_to_32:
+    bcc     .L_strcpy_align_to_64
 
     ldrb    r2, [r1], #1
     strb    r2, [r0], #1
-    cbz     r2, strcpy_complete
+    cbz     r2, .L_strcpy_complete
     ldrb    r2, [r1], #1
     strb    r2, [r0], #1
-    cbz     r2, strcpy_complete
+    cbz     r2, .L_strcpy_complete
 
-strcpy_align_to_64:
+.L_strcpy_align_to_64:
     tst     r3, #4
-    beq     strcpy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
-    str     r2, [r0], #4
+    beq     .L_strcpy_check_src_align
+    // Read one byte at a time since we don't know the src alignment
+    // and we don't want to read into a different page.
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .L_strcpy_complete
 
-strcpy_check_src_align:
+.L_strcpy_check_src_align:
     // At this point dst is aligned to a double word, check if src
     // is also aligned to a double word.
     ands    r3, r1, #7
-    bne     strcpy_unaligned_copy
+    bne     .L_strcpy_unaligned_copy
 
     .p2align 2
-strcpy_mainloop:
+.L_strcpy_mainloop:
     ldrd    r2, r3, [r1], #8
 
     pld     [r1, #64]
@@ -192,128 +199,128 @@ strcpy_mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .L_strcpy_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .L_strcpy_zero_in_second_register
 
     strd    r2, r3, [r0], #8
-    b       strcpy_mainloop
+    b       .L_strcpy_mainloop
 
-strcpy_complete:
+.L_strcpy_complete:
     m_pop
 
-strcpy_zero_in_first_register:
+.L_strcpy_zero_in_first_register:
     lsls    lr, ip, #17
-    bne     strcpy_copy1byte
-    bcs     strcpy_copy2bytes
+    bne     .L_strcpy_copy1byte
+    bcs     .L_strcpy_copy2bytes
     lsls    ip, ip, #1
-    bne     strcpy_copy3bytes
+    bne     .L_strcpy_copy3bytes
 
-strcpy_copy4bytes:
+.L_strcpy_copy4bytes:
     // Copy 4 bytes to the destiniation.
     str     r2, [r0]
     m_pop
 
-strcpy_copy1byte:
+.L_strcpy_copy1byte:
     strb    r2, [r0]
     m_pop
 
-strcpy_copy2bytes:
+.L_strcpy_copy2bytes:
     strh    r2, [r0]
     m_pop
 
-strcpy_copy3bytes:
+.L_strcpy_copy3bytes:
     strh    r2, [r0], #2
     lsr     r2, #16
     strb    r2, [r0]
     m_pop
 
-strcpy_zero_in_second_register:
+.L_strcpy_zero_in_second_register:
     lsls    lr, ip, #17
-    bne     strcpy_copy5bytes
-    bcs     strcpy_copy6bytes
+    bne     .L_strcpy_copy5bytes
+    bcs     .L_strcpy_copy6bytes
     lsls    ip, ip, #1
-    bne     strcpy_copy7bytes
+    bne     .L_strcpy_copy7bytes
 
     // Copy 8 bytes to the destination.
     strd    r2, r3, [r0]
     m_pop
 
-strcpy_copy5bytes:
+.L_strcpy_copy5bytes:
     str     r2, [r0], #4
     strb    r3, [r0]
     m_pop
 
-strcpy_copy6bytes:
+.L_strcpy_copy6bytes:
     str     r2, [r0], #4
     strh    r3, [r0]
     m_pop
 
-strcpy_copy7bytes:
+.L_strcpy_copy7bytes:
     str     r2, [r0], #4
     strh    r3, [r0], #2
     lsr     r3, #16
     strb    r3, [r0]
     m_pop
 
-strcpy_unaligned_copy:
+.L_strcpy_unaligned_copy:
     // Dst is aligned to a double word, while src is at an unknown alignment.
     // There are 7 different versions of the unaligned copy code
     // to prevent overreading the src. The mainloop of every single version
     // will store 64 bits per loop. The difference is how much of src can
     // be read without potentially crossing a page boundary.
     tbb     [pc, r3]
-strcpy_unaligned_branchtable:
+.L_strcpy_unaligned_branchtable:
     .byte 0
-    .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign7 - .L_strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign6 - .L_strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign5 - .L_strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign4 - .L_strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign3 - .L_strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign2 - .L_strcpy_unaligned_branchtable)/2)
+    .byte ((.L_strcpy_unalign1 - .L_strcpy_unaligned_branchtable)/2)
 
     .p2align 2
     // Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.L_strcpy_unalign7:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .L_strcpy_zero_in_first_register
 
     ldrb    r3, [r1]
-    cbz     r3, strcpy_unalign7_copy5bytes
+    cbz     r3, .L_strcpy_unalign7_copy5bytes
     ldrb    r4, [r1, #1]
-    cbz     r4, strcpy_unalign7_copy6bytes
+    cbz     r4, .L_strcpy_unalign7_copy6bytes
     ldrb    r5, [r1, #2]
-    cbz     r5, strcpy_unalign7_copy7bytes
+    cbz     r5, .L_strcpy_unalign7_copy7bytes
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     lsrs    ip, r3, #24
     strd    r2, r3, [r0], #8
-    beq     strcpy_unalign_return
-    b       strcpy_unalign7
+    beq     .L_strcpy_unalign_return
+    b       .L_strcpy_unalign7
 
-strcpy_unalign7_copy5bytes:
+.L_strcpy_unalign7_copy5bytes:
     str     r2, [r0], #4
     strb    r3, [r0]
-strcpy_unalign_return:
+.L_strcpy_unalign_return:
     m_pop
 
-strcpy_unalign7_copy6bytes:
+.L_strcpy_unalign7_copy6bytes:
     str     r2, [r0], #4
     strb    r3, [r0], #1
     strb    r4, [r0], #1
     m_pop
 
-strcpy_unalign7_copy7bytes:
+.L_strcpy_unalign7_copy7bytes:
     str     r2, [r0], #4
     strb    r3, [r0], #1
     strb    r4, [r0], #1
@@ -322,41 +329,41 @@ strcpy_unalign7_copy7bytes:
 
     .p2align 2
     // Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.L_strcpy_unalign6:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .L_strcpy_zero_in_first_register
 
     ldrb    r4, [r1]
-    cbz     r4, strcpy_unalign_copy5bytes
+    cbz     r4, .L_strcpy_unalign_copy5bytes
     ldrb    r5, [r1, #1]
-    cbz     r5, strcpy_unalign_copy6bytes
+    cbz     r5, .L_strcpy_unalign_copy6bytes
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     tst     r3, #0xff0000
-    beq     strcpy_copy7bytes
+    beq     .L_strcpy_copy7bytes
     lsrs    ip, r3, #24
     strd    r2, r3, [r0], #8
-    beq     strcpy_unalign_return
-    b       strcpy_unalign6
+    beq     .L_strcpy_unalign_return
+    b       .L_strcpy_unalign6
 
     .p2align 2
     // Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.L_strcpy_unalign5:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .L_strcpy_zero_in_first_register
 
     ldrb    r4, [r1]
-    cbz     r4, strcpy_unalign_copy5bytes
+    cbz     r4, .L_strcpy_unalign_copy5bytes
 
     ldr     r3, [r1], #4
 
@@ -365,17 +372,17 @@ strcpy_unalign5:
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .L_strcpy_zero_in_second_register
 
     strd    r2, r3, [r0], #8
-    b       strcpy_unalign5
+    b       .L_strcpy_unalign5
 
-strcpy_unalign_copy5bytes:
+.L_strcpy_unalign_copy5bytes:
     str     r2, [r0], #4
     strb    r4, [r0]
     m_pop
 
-strcpy_unalign_copy6bytes:
+.L_strcpy_unalign_copy6bytes:
     str     r2, [r0], #4
     strb    r4, [r0], #1
     strb    r5, [r0]
@@ -383,13 +390,13 @@ strcpy_unalign_copy6bytes:
 
     .p2align 2
     // Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.L_strcpy_unalign4:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .L_strcpy_zero_in_first_register
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
@@ -397,20 +404,20 @@ strcpy_unalign4:
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .L_strcpy_zero_in_second_register
 
     strd    r2, r3, [r0], #8
-    b       strcpy_unalign4
+    b       .L_strcpy_unalign4
 
     .p2align 2
     // Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.L_strcpy_unalign3:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign3_copy1byte
+    cbz     r2, .L_strcpy_unalign3_copy1byte
     ldrb    r3, [r1, #1]
-    cbz     r3, strcpy_unalign3_copy2bytes
+    cbz     r3, .L_strcpy_unalign3_copy2bytes
     ldrb    r4, [r1, #2]
-    cbz     r4, strcpy_unalign3_copy3bytes
+    cbz     r4, .L_strcpy_unalign3_copy3bytes
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
@@ -418,26 +425,26 @@ strcpy_unalign3:
     pld     [r1, #64]
 
     lsrs    lr, r2, #24
-    beq     strcpy_copy4bytes
+    beq     .L_strcpy_copy4bytes
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .L_strcpy_zero_in_second_register
 
     strd    r2, r3, [r0], #8
-    b       strcpy_unalign3
+    b       .L_strcpy_unalign3
 
-strcpy_unalign3_copy1byte:
+.L_strcpy_unalign3_copy1byte:
     strb    r2, [r0]
     m_pop
 
-strcpy_unalign3_copy2bytes:
+.L_strcpy_unalign3_copy2bytes:
     strb    r2, [r0], #1
     strb    r3, [r0]
     m_pop
 
-strcpy_unalign3_copy3bytes:
+.L_strcpy_unalign3_copy3bytes:
     strb    r2, [r0], #1
     strb    r3, [r0], #1
     strb    r4, [r0]
@@ -445,34 +452,34 @@ strcpy_unalign3_copy3bytes:
 
     .p2align 2
     // Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.L_strcpy_unalign2:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign_copy1byte
+    cbz     r2, .L_strcpy_unalign_copy1byte
     ldrb    r4, [r1, #1]
-    cbz     r4, strcpy_unalign_copy2bytes
+    cbz     r4, .L_strcpy_unalign_copy2bytes
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     tst     r2, #0xff0000
-    beq     strcpy_copy3bytes
+    beq     .L_strcpy_copy3bytes
     lsrs    ip, r2, #24
-    beq     strcpy_copy4bytes
+    beq     .L_strcpy_copy4bytes
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .L_strcpy_zero_in_second_register
 
     strd    r2, r3, [r0], #8
-    b       strcpy_unalign2
+    b       .L_strcpy_unalign2
 
     .p2align 2
     // Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.L_strcpy_unalign1:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign_copy1byte
+    cbz     r2, .L_strcpy_unalign_copy1byte
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
@@ -482,27 +489,27 @@ strcpy_unalign1:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .L_strcpy_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .L_strcpy_zero_in_second_register
 
     strd    r2, r3, [r0], #8
-    b       strcpy_unalign1
+    b       .L_strcpy_unalign1
 
-strcpy_unalign_copy1byte:
+.L_strcpy_unalign_copy1byte:
     strb    r2, [r0]
     m_pop
 
-strcpy_unalign_copy2bytes:
+.L_strcpy_unalign_copy2bytes:
     strb    r2, [r0], #1
     strb    r4, [r0]
     m_pop
 
     .p2align 2
-strcat_mainloop:
+.L_strcat_mainloop:
     ldrd    r2, r3, [r0], #8
 
     pld     [r0, #64]
@@ -510,59 +517,59 @@ strcat_mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_first_register
+    bne     .L_strcat_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_second_register
-    b       strcat_mainloop
+    bne     .L_strcat_zero_in_second_register
+    b       .L_strcat_mainloop
 
-strcat_zero_in_first_register:
+.L_strcat_zero_in_first_register:
     // Prefetch the src now, it's going to be used soon.
     pld     [r1, #0]
     lsls    lr, ip, #17
-    bne     strcat_sub8
-    bcs     strcat_sub7
+    bne     .L_strcat_sub8
+    bcs     .L_strcat_sub7
     lsls    ip, ip, #1
-    bne     strcat_sub6
+    bne     .L_strcat_sub6
 
     sub     r0, r0, #5
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_sub8:
+.L_strcat_sub8:
     sub     r0, r0, #8
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_sub7:
+.L_strcat_sub7:
     sub     r0, r0, #7
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_sub6:
+.L_strcat_sub6:
     sub     r0, r0, #6
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_zero_in_second_register:
+.L_strcat_zero_in_second_register:
     // Prefetch the src now, it's going to be used soon.
     pld     [r1, #0]
     lsls    lr, ip, #17
-    bne     strcat_sub4
-    bcs     strcat_sub3
+    bne     .L_strcat_sub4
+    bcs     .L_strcat_sub3
     lsls    ip, ip, #1
-    bne     strcat_sub2
+    bne     .L_strcat_sub2
 
     sub     r0, r0, #1
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_sub4:
+.L_strcat_sub4:
     sub     r0, r0, #4
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_sub3:
+.L_strcat_sub3:
     sub     r0, r0, #3
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 
-strcat_sub2:
+.L_strcat_sub2:
     sub     r0, r0, #2
-    b       strcat_r0_scan_done
+    b       .L_strcat_r0_scan_done
 END(strcat)
diff --git a/libc/arch-arm/cortex-a15/bionic/string_copy.S b/libc/arch-arm/cortex-a15/bionic/string_copy.S
index 20f0e91..92d1c98 100644
--- a/libc/arch-arm/cortex-a15/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a15/bionic/string_copy.S
@@ -149,13 +149,20 @@ ENTRY(strcpy)
 .Lstringcopy_align_to_64:
     tst     r3, #4
     beq     .Lstringcopy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .Lstringcopy_zero_in_first_register
-    str     r2, [r0], #4
+    // Read one byte at a time since we don't have any idea about the alignment
+    // of the source and we don't want to read into a different page.
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
 
 .Lstringcopy_check_src_align:
     // At this point dst is aligned to a double word, check if src
diff --git a/libc/arch-arm/cortex-a15/bionic/strlen.S b/libc/arch-arm/cortex-a15/bionic/strlen.S
index 9a0ce62..4fd6284 100644
--- a/libc/arch-arm/cortex-a15/bionic/strlen.S
+++ b/libc/arch-arm/cortex-a15/bionic/strlen.S
@@ -65,38 +65,38 @@ ENTRY(strlen)
     mov     r1, r0
 
     ands    r3, r0, #7
-    beq     mainloop
+    beq     .L_mainloop
 
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     align_to_32
+    beq     .L_align_to_32
 
     ldrb    r2, [r1], #1
-    cbz     r2, update_count_and_return
+    cbz     r2, .L_update_count_and_return
 
-align_to_32:
-    bcc     align_to_64
+.L_align_to_32:
+    bcc     .L_align_to_64
     ands    ip, r3, #2
-    beq     align_to_64
+    beq     .L_align_to_64
 
     ldrb    r2, [r1], #1
-    cbz     r2, update_count_and_return
+    cbz     r2, .L_update_count_and_return
     ldrb    r2, [r1], #1
-    cbz     r2, update_count_and_return
+    cbz     r2, .L_update_count_and_return
 
-align_to_64:
+.L_align_to_64:
     tst     r3, #4
-    beq     mainloop
+    beq     .L_mainloop
     ldr     r3, [r1], #4
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     zero_in_second_register
+    bne     .L_zero_in_second_register
 
     .p2align 2
-mainloop:
+.L_mainloop:
     ldrd    r2, r3, [r1], #8
 
     pld     [r1, #64]
@@ -104,62 +104,62 @@ mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     zero_in_first_register
+    bne     .L_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     zero_in_second_register
-    b       mainloop
+    bne     .L_zero_in_second_register
+    b       .L_mainloop
 
-update_count_and_return:
+.L_update_count_and_return:
     sub     r0, r1, r0
     sub     r0, r0, #1
     bx      lr
 
-zero_in_first_register:
+.L_zero_in_first_register:
     sub     r0, r1, r0
     lsls    r3, ip, #17
-    bne     sub8_and_return
-    bcs     sub7_and_return
+    bne     .L_sub8_and_return
+    bcs     .L_sub7_and_return
     lsls    ip, ip, #1
-    bne     sub6_and_return
+    bne     .L_sub6_and_return
 
     sub     r0, r0, #5
     bx      lr
 
-sub8_and_return:
+.L_sub8_and_return:
     sub     r0, r0, #8
     bx      lr
 
-sub7_and_return:
+.L_sub7_and_return:
     sub     r0, r0, #7
     bx      lr
 
-sub6_and_return:
+.L_sub6_and_return:
     sub     r0, r0, #6
     bx      lr
 
-zero_in_second_register:
+.L_zero_in_second_register:
     sub     r0, r1, r0
     lsls    r3, ip, #17
-    bne     sub4_and_return
-    bcs     sub3_and_return
+    bne     .L_sub4_and_return
+    bcs     .L_sub3_and_return
     lsls    ip, ip, #1
-    bne     sub2_and_return
+    bne     .L_sub2_and_return
 
     sub     r0, r0, #1
     bx      lr
 
-sub4_and_return:
+.L_sub4_and_return:
     sub     r0, r0, #4
     bx      lr
 
-sub3_and_return:
+.L_sub3_and_return:
     sub     r0, r0, #3
     bx      lr
 
-sub2_and_return:
+.L_sub2_and_return:
     sub     r0, r0, #2
     bx      lr
 END(strlen)
diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk
index 6fa3270..202a3bf 100644
--- a/libc/arch-arm/cortex-a15/cortex-a15.mk
+++ b/libc/arch-arm/cortex-a15/cortex-a15.mk
@@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strlen.S \
 
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
     arch-arm/generic/bionic/memcmp.S \
 
 libc_bionic_src_files_arm += \
diff --git a/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk
new file mode 100644
index 0000000..5d7efc6
--- /dev/null
+++ b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk
@@ -0,0 +1,22 @@
+# This file represents the best optimized routines that are the middle
+# ground when running on a big/little system that is cortex-a57/cortex-a53.
+# The cortex-a7 optimized routines, and the cortex-a53 optimized routines
+# decrease performance on cortex-a57 processors by as much as 20%.
+
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a15/bionic/memcpy.S \
+    arch-arm/cortex-a15/bionic/memset.S \
+    arch-arm/cortex-a15/bionic/stpcpy.S \
+    arch-arm/cortex-a15/bionic/strcat.S \
+    arch-arm/cortex-a15/bionic/__strcat_chk.S \
+    arch-arm/cortex-a15/bionic/strcmp.S \
+    arch-arm/cortex-a15/bionic/strcpy.S \
+    arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+    arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memcmp.S \
+    arch-arm/generic/bionic/memchr.S
+
+libc_bionic_src_files_arm += \
+    arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S
new file mode 100644
index 0000000..c5bc98a
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/__strcat_chk_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S
new file mode 100644
index 0000000..1f8945d
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/__strcpy_chk_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy.S b/libc/arch-arm/cortex-a53/bionic/memcpy.S
new file mode 100644
index 0000000..664f574
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/memcpy.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/memcpy_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy_base.S b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S
new file mode 100644
index 0000000..2749fc8
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+.L_memcpy_base:
+        // Assumes that n >= 0, and dst, src are valid pointers.
+        cmp     r2, #16
+        blo     .L_copy_less_than_16_unknown_align
+
+.L_copy_unknown_alignment:
+        // Unknown alignment of src and dst.
+        // Assumes that the first few bytes have already been prefetched.
+
+        // Align destination to 128 bits. The mainloop store instructions
+        // require this alignment or they will throw an exception.
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         2f
+
+        // Copy up to 15 bytes (count in r3).
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+
+        itt         mi
+        ldrbmi      lr, [r1], #1
+        strbmi      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+
+        movs        ip, r3, lsl #29
+        bge         1f
+        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1:      bcc         2f
+        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0, :64]!
+
+2:      // Make sure we have at least 64 bytes to copy.
+        subs        r2, r2, #64
+        blo         2f
+
+1:      // The main loop copies 64 bytes at a time.
+        vld1.8      {d0  - d3},   [r1]!
+        vld1.8      {d4  - d7},   [r1]!
+        subs        r2, r2, #64
+        vstmia      r0!, {d0 - d7}
+        pld         [r1, #(64*10)]
+        bhs         1b
+
+2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds        r2, r2, #32
+        blo         3f
+
+        // 32 bytes. These cache lines were already preloaded.
+        vld1.8      {d0 - d3},  [r1]!
+        sub         r2, r2, #32
+        vst1.8      {d0 - d3},  [r0, :128]!
+3:      // Less than 32 left.
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         .L_copy_less_than_16_unknown_align
+        // Copies 16 bytes, destination 128 bits aligned.
+        vld1.8      {d0, d1}, [r1]!
+        vst1.8      {d0, d1}, [r0, :128]!
+
+.L_copy_less_than_16_unknown_align:
+        // Copy up to 15 bytes (count in r2).
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vld1.8      {d0}, [r1]!
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+2:      // Copy 0 to 4 bytes.
+        lsls        r2, r2, #31
+        itt         ne
+        ldrbne      lr, [r1], #1
+        strbne      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1]
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0]
+
+        pop         {r0, pc}
diff --git a/libc/arch-arm/cortex-a53/cortex-a53.mk b/libc/arch-arm/cortex-a53/cortex-a53.mk
index b5c337c..14aaa71 100644
--- a/libc/arch-arm/cortex-a53/cortex-a53.mk
+++ b/libc/arch-arm/cortex-a53/cortex-a53.mk
@@ -1 +1,21 @@
-include bionic/libc/arch-arm/cortex-a7/cortex-a7.mk
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a53/bionic/memcpy.S \
+    arch-arm/cortex-a53/bionic/__strcat_chk.S \
+    arch-arm/cortex-a53/bionic/__strcpy_chk.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a7/bionic/memset.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a15/bionic/stpcpy.S \
+    arch-arm/cortex-a15/bionic/strcat.S \
+    arch-arm/cortex-a15/bionic/strcmp.S \
+    arch-arm/cortex-a15/bionic/strcpy.S \
+    arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
+    arch-arm/generic/bionic/memcmp.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a7/bionic/memset.S b/libc/arch-arm/cortex-a7/bionic/memset.S
new file mode 100644
index 0000000..6365b06
--- /dev/null
+++ b/libc/arch-arm/cortex-a7/bionic/memset.S
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+        /*
+         * Optimized memset() for ARM.
+         *
+         * memset() returns its first argument.
+         */
+
+        .fpu        neon
+        .syntax     unified
+
+ENTRY(__memset_chk)
+        cmp         r2, r3
+        bls         .L_done
+
+        // Preserve lr for backtrace.
+        push        {lr}
+        .cfi_def_cfa_offset 4
+        .cfi_rel_offset lr, 0
+
+        ldr         r0, error_message
+        ldr         r1, error_code
+1:
+        add         r0, pc
+        bl          __fortify_chk_fail
+error_code:
+        .word       BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+        .word       error_string-(1b+8)
+END(__memset_chk)
+
+ENTRY(bzero)
+        mov         r2, r1
+        mov         r1, #0
+.L_done:
+        // Fall through to memset...
+END(bzero)
+
+ENTRY(memset)
+        mov         r3, r0
+        // At this point only d0, d1 are going to be used below.
+        vdup.8      q0, r1
+        cmp         r2, #16
+        blo         .L_set_less_than_16_unknown_align
+
+.L_check_alignment:
+        // Align destination to a double word to avoid the store crossing
+        // a cache line boundary.
+        ands        ip, r3, #7
+        bne         .L_do_double_word_align
+
+.L_double_word_aligned:
+        // Duplicate since the less than 64 can use d2, d3.
+        vmov        q1, q0
+        subs        r2, #64
+        blo         .L_set_less_than_64
+
+        // Duplicate the copy value so that we can store 64 bytes at a time.
+        vmov        q2, q0
+        vmov        q3, q0
+
+1:      // Main loop stores 64 bytes at a time.
+        subs        r2, #64
+        vstmia      r3!, {d0 - d7}
+        bge         1b
+
+.L_set_less_than_64:
+        // Restore r2 to the count of bytes left to set.
+        add         r2, #64
+        lsls        ip, r2, #27
+        bcc         .L_set_less_than_32
+        // Set 32 bytes.
+        vstmia      r3!, {d0 - d3}
+
+.L_set_less_than_32:
+        bpl         .L_set_less_than_16
+        // Set 16 bytes.
+        vstmia      r3!, {d0, d1}
+
+.L_set_less_than_16:
+        // Less than 16 bytes to set.
+        lsls        ip, r2, #29
+        bcc         .L_set_less_than_8
+
+        // Set 8 bytes.
+        vstmia      r3!, {d0}
+
+.L_set_less_than_8:
+        bpl         .L_set_less_than_4
+        // Set 4 bytes
+        vst1.32     {d0[0]}, [r3]!
+
+.L_set_less_than_4:
+        lsls        ip, r2, #31
+        it          ne
+        strbne      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3]
+        bx          lr
+
+.L_do_double_word_align:
+        rsb         ip, ip, #8
+        sub         r2, r2, ip
+
+        // Do this comparison now, otherwise we'll need to save a
+        // register to the stack since we've used all available
+        // registers.
+        cmp         ip, #4
+        blo         1f
+
+        // Need to do a four byte copy.
+        movs        ip, ip, lsl #31
+        it          mi
+        strbmi      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        vst1.32     {d0[0]}, [r3]!
+        b           .L_double_word_aligned
+
+1:
+        // No four byte copy.
+        movs        ip, ip, lsl #31
+        it          mi
+        strbmi      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        b           .L_double_word_aligned
+
+.L_set_less_than_16_unknown_align:
+        // Set up to 15 bytes.
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vst1.8      {d0}, [r3]!
+1:      bge         2f
+        vst1.32     {d0[0]}, [r3]!
+2:      movs        ip, r2, lsl #31
+        it          mi
+        strbmi      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        bx          lr
+END(memset)
+
+        .data
+error_string:
+        .string     "memset: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a7/cortex-a7.mk b/libc/arch-arm/cortex-a7/cortex-a7.mk
index 9af03d9..3629a57 100644
--- a/libc/arch-arm/cortex-a7/cortex-a7.mk
+++ b/libc/arch-arm/cortex-a7/cortex-a7.mk
@@ -1 +1,19 @@
-include bionic/libc/arch-arm/cortex-a15/cortex-a15.mk
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a7/bionic/memset.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a15/bionic/memcpy.S \
+    arch-arm/cortex-a15/bionic/stpcpy.S \
+    arch-arm/cortex-a15/bionic/strcat.S \
+    arch-arm/cortex-a15/bionic/__strcat_chk.S \
+    arch-arm/cortex-a15/bionic/strcmp.S \
+    arch-arm/cortex-a15/bionic/strcpy.S \
+    arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+    arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
+    arch-arm/generic/bionic/memcmp.S \
+
+libc_bionic_src_files_arm += \
+    arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
index 5e81305..6ab5a69 100644
--- a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
@@ -133,8 +133,7 @@ ENTRY_PRIVATE(MEMCPY_BASE)
         strbcs      ip, [r0], #1
         strbcs      lr, [r0], #1
 
-        ldmfd       sp!, {r0, lr}
-        bx          lr
+        ldmfd       sp!, {r0, pc}
 END(MEMCPY_BASE)
 
 ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S
index 8ee6ac2..b39fcc4 100644
--- a/libc/arch-arm/cortex-a9/bionic/memset.S
+++ b/libc/arch-arm/cortex-a9/bionic/memset.S
@@ -69,12 +69,9 @@ END(bzero)
 ENTRY(memset)
         // The neon memset only wins for less than 132.
         cmp         r2, #132
-        bhi         __memset_large_copy
-
-        stmfd       sp!, {r0}
-        .cfi_def_cfa_offset 4
-        .cfi_rel_offset r0, 0
+        bhi         .L_memset_large_copy
 
+        mov         r3, r0
         vdup.8      q0, r1
 
         /* make sure we have at least 32 bytes to write */
@@ -84,7 +81,7 @@ ENTRY(memset)
 
 1:      /* The main loop writes 32 bytes at a time */
         subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d0 - d3}, [r3]!
         bhs         1b
 
 2:      /* less than 32 left */
@@ -93,22 +90,20 @@ ENTRY(memset)
         beq         3f
 
         // writes 16 bytes, 128-bits aligned
-        vst1.8      {d0, d1}, [r0]!
+        vst1.8      {d0, d1}, [r3]!
 3:      /* write up to 15-bytes (count in r2) */
         movs        ip, r2, lsl #29
         bcc         1f
-        vst1.8      {d0}, [r0]!
+        vst1.8      {d0}, [r3]!
 1:      bge         2f
-        vst1.32     {d0[0]}, [r0]!
+        vst1.32     {d0[0]}, [r3]!
 2:      movs        ip, r2, lsl #31
-        strbmi      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        ldmfd       sp!, {r0}
+        strbmi      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
         bx          lr
-END(memset)
 
-ENTRY_PRIVATE(__memset_large_copy)
+.L_memset_large_copy:
         /* compute the offset to align the destination
          * offset = (4-(src&3))&3 = -src & 3
          */
@@ -136,8 +131,7 @@ ENTRY_PRIVATE(__memset_large_copy)
         strbcs      r1, [r0], #1
         strbmi      r1, [r0], #1
         subs        r2, r2, r3
-        popls       {r0, r4-r7, lr}   /* return */
-        bxls        lr
+        popls       {r0, r4-r7, pc}   /* return */
 
         /* align the destination to a cache-line */
         mov         r12, r1
@@ -180,9 +174,8 @@ ENTRY_PRIVATE(__memset_large_copy)
         strhmi      r1, [r0], #2
         movs        r2, r2, lsl #2
         strbcs      r1, [r0]
-        ldmfd       sp!, {r0, r4-r7, lr}
-        bx          lr
-END(__memset_large_copy)
+        ldmfd       sp!, {r0, r4-r7, pc}
+END(memset)
 
         .data
 error_string:
diff --git a/libc/arch-arm/cortex-a9/bionic/strcat.S b/libc/arch-arm/cortex-a9/bionic/strcat.S
index f5a855e..9077a74 100644
--- a/libc/arch-arm/cortex-a9/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a9/bionic/strcat.S
@@ -70,7 +70,7 @@
 
     .macro m_scan_byte
     ldrb    r3, [r0]
-    cbz     r3, strcat_r0_scan_done
+    cbz     r3, .Lstrcat_r0_scan_done
     add     r0, #1
     .endm // m_scan_byte
 
@@ -84,10 +84,10 @@ ENTRY(strcat)
     // Quick check to see if src is empty.
     ldrb        r2, [r1]
     pld         [r1, #0]
-    cbnz        r2, strcat_continue
+    cbnz        r2, .Lstrcat_continue
     bx          lr
 
-strcat_continue:
+.Lstrcat_continue:
     // To speed up really small dst strings, unroll checking the first 4 bytes.
     m_push
     m_scan_byte
@@ -96,10 +96,10 @@ strcat_continue:
     m_scan_byte
 
     ands    r3, r0, #7
-    bne     strcat_align_src
+    bne     .Lstrcat_align_src
 
     .p2align 2
-strcat_mainloop:
+.Lstrcat_mainloop:
     ldmia   r0!, {r2, r3}
 
     pld     [r0, #64]
@@ -107,28 +107,28 @@ strcat_mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_first_register
+    bne     .Lstrcat_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_second_register
-    b       strcat_mainloop
+    bne     .Lstrcat_zero_in_second_register
+    b       .Lstrcat_mainloop
 
-strcat_zero_in_first_register:
+.Lstrcat_zero_in_first_register:
     sub     r0, r0, #4
 
-strcat_zero_in_second_register:
+.Lstrcat_zero_in_second_register:
     // Check for zero in byte 0.
     tst     ip, #0x80
     it      ne
     subne   r0, r0, #4
-    bne     strcat_r0_scan_done
+    bne     .Lstrcat_r0_scan_done
     // Check for zero in byte 1.
     tst     ip, #0x8000
     it      ne
     subne   r0, r0, #3
-    bne     strcat_r0_scan_done
+    bne     .Lstrcat_r0_scan_done
     // Check for zero in byte 2.
     tst     ip, #0x800000
     it      ne
@@ -137,33 +137,33 @@ strcat_zero_in_second_register:
     // Zero is in byte 3.
     subeq   r0, r0, #1
 
-strcat_r0_scan_done:
+.Lstrcat_r0_scan_done:
     // Unroll the first 8 bytes that will be copied.
-    m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
-    m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
-
-strcpy_finish:
+    m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r5, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+    m_copy_byte reg=r5, cmd=cbnz, label=.Lstrcpy_continue
+
+.Lstrcpy_finish:
     m_ret   inst=pop
 
-strcpy_continue:
+.Lstrcpy_continue:
     pld     [r1, #0]
     ands    r3, r0, #7
-    bne     strcpy_align_dst
+    bne     .Lstrcpy_align_dst
 
-strcpy_check_src_align:
+.Lstrcpy_check_src_align:
     // At this point dst is aligned to a double word, check if src
     // is also aligned to a double word.
     ands    r3, r1, #7
-    bne     strcpy_unaligned_copy
+    bne     .Lstrcpy_unaligned_copy
 
     .p2align 2
-strcpy_mainloop:
+.Lstrcpy_mainloop:
     ldmia   r1!, {r2, r3}
 
     pld     [r1, #64]
@@ -171,17 +171,17 @@ strcpy_mainloop:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_mainloop
+    b       .Lstrcpy_mainloop
 
-strcpy_zero_in_first_register:
+.Lstrcpy_zero_in_first_register:
     lsls    lr, ip, #17
     itt     ne
     strbne  r2, [r0]
@@ -198,7 +198,7 @@ strcpy_zero_in_first_register:
     strb    r3, [r0]
     m_ret   inst=pop
 
-strcpy_zero_in_second_register:
+.Lstrcpy_zero_in_second_register:
     lsls    lr, ip, #17
     ittt    ne
     stmiane r0!, {r2}
@@ -218,18 +218,18 @@ strcpy_zero_in_second_register:
     strb    r4, [r0]
     m_ret   inst=pop
 
-strcpy_align_dst:
+.Lstrcpy_align_dst:
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     strcpy_align_to_32
+    beq     .Lstrcpy_align_to_32
 
     ldrb    r2, [r1], #1
     strb    r2, [r0], #1
-    cbz     r2, strcpy_complete
+    cbz     r2, .Lstrcpy_complete
 
-strcpy_align_to_32:
-    bcc     strcpy_align_to_64
+.Lstrcpy_align_to_32:
+    bcc     .Lstrcpy_align_to_64
 
     ldrb    r4, [r1], #1
     strb    r4, [r0], #1
@@ -242,76 +242,83 @@ strcpy_align_to_32:
     it      eq
     m_ret   inst=popeq
 
-strcpy_align_to_64:
+.Lstrcpy_align_to_64:
     tst     r3, #4
-    beq     strcpy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
-    stmia   r0!, {r2}
-    b       strcpy_check_src_align
+    beq     .Lstrcpy_check_src_align
+    // Read one byte at a time since we don't know the src alignment
+    // and we don't want to read into a different page.
+    ldrb    r4, [r1], #1
+    strb    r4, [r0], #1
+    cbz     r4, .Lstrcpy_complete
+    ldrb    r5, [r1], #1
+    strb    r5, [r0], #1
+    cbz     r5, .Lstrcpy_complete
+    ldrb    r4, [r1], #1
+    strb    r4, [r0], #1
+    cbz     r4, .Lstrcpy_complete
+    ldrb    r5, [r1], #1
+    strb    r5, [r0], #1
+    cbz     r5, .Lstrcpy_complete
+    b       .Lstrcpy_check_src_align
 
-strcpy_complete:
+.Lstrcpy_complete:
     m_ret   inst=pop
 
-strcpy_unaligned_copy:
+.Lstrcpy_unaligned_copy:
     // Dst is aligned to a double word, while src is at an unknown alignment.
     // There are 7 different versions of the unaligned copy code
     // to prevent overreading the src. The mainloop of every single version
     // will store 64 bits per loop. The difference is how much of src can
     // be read without potentially crossing a page boundary.
     tbb     [pc, r3]
-strcpy_unaligned_branchtable:
+.Lstrcpy_unaligned_branchtable:
     .byte 0
-    .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
-    .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign7 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign6 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign5 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign4 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign3 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign2 - .Lstrcpy_unaligned_branchtable)/2)
+    .byte ((.Lstrcpy_unalign1 - .Lstrcpy_unaligned_branchtable)/2)
 
     .p2align 2
     // Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.Lstrcpy_unalign7:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldrb    r3, [r1]
-    cbz     r3, strcpy_unalign7_copy5bytes
+    cbz     r3, .Lstrcpy_unalign7_copy5bytes
     ldrb    r4, [r1, #1]
-    cbz     r4, strcpy_unalign7_copy6bytes
+    cbz     r4, .Lstrcpy_unalign7_copy6bytes
     ldrb    r5, [r1, #2]
-    cbz     r5, strcpy_unalign7_copy7bytes
+    cbz     r5, .Lstrcpy_unalign7_copy7bytes
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     lsrs    ip, r3, #24
     stmia   r0!, {r2, r3}
-    beq     strcpy_unalign_return
-    b       strcpy_unalign7
+    beq     .Lstrcpy_unalign_return
+    b       .Lstrcpy_unalign7
 
-strcpy_unalign7_copy5bytes:
+.Lstrcpy_unalign7_copy5bytes:
     stmia   r0!, {r2}
     strb    r3, [r0]
-strcpy_unalign_return:
+.Lstrcpy_unalign_return:
     m_ret   inst=pop
 
-strcpy_unalign7_copy6bytes:
+.Lstrcpy_unalign7_copy6bytes:
     stmia   r0!, {r2}
     strb    r3, [r0], #1
     strb    r4, [r0], #1
     m_ret   inst=pop
 
-strcpy_unalign7_copy7bytes:
+.Lstrcpy_unalign7_copy7bytes:
     stmia   r0!, {r2}
     strb    r3, [r0], #1
     strb    r4, [r0], #1
@@ -320,30 +327,30 @@ strcpy_unalign7_copy7bytes:
 
     .p2align 2
     // Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.Lstrcpy_unalign6:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldrb    r4, [r1]
-    cbz     r4, strcpy_unalign_copy5bytes
+    cbz     r4, .Lstrcpy_unalign_copy5bytes
     ldrb    r5, [r1, #1]
-    cbz     r5, strcpy_unalign_copy6bytes
+    cbz     r5, .Lstrcpy_unalign_copy6bytes
 
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     tst     r3, #0xff0000
-    beq     strcpy_unalign6_copy7bytes
+    beq     .Lstrcpy_unalign6_copy7bytes
     lsrs    ip, r3, #24
     stmia   r0!, {r2, r3}
-    beq     strcpy_unalign_return
-    b       strcpy_unalign6
+    beq     .Lstrcpy_unalign_return
+    b       .Lstrcpy_unalign6
 
-strcpy_unalign6_copy7bytes:
+.Lstrcpy_unalign6_copy7bytes:
     stmia   r0!, {r2}
     strh    r3, [r0], #2
     lsr     r3, #16
@@ -352,16 +359,16 @@ strcpy_unalign6_copy7bytes:
 
     .p2align 2
     // Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.Lstrcpy_unalign5:
     ldr     r2, [r1], #4
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldrb    r4, [r1]
-    cbz     r4, strcpy_unalign_copy5bytes
+    cbz     r4, .Lstrcpy_unalign_copy5bytes
 
     ldr     r3, [r1], #4
 
@@ -370,17 +377,17 @@ strcpy_unalign5:
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign5
+    b       .Lstrcpy_unalign5
 
-strcpy_unalign_copy5bytes:
+.Lstrcpy_unalign_copy5bytes:
     stmia   r0!, {r2}
     strb    r4, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy6bytes:
+.Lstrcpy_unalign_copy6bytes:
     stmia   r0!, {r2}
     strb    r4, [r0], #1
     strb    r5, [r0]
@@ -388,13 +395,13 @@ strcpy_unalign_copy6bytes:
 
     .p2align 2
     // Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.Lstrcpy_unalign4:
     ldmia   r1!, {r2}
 
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     ldmia   r1!, {r3}
     pld     [r1, #64]
@@ -402,20 +409,20 @@ strcpy_unalign4:
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign4
+    b       .Lstrcpy_unalign4
 
     .p2align 2
     // Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.Lstrcpy_unalign3:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign3_copy1byte
+    cbz     r2, .Lstrcpy_unalign3_copy1byte
     ldrb    r3, [r1, #1]
-    cbz     r3, strcpy_unalign3_copy2bytes
+    cbz     r3, .Lstrcpy_unalign3_copy2bytes
     ldrb    r4, [r1, #2]
-    cbz     r4, strcpy_unalign3_copy3bytes
+    cbz     r4, .Lstrcpy_unalign3_copy3bytes
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
@@ -423,26 +430,26 @@ strcpy_unalign3:
     pld     [r1, #64]
 
     lsrs    lr, r2, #24
-    beq     strcpy_unalign_copy4bytes
+    beq     .Lstrcpy_unalign_copy4bytes
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign3
+    b       .Lstrcpy_unalign3
 
-strcpy_unalign3_copy1byte:
+.Lstrcpy_unalign3_copy1byte:
     strb    r2, [r0]
     m_ret   inst=pop
 
-strcpy_unalign3_copy2bytes:
+.Lstrcpy_unalign3_copy2bytes:
     strb    r2, [r0], #1
     strb    r3, [r0]
     m_ret   inst=pop
 
-strcpy_unalign3_copy3bytes:
+.Lstrcpy_unalign3_copy3bytes:
     strb    r2, [r0], #1
     strb    r3, [r0], #1
     strb    r4, [r0]
@@ -450,34 +457,34 @@ strcpy_unalign3_copy3bytes:
 
     .p2align 2
     // Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.Lstrcpy_unalign2:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign_copy1byte
+    cbz     r2, .Lstrcpy_unalign_copy1byte
     ldrb    r3, [r1, #1]
-    cbz     r3, strcpy_unalign_copy2bytes
+    cbz     r3, .Lstrcpy_unalign_copy2bytes
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
     pld     [r1, #64]
 
     tst     r2, #0xff0000
-    beq     strcpy_unalign_copy3bytes
+    beq     .Lstrcpy_unalign_copy3bytes
     lsrs    ip, r2, #24
-    beq     strcpy_unalign_copy4bytes
+    beq     .Lstrcpy_unalign_copy4bytes
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign2
+    b       .Lstrcpy_unalign2
 
     .p2align 2
     // Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.Lstrcpy_unalign1:
     ldrb    r2, [r1]
-    cbz     r2, strcpy_unalign_copy1byte
+    cbz     r2, .Lstrcpy_unalign_copy1byte
 
     ldr     r2, [r1], #4
     ldr     r3, [r1], #4
@@ -487,62 +494,62 @@ strcpy_unalign1:
     sub     ip, r2, #0x01010101
     bic     ip, ip, r2
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_first_register
+    bne     .Lstrcpy_zero_in_first_register
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcpy_zero_in_second_register
+    bne     .Lstrcpy_zero_in_second_register
 
     stmia   r0!, {r2, r3}
-    b       strcpy_unalign1
+    b       .Lstrcpy_unalign1
 
-strcpy_unalign_copy1byte:
+.Lstrcpy_unalign_copy1byte:
     strb    r2, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy2bytes:
+.Lstrcpy_unalign_copy2bytes:
     strb    r2, [r0], #1
     strb    r3, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy3bytes:
+.Lstrcpy_unalign_copy3bytes:
     strh    r2, [r0], #2
     lsr     r2, #16
     strb    r2, [r0]
     m_ret   inst=pop
 
-strcpy_unalign_copy4bytes:
+.Lstrcpy_unalign_copy4bytes:
     stmia   r0, {r2}
     m_ret   inst=pop
 
-strcat_align_src:
+.Lstrcat_align_src:
     // Align to a double word (64 bits).
     rsb     r3, r3, #8
     lsls    ip, r3, #31
-    beq     strcat_align_to_32
+    beq     .Lstrcat_align_to_32
     ldrb    r2, [r0], #1
-    cbz     r2, strcat_r0_update
+    cbz     r2, .Lstrcat_r0_update
 
-strcat_align_to_32:
-    bcc     strcat_align_to_64
+.Lstrcat_align_to_32:
+    bcc     .Lstrcat_align_to_64
     ldrb    r2, [r0], #1
-    cbz     r2, strcat_r0_update
+    cbz     r2, .Lstrcat_r0_update
     ldrb    r2, [r0], #1
-    cbz     r2, strcat_r0_update
+    cbz     r2, .Lstrcat_r0_update
 
-strcat_align_to_64:
+.Lstrcat_align_to_64:
     tst     r3, #4
-    beq     strcat_mainloop
+    beq     .Lstrcat_mainloop
     ldr     r3, [r0], #4
 
     sub     ip, r3, #0x01010101
     bic     ip, ip, r3
     ands    ip, ip, #0x80808080
-    bne     strcat_zero_in_second_register
-    b       strcat_mainloop
+    bne     .Lstrcat_zero_in_second_register
+    b       .Lstrcat_mainloop
 
-strcat_r0_update:
+.Lstrcat_r0_update:
     sub     r0, r0, #1
-    b strcat_r0_scan_done
+    b .Lstrcat_r0_scan_done
 END(strcat)
diff --git a/libc/arch-arm/cortex-a9/bionic/string_copy.S b/libc/arch-arm/cortex-a9/bionic/string_copy.S
index caf5a11..642db0f 100644
--- a/libc/arch-arm/cortex-a9/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a9/bionic/string_copy.S
@@ -244,13 +244,20 @@ ENTRY(strcpy)
 .Lstringcopy_align_to_64:
     tst     r3, #4
     beq     .Lstringcopy_check_src_align
-    ldr     r2, [r1], #4
-
-    sub     ip, r2, #0x01010101
-    bic     ip, ip, r2
-    ands    ip, ip, #0x80808080
-    bne     .Lstringcopy_zero_in_first_register
-    stmia   r0!, {r2}
+    // Read one byte at a time since we don't have any idea about the alignment
+    // of the source and we don't want to read into a different page.
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
+    ldrb    r2, [r1], #1
+    strb    r2, [r0], #1
+    cbz     r2, .Lstringcopy_complete
     b       .Lstringcopy_check_src_align
 
 .Lstringcopy_complete:
diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk
index 7b38de1..db4bcc7 100644
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk
@@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \
     arch-arm/cortex-a9/bionic/strlen.S \
 
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
     arch-arm/generic/bionic/memcmp.S \
 
 libc_bionic_src_files_arm += \
diff --git a/libc/arch-arm/denver/denver.mk b/libc/arch-arm/denver/denver.mk
index 5fddf95..e81f8c7 100644
--- a/libc/arch-arm/denver/denver.mk
+++ b/libc/arch-arm/denver/denver.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
     arch-arm/generic/bionic/memcmp.S \
     arch-arm/denver/bionic/memcpy.S \
     arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/generic/bionic/memchr.S b/libc/arch-arm/generic/bionic/memchr.S
new file mode 100644
index 0000000..cb00d82
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/memchr.S
@@ -0,0 +1,155 @@
+/* Copyright (c) 2010-2015, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+      * Neither the name of Linaro Limited nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+   Written by Dave Gilbert <david.gilbert@linaro.org>
+
+   This memchr routine is optimised on a Cortex-A9 and should work on
+   all ARMv7 processors.   It has a fast past for short sizes, and has
+   an optimised path for large data sets; the worst case is finding the
+   match early in a large data set.
+
+ */
+
+#include <private/bionic_asm.h>
+
+@ 2011-02-07 david.gilbert@linaro.org
+@    Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@    Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@    Removed unneeded cbz from align loop
+
+	.syntax unified
+	.arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+	.text
+	.thumb
+
+@ ---------------------------------------------------------------------------
+	.thumb_func
+ENTRY(memchr)
+	.p2align 4,,15
+	@ r0 = start of memory to scan
+	@ r1 = character to look for
+	@ r2 = length
+	@ returns r0 = pointer to character or NULL if not found
+	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char
+
+	cmp	r2,#16		@ If it's short don't bother with anything clever
+	blt	20f
+
+	tst	r0, #7		@ If it's already aligned skip the next bit
+	beq	10f
+
+	@ Work up to an aligned point
+5:
+	ldrb	r3, [r0],#1
+	subs	r2, r2, #1
+	cmp	r3, r1
+	beq	50f		@ If it matches exit found
+	tst	r0, #7
+	bne	5b		@ If not aligned yet then do next byte
+
+10:
+	@ At this point, we are aligned, we know we have at least 8 bytes to work with
+	push	{r4,r5,r6,r7}
+	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
+	orr	r1, r1, r1, lsl #16
+	bic	r4, r2, #7	@ Number of double words to work with
+	mvns	r7, #0		@ all F's
+	movs	r3, #0
+
+15:
+	ldrd    r5,r6,[r0],#8
+	subs	r4, r4, #8
+	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
+	eor	r6,r6, r1
+	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r5, r3, r7	@ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+	sel	r6, r5, r7	@ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+	cbnz	r6, 60f
+	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again
+
+	pop	{r4,r5,r6,r7}
+	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
+	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done
+
+20:
+	cbz	r2, 40f		@ 0 length or hit the end already then not found
+
+21:  @ Post aligned section, or just a short call
+	ldrb	r3,[r0],#1
+	subs	r2,r2,#1
+	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
+	cbz	r3, 50f
+	bne	21b		@ on r2 flags
+
+40:
+	movs	r0,#0		@ not found
+	bx	lr
+
+50:
+	subs	r0,r0,#1	@ found
+	bx	lr
+
+60:  @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+	@ r0 points to the start of the double word after the one that was tested
+	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
+	cmp	r5, #0
+	itte	eq
+	moveq	r5, r6		@ the end is in the 2nd word
+	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
+	subne	r0,r0,#7	@ or 2nd byte of 1st word
+
+	@ r0 currently points to the 3rd byte of the word containing the hit
+	tst	r5, # CHARTSTMASK(0)	@ 1st character
+	bne	61f
+	adds	r0,r0,#1
+	tst	r5, # CHARTSTMASK(1)	@ 2nd character
+	ittt	eq
+	addeq	r0,r0,#1
+	tsteq	r5, # (3<<15)		@ 2nd & 3rd character
+	@ If not the 3rd must be the last one
+	addeq	r0,r0,#1
+
+61:
+	pop	{r4,r5,r6,r7}
+	subs	r0,r0,#1
+	bx	lr
+END(memchr)
diff --git a/libc/arch-arm/generic/bionic/memcmp.S b/libc/arch-arm/generic/bionic/memcmp.S
index c78dbd4..6643d55 100644
--- a/libc/arch-arm/generic/bionic/memcmp.S
+++ b/libc/arch-arm/generic/bionic/memcmp.S
@@ -221,8 +221,7 @@ ENTRY(memcmp)
         bne         8b
 
 9:      /* restore registers and return */
-        ldmfd       sp!, {r4, lr}
-        bx          lr
+        ldmfd       sp!, {r4, pc}
 
 10:     /* process less than 12 bytes */
         cmp         r2, #0
diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S
index ea5a399..65cba4c 100644
--- a/libc/arch-arm/generic/bionic/memcpy.S
+++ b/libc/arch-arm/generic/bionic/memcpy.S
@@ -194,8 +194,7 @@ ENTRY(memcpy)
 
         /* we're done! restore everything and return */
 1:      ldmfd       sp!, {r5-r11}
-        ldmfd       sp!, {r0, r4, lr}
-        bx          lr
+        ldmfd       sp!, {r0, r4, pc}
 
         /********************************************************************/
 
@@ -385,8 +384,7 @@ ENTRY(memcpy)
 
         /* we're done! restore sp and spilled registers and return */
         add         sp,  sp, #28
-        ldmfd       sp!, {r0, r4, lr}
-        bx          lr
+        ldmfd       sp!, {r0, r4, pc}
 END(memcpy)
 
         // Only reached when the __memcpy_chk check fails.
diff --git a/libc/arch-arm/generic/bionic/memset.S b/libc/arch-arm/generic/bionic/memset.S
index d17a9c4..b8eabbf 100644
--- a/libc/arch-arm/generic/bionic/memset.S
+++ b/libc/arch-arm/generic/bionic/memset.S
@@ -82,8 +82,7 @@ ENTRY(memset)
         strbcs      r1, [r0], #1
         strbmi      r1, [r0], #1
         subs        r2, r2, r3
-        popls       {r0, r4-r7, lr}    /* return */
-        bxls        lr
+        popls       {r0, r4-r7, pc}    /* return */
 
         /* align the destination to a cache-line */
         mov         r12, r1
@@ -126,8 +125,7 @@ ENTRY(memset)
         strhmi      r1, [r0], #2
         movs        r2, r2, lsl #2
         strbcs      r1, [r0]
-        ldmfd       sp!, {r0, r4-r7, lr}
-        bx          lr
+        ldmfd       sp!, {r0, r4-r7, pc}
 END(memset)
 
         .data
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index e49d6d2..016c882 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,5 @@
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
     arch-arm/generic/bionic/memcmp.S \
     arch-arm/generic/bionic/memcpy.S \
     arch-arm/generic/bionic/memset.S \
diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S
index 246f159..1a39c5b 100644
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -40,7 +40,7 @@
 ENTRY(__strcat_chk)
     pld     [r0, #0]
     push    {r0, lr}
-    .cfi_def_cfa_offset 8
+    .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r0, 0
     .cfi_rel_offset lr, 4
     push    {r4, r5}
@@ -177,7 +177,7 @@ ENTRY(__strcat_chk)
 .L_strlen_done:
     add     r2, r3, r4
     cmp     r2, lr
-    bhi     __strcat_chk_failed
+    bhi     .L_strcat_chk_failed
 
     // Set up the registers for the memcpy code.
     mov     r1, r5
@@ -185,20 +185,17 @@ ENTRY(__strcat_chk)
     mov     r2, r4
     add     r0, r0, r3
     pop     {r4, r5}
-END(__strcat_chk)
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore r4
+    .cfi_restore r5
 
-#define MEMCPY_BASE         __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
 #include "memcpy_base.S"
 
-ENTRY_PRIVATE(__strcat_chk_failed)
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
+    // Undo the above cfi directives.
     .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r4, 0
     .cfi_rel_offset r5, 4
-
+.L_strcat_chk_failed:
     ldr     r0, error_message
     ldr     r1, error_code
 1:
@@ -208,7 +205,7 @@ error_code:
     .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
 error_message:
     .word   error_string-(1b+4)
-END(__strcat_chk_failed)
+END(__strcat_chk)
 
     .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S
index db76686..00202f3 100644
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -39,7 +39,7 @@
 ENTRY(__strcpy_chk)
     pld     [r0, #0]
     push    {r0, lr}
-    .cfi_def_cfa_offset 8
+    .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r0, 0
     .cfi_rel_offset lr, 4
 
@@ -149,21 +149,14 @@ ENTRY(__strcpy_chk)
     pld     [r1, #64]
     ldr     r0, [sp]
     cmp     r3, lr
-    bhs     __strcpy_chk_failed
+    bhs     .L_strcpy_chk_failed
 
     // Add 1 for copy length to get the string terminator.
     add     r2, r3, #1
-END(__strcpy_chk)
 
-#define MEMCPY_BASE         __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
 #include "memcpy_base.S"
 
-ENTRY_PRIVATE(__strcpy_chk_failed)
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
-
+.L_strcpy_chk_failed:
     ldr     r0, error_message
     ldr     r1, error_code
 1:
@@ -173,7 +166,7 @@ error_code:
     .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
 error_message:
     .word   error_string-(1b+4)
-END(__strcpy_chk_failed)
+END(__strcpy_chk)
 
     .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 9ff46a8..5d27b57 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -45,7 +45,7 @@
 
 ENTRY(__memcpy_chk)
         cmp         r2, r3
-        bhi         __memcpy_chk_fail
+        bhi         .L_memcpy_chk_fail
 
         // Fall through to memcpy...
 END(__memcpy_chk)
@@ -53,19 +53,20 @@ END(__memcpy_chk)
 ENTRY(memcpy)
         pld     [r1, #64]
         stmfd   sp!, {r0, lr}
-        .cfi_def_cfa_offset 8
+        .cfi_adjust_cfa_offset 8
         .cfi_rel_offset r0, 0
         .cfi_rel_offset lr, 4
-END(memcpy)
 
-#define MEMCPY_BASE         __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
 #include "memcpy_base.S"
 
-ENTRY_PRIVATE(__memcpy_chk_fail)
+        // Undo the cfi directives from above.
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore r0
+        .cfi_restore lr
+.L_memcpy_chk_fail:
         // Preserve lr for backtrace.
         push    {lr}
-        .cfi_def_cfa_offset 4
+        .cfi_adjust_cfa_offset 4
         .cfi_rel_offset lr, 0
 
         ldr     r0, error_message
@@ -77,7 +78,7 @@ error_code:
         .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
 error_message:
         .word   error_string-(1b+4)
-END(__memcpy_chk_fail)
+END(memcpy)
 
         .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 035dcf1..76c5a84 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -1,123 +1,191 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 32 byte
- * cache line.
- */
-
-// Assumes neon instructions and a cache line size of 32 bytes.
-
-ENTRY_PRIVATE(MEMCPY_BASE)
-        .cfi_def_cfa_offset 8
-        .cfi_rel_offset r0, 0
-        .cfi_rel_offset lr, 4
-
-        /* do we have at least 16-bytes to copy (needed for alignment below) */
-        cmp         r2, #16
-        blo         5f
-
-        /* align destination to cache-line for the write-buffer */
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         2f
-
-        /* copy up to 15-bytes (count in r3) */
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-        itt         mi
-        ldrbmi      lr, [r1], #1
-        strbmi      lr, [r0], #1
-        itttt       cs
-        ldrbcs      ip, [r1], #1
-        ldrbcs      lr, [r1], #1
-        strbcs      ip, [r0], #1
-        strbcs      lr, [r0], #1
-        movs        ip, r3, lsl #29
-        bge         1f
-        // copies 4 bytes, destination 32-bits aligned
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1:      bcc         2f
-        // copies 8 bytes, destination 64-bits aligned
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0, :64]!
-
-2:      /* make sure we have at least 64 bytes to copy */
-        subs        r2, r2, #64
-        blo         2f
-
-1:      /* The main loop copies 64 bytes at a time */
-        vld1.8      {d0  - d3},   [r1]!
-        vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(32*8)]
-        subs        r2, r2, #64
-        vst1.8      {d0  - d3},   [r0, :128]!
-        vst1.8      {d4  - d7},   [r0, :128]!
-        bhs         1b
-
-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        adds        r2, r2, #32
-        blo         4f
-
-        /* Copy 32 bytes. These cache lines were already preloaded */
-        vld1.8      {d0 - d3},  [r1]!
-        sub         r2, r2, #32
-        vst1.8      {d0 - d3},  [r0, :128]!
-
-4:      /* less than 32 left */
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         5f
-        // copies 16 bytes, 128-bits aligned
-        vld1.8      {d0, d1}, [r1]!
-        vst1.8      {d0, d1}, [r0, :128]!
-
-5:      /* copy up to 15-bytes (count in r2) */
-        movs        ip, r2, lsl #29
-        bcc         1f
-        vld1.8      {d0}, [r1]!
-        vst1.8      {d0}, [r0]!
-1:      bge         2f
-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2:      movs        ip, r2, lsl #31
-        itt         mi
-        ldrbmi      r3, [r1], #1
-        strbmi      r3, [r0], #1
-        itttt       cs
-        ldrbcs      ip, [r1], #1
-        ldrbcs      lr, [r1], #1
-        strbcs      ip, [r0], #1
-        strbcs      lr, [r0], #1
-
-        ldmfd       sp!, {r0, lr}
-        bx          lr
-END(MEMCPY_BASE)
+/***************************************************************************
+ Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of The Linux Foundation nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+  ***************************************************************************/
+
+/* Assumes neon instructions and a cache line size of 64 bytes. */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define PLDOFFS	(10)
+#define PLDTHRESH (PLDOFFS)
+#define BBTHRESH (4096/64)
+#define PLDSIZE (64)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+	.text
+	.fpu    neon
+
+.L_memcpy_base:
+	cmp	r2, #4
+	blt	.L_neon_lt4
+	cmp	r2, #16
+	blt	.L_neon_lt16
+	cmp	r2, #32
+	blt	.L_neon_16
+	cmp	r2, #64
+	blt	.L_neon_copy_32_a
+
+	mov	r12, r2, lsr #6
+	cmp	r12, #PLDTHRESH
+	ble	.L_neon_copy_64_loop_nopld
+
+	push	{r9, r10}
+	.cfi_adjust_cfa_offset 8
+	.cfi_rel_offset r9, 0
+	.cfi_rel_offset r10, 4
+
+	cmp	r12, #BBTHRESH
+	ble	.L_neon_prime_pump
+
+	add	lr, r0, #0x400
+	add	r9, r1, #(PLDOFFS*PLDSIZE)
+	sub	lr, lr, r9
+	lsl	lr, lr, #21
+	lsr	lr, lr, #21
+	add	lr, lr, #(PLDOFFS*PLDSIZE)
+	cmp	r12, lr, lsr #6
+	ble	.L_neon_prime_pump
+
+	itt	gt
+	movgt	r9, #(PLDOFFS)
+	rsbsgt	r9, r9, lr, lsr #6
+	ble	.L_neon_prime_pump
+
+	add	r10, r1, lr
+	bic	r10, #0x3F
+
+	sub	r12, r12, lr, lsr #6
+
+	cmp	r9, r12
+	itee	le
+	suble	r12, r12, r9
+	movgt	r9, r12
+	movgt	r12, #0
+
+	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]
+.L_neon_copy_64_loop_outer_doublepld:
+	pld	[r1, #((PLDOFFS)*PLDSIZE)]
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	ldr	r3, [r10]
+	subs	r9, r9, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	add	r10, #64
+	bne	.L_neon_copy_64_loop_outer_doublepld
+	cmp	r12, #0
+	beq	.L_neon_pop_before_nopld
+
+	cmp	r12, #(512*1024/64)
+	blt	.L_neon_copy_64_loop_outer
+
+.L_neon_copy_64_loop_ddr:
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	pld	[r10]
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	add	r10, #64
+	bne	.L_neon_copy_64_loop_ddr
+	b	.L_neon_pop_before_nopld
+
+.L_neon_prime_pump:
+	mov	lr, #(PLDOFFS*PLDSIZE)
+	add	r10, r1, #(PLDOFFS*PLDSIZE)
+	bic	r10, #0x3F
+	sub	r12, r12, #PLDOFFS
+	ldr	r3, [r10, #(-1*PLDSIZE)]
+
+.L_neon_copy_64_loop_outer:
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]!
+	ldr	r3, [r10]
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]!
+	add	r10, #64
+	bne	.L_neon_copy_64_loop_outer
+
+.L_neon_pop_before_nopld:
+	mov	r12, lr, lsr #6
+	pop	{r9, r10}
+	.cfi_adjust_cfa_offset -8
+	.cfi_restore r9
+	.cfi_restore r10
+
+.L_neon_copy_64_loop_nopld:
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]!
+	subs	r12, r12, #1
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]!
+	bne	.L_neon_copy_64_loop_nopld
+	ands	r2, r2, #0x3f
+	beq	.L_neon_exit
+
+.L_neon_copy_32_a:
+	movs	r3, r2, lsl #27
+	bcc	.L_neon_16
+	vld1.32	{q0,q1}, [r1]!
+	vst1.32	{q0,q1}, [r0]!
+
+.L_neon_16:
+	bpl	.L_neon_lt16
+	vld1.32	{q8}, [r1]!
+	vst1.32	{q8}, [r0]!
+	ands	r2, r2, #0x0f
+	beq	.L_neon_exit
+
+.L_neon_lt16:
+	movs	r3, r2, lsl #29
+	bcc	1f
+	vld1.8	{d0}, [r1]!
+	vst1.8	{d0}, [r0]!
+1:
+	bge	.L_neon_lt4
+	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!
+	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4:
+	movs	r2, r2, lsl #31
+	itt	cs
+	ldrhcs	r3, [r1], #2
+	strhcs	r3, [r0], #2
+	itt	mi
+	ldrbmi	r3, [r1]
+	strbmi	r3, [r0]
+
+.L_neon_exit:
+	pop	{r0, pc}
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S
new file mode 100644
index 0000000..aea7315
--- /dev/null
+++ b/libc/arch-arm/krait/bionic/memmove.S
@@ -0,0 +1,219 @@
+/***************************************************************************
+ Copyright (c) 2009-2014 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+     * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+     * Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+     * Neither the name of The Linux Foundation nor the names of its contributors may
+       be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+  ***************************************************************************/
+
+/***************************************************************************
+ *  Neon memmove: Attempts to do a memmove with Neon registers if possible,
+ *     Inputs:
+ *        dest: The destination buffer
+ *        src: The source buffer
+ *        n: The size of the buffer to transfer
+ *     Outputs:
+ *
+ ***************************************************************************/
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+/*
+ * These can be overridden in:
+ *   device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ *   TARGET_USE_KRAIT_PLD_SET := true
+ *   TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ *   TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ *   TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS	(10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+
+	.text
+	.syntax	unified
+	.fpu neon
+	.thumb
+	.thumb_func
+
+//ENTRY(bcopy)
+//        //.cfi_startproc
+//	mov	r12, r0
+//	mov	r0, r1
+//	mov	r1, r12
+//        // Fall through to memmove
+//        //.cfi_endproc
+//END(bcopy)
+
+ENTRY(memmove)
+_memmove_words:
+        //.cfi_startproc
+	.save	{r0, lr}
+	cmp	r2, #0
+	it	ne
+	subsne	r12, r0, r1	// Warning: do not combine these "it" blocks
+	it	eq
+	bxeq	lr
+//	memmove only if r1 < r0 < r1+r2
+	cmp	r0, r1
+	itt	ge
+	addge	r12, r1, r2
+	cmpge	r12, r0
+	it	le
+	ble	memcpy
+	cmp	r2, #4
+	it	le
+	ble	.Lneon_b2f_smallcopy_loop
+	push	{r0, lr}
+	add	r0, r0, r2
+	add	r1, r1, r2
+	cmp	r2, #64
+	it	ge
+	bge	.Lneon_b2f_copy_64
+	cmp	r2, #32
+	it	ge
+	bge	.Lneon_b2f_copy_32
+	cmp	r2, #8
+	it	ge
+	bge	.Lneon_b2f_copy_8
+	b	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+	mov	r12, r2, lsr #6
+	add	r0, r0, #32
+	add	r1, r1, #32
+	cmp	r12, #PLDTHRESH
+	it	le
+	ble	.Lneon_b2f_copy_64_loop_nopld
+	sub	r12, #PLDOFFS
+	sub	lr, r1, #(PLDOFFS)*PLDSIZE
+.Lneon_b2f_copy_64_loop_outer:
+	pld	[lr]
+	sub	r1, r1, #96
+	sub	r0, r0, #96
+	vld1.32	{q0, q1}, [r1]!
+	vld1.32	{q2, q3}, [r1]
+	sub	lr, lr, #64
+	subs	r12, r12, #1
+	vst1.32	{q0, q1}, [r0]!
+	vst1.32	{q2, q3}, [r0]
+	it	ne
+	bne	.Lneon_b2f_copy_64_loop_outer
+	mov	r12, #PLDOFFS
+.Lneon_b2f_copy_64_loop_nopld:
+	sub	r1, r1, #96
+	sub	r0, r0, #96
+	vld1.32	{q8, q9}, [r1]!
+	vld1.32	{q10, q11}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{q8, q9}, [r0]!
+	vst1.32	{q10, q11}, [r0]
+	it	ne
+	bne	.Lneon_b2f_copy_64_loop_nopld
+	ands	r2, r2, #0x3f
+	it	eq
+	beq	.Lneon_memmove_done
+	sub	r1, r1, #32
+	sub	r0, r0, #32
+	cmp	r2, #32
+	it	lt
+	blt	.Lneon_b2f_copy_8
+.Lneon_b2f_copy_32:
+	sub	r1, r1, #32
+	sub	r0, r0, #32
+	vld1.32	{q0, q1}, [r1]
+	vst1.32	{q0, q1}, [r0]
+	ands	r2, r2, #0x1f
+	it	eq
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_8:
+	movs	r12, r2, lsr #0x3
+	it	eq
+	beq	.Lneon_b2f_copy_1
+.Lneon_b2f_copy_8_loop:
+	sub	r1, r1, #8
+	sub	r0, r0, #8
+	vld1.32	{d0}, [r1]
+	subs	r12, r12, #1
+	vst1.32	{d0}, [r0]
+	it	ne
+	bne	.Lneon_b2f_copy_8_loop
+	ands	r2, r2, #0x7
+	beq	.Lneon_memmove_done
+.Lneon_b2f_copy_1:
+	movs	r12, r2, lsl #29
+	itttt	mi
+	submi	r1, r1, #4
+	submi	r0, r0, #4
+	ldrmi	r3, [r1]
+	strmi	r3, [r0]
+	movs	r2, r2, lsl #31
+	itttt	cs
+	subcs	r1, r1, #2
+	subcs	r0, r0, #2
+	ldrhcs	r3, [r1]
+	strhcs	r3, [r0]
+	itttt	mi
+	submi	r1, r1, #1
+	submi	r0, r0, #1
+	ldrbmi	r12, [r1]
+	strbmi	r12, [r0]
+.Lneon_memmove_done:
+	pop	{r0, pc}
+.Lneon_b2f_smallcopy_loop:
+	// 4 bytes or less
+	add	r1, r1, r2
+	add	r0, r0, r2
+	movs	r12, r2, lsl #29
+	itttt	mi
+	submi	r1, r1, #4
+	submi	r0, r0, #4
+	ldrmi	r3, [r1]
+	strmi	r3, [r0]
+	movs	r2, r2, lsl #31
+	itttt	cs
+	subcs	r1, r1, #2
+	subcs	r0, r0, #2
+	ldrhcs	r3, [r1]
+	strhcs	r3, [r0]
+	itttt	mi
+	submi	r1, r1, #1
+	submi	r0, r0, #1
+	ldrbmi	r12, [r1]
+	strbmi	r12, [r0]
+	bx	lr
+//	.cfi_endproc
+END(memmove)
+
diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S
index a4fbe17..ae05965 100644
--- a/libc/arch-arm/krait/bionic/memset.S
+++ b/libc/arch-arm/krait/bionic/memset.S
@@ -69,10 +69,7 @@ END(bzero)
 
 /* memset() returns its first argument.  */
 ENTRY(memset)
-        stmfd       sp!, {r0}
-        .cfi_def_cfa_offset 4
-        .cfi_rel_offset r0, 0
-
+        mov         r3, r0
         vdup.8      q0, r1
 
         /* make sure we have at least 32 bytes to write */
@@ -82,7 +79,7 @@ ENTRY(memset)
 
 1:      /* The main loop writes 32 bytes at a time */
         subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0]!
+        vst1.8      {d0 - d3}, [r3]!
         bhs         1b
 
 2:      /* less than 32 left */
@@ -91,18 +88,17 @@ ENTRY(memset)
         beq         3f
 
         // writes 16 bytes, 128-bits aligned
-        vst1.8      {d0, d1}, [r0]!
+        vst1.8      {d0, d1}, [r3]!
 3:      /* write up to 15-bytes (count in r2) */
         movs        ip, r2, lsl #29
         bcc         1f
-        vst1.8      {d0}, [r0]!
+        vst1.8      {d0}, [r3]!
 1:      bge         2f
-        vst1.32     {d0[0]}, [r0]!
+        vst1.32     {d0[0]}, [r3]!
 2:      movs        ip, r2, lsl #31
-        strbmi      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        strbcs      r1, [r0], #1
-        ldmfd       sp!, {r0}
+        strbmi      r1, [r3], #1
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
         bx          lr
 END(memset)
 
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 88b4d66..5f5b414 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -1,9 +1,19 @@
 libc_bionic_src_files_arm += \
-    arch-arm/krait/bionic/memcpy.S \
     arch-arm/krait/bionic/memset.S \
     arch-arm/krait/bionic/strcmp.S \
     arch-arm/krait/bionic/__strcat_chk.S \
     arch-arm/krait/bionic/__strcpy_chk.S \
+    arch-arm/krait/bionic/memmove.S
+
+#For some targets we don't need this optimization.
+#Corresponding flag is defined in device specific folder.
+ifeq ($(TARGET_CPU_MEMCPY_BASE_OPT_DISABLE),true)
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a15/bionic/memcpy.S
+else
+libc_bionic_src_files_arm += \
+    arch-arm/krait/bionic/memcpy.S
+endif
 
 # Use cortex-a15 versions of strcat/strcpy/strlen and standard memmove
 libc_bionic_src_files_arm += \
@@ -13,7 +23,7 @@ libc_bionic_src_files_arm += \
     arch-arm/cortex-a15/bionic/strlen.S \
 
 libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
     arch-arm/generic/bionic/memcmp.S \
 
-libc_bionic_src_files_arm += \
-    arch-arm/denver/bionic/memmove.S \
+
diff --git a/libc/arch-arm/scorpion/scorpion.mk b/libc/arch-arm/scorpion/scorpion.mk
new file mode 100644
index 0000000..ce18a7e
--- /dev/null
+++ b/libc/arch-arm/scorpion/scorpion.mk
@@ -0,0 +1,18 @@
+# Use krait versions of memset/strcmp/memmove
+libc_bionic_src_files_arm += \
+    arch-arm/krait/bionic/memset.S \
+    arch-arm/krait/bionic/strcmp.S \
+    arch-arm/krait/bionic/memmove.S
+
+libc_bionic_src_files_arm += \
+    arch-arm/cortex-a15/bionic/memcpy.S \
+    arch-arm/cortex-a15/bionic/stpcpy.S \
+    arch-arm/cortex-a15/bionic/strcat.S \
+    arch-arm/cortex-a15/bionic/__strcat_chk.S \
+    arch-arm/cortex-a15/bionic/strcpy.S \
+    arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+    arch-arm/cortex-a15/bionic/strlen.S
+
+libc_bionic_src_files_arm += \
+    arch-arm/generic/bionic/memchr.S \
+    arch-arm/generic/bionic/memcmp.S
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 470a038..1b8d534 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -8,7 +8,6 @@ libc_bionic_src_files_arm64 += \
     bionic/__memset_chk.cpp \
     bionic/__strcpy_chk.cpp \
     bionic/__strcat_chk.cpp \
-    bionic/strrchr.cpp \
 
 libc_freebsd_src_files_arm64 += \
     upstream-freebsd/lib/libc/string/wcscat.c \
diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk
index d619c11..3c453bb 100644
--- a/libc/arch-arm64/denver64/denver64.mk
+++ b/libc/arch-arm64/denver64/denver64.mk
@@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \
     arch-arm64/generic/bionic/strlen.S \
     arch-arm64/generic/bionic/strncmp.S \
     arch-arm64/generic/bionic/strnlen.S \
+    arch-arm64/generic/bionic/strrchr.S \
     arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index c5d42ce..f850624 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -22,158 +22,196 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses.
  *
  */
 
+#include <private/bionic_asm.h>
+
 #define dstin	x0
 #define src	x1
 #define count	x2
-#define tmp1	x3
-#define tmp1w	w3
-#define tmp2	x4
-#define tmp2w	w4
-#define tmp3	x5
-#define tmp3w	w5
-#define dst	x6
-
-#define A_l	x7
-#define A_h	x8
-#define B_l	x9
-#define B_h	x10
-#define C_l	x11
-#define C_h	x12
-#define D_l	x13
-#define D_h	x14
-
-	mov	dst, dstin
-	cmp	count, #64
-	b.ge	.Lcpy_not_short
-	cmp	count, #15
-	b.le	.Ltail15tiny
-
-	/* Deal with small copies quickly by dropping straight into the
-	 * exit block.  */
-.Ltail63:
-	/* Copy up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15
-	add	dst, dst, tmp1
-	add	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-48]
-	stp	A_l, A_h, [dst, #-48]
-1:
-	ldp	A_l, A_h, [src, #-32]
-	stp	A_l, A_h, [dst, #-32]
-2:
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
-
-.Ltail15:
-	ands	count, count, #15
-	beq	1f
-	add	src, src, count
-	ldp	A_l, A_h, [src, #-16]
-	add	dst, dst, count
-	stp	A_l, A_h, [dst, #-16]
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw   w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define tmp1	x9
+
+#define L(l) .L ## l
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled. Large copies
+   of more than 96 bytes align the destination and use an unrolled loop
+   processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as non-overlapping copies.
+*/
+
+	prfm    PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+        cmp     count, 16
+        b.ls    L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
 1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
 	ret
 
-.Ltail15tiny:
-	/* Copy up to 15 bytes of data.  Does not assume additional data
-	   being copied.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
+	.p2align 4
+
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
 1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
 	ret
 
-.Lcpy_not_short:
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Copy more data than needed; it's faster than jumping
-	 * around copying sub-Quadword quantities.  We know that
-	 * it can't overrun.  */
-	ldp	A_l, A_h, [src]
-	add	src, src, tmp2
-	stp	A_l, A_h, [dst]
-	add	dst, dst, tmp2
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63
-2:
-	subs	count, count, #128
-	b.ge	.Lcpy_body_large
-	/* Less than 128 bytes to copy, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst]
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	add	src, src, #64
-	add	dst, dst, #64
-	b.ne	.Ltail63
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
 	ret
 
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lcpy_body_large:
-	/* There are at least 128 bytes to copy.  */
-	ldp	A_l, A_h, [src, #0]
-	sub	dst, dst, #16		/* Pre-bias.  */
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.	 There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.	The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	2f
 1:
-	stp	A_l, A_h, [dst, #16]
-	ldp	A_l, A_h, [src, #16]
-	stp	B_l, B_h, [dst, #32]
-	ldp	B_l, B_h, [src, #32]
-	stp	C_l, C_h, [dst, #48]
-	ldp	C_l, C_h, [src, #48]
-	stp	D_l, D_h, [dst, #64]!
-	ldp	D_l, D_h, [src, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #16]
-	stp	B_l, B_h, [dst, #32]
-	stp	C_l, C_h, [dst, #48]
-	stp	D_l, D_h, [dst, #64]
-	add	src, src, #16
-	add	dst, dst, #64 + 16
-	tst	count, #0x3f
-	b.ne	.Ltail63
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.	 The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+2:
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
 	ret
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 8b366a3..c50112d 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -22,319 +22,131 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
- * wchar_t is 4 bytes
+ * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
  */
 
 #include <private/bionic_asm.h>
 
 /* Parameters and result.  */
-#ifdef BCOPY
-#define origdstin	x1
-#define origsrc	x0
-#endif
 #define dstin	x0
 #define src	x1
 #define count	x2
-#define tmp1	x3
-#define tmp1w	w3
-#define tmp2	x4
-#define tmp2w	w4
-#define tmp3	x5
-#define tmp3w	w5
-#define dst	x6
-
-#define A_l	x7
-#define A_h	x8
-#define B_l	x9
-#define B_h	x10
-#define C_l	x11
-#define C_h	x12
-#define D_l	x13
-#define D_h	x14
+#define srcend	x3
+#define dstend	x4
+#define tmp1	x5
+#define A_l	x6
+#define A_h	x7
+#define B_l	x8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	count
+#define E_h	tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+   Larger backwards copies are also handled by memcpy. The only remaining
+   case is forward large copies.  The destination is aligned, and an
+   unrolled loop processes 64 bytes per iteration.
+*/
 
-#ifdef BCOPY
-ENTRY(bcopy)
-	/* Swap src and dst so that a branch to memcpy doesn't cause issues. */
-	mov	tmp1, origsrc
-	mov	origsrc, origdstin
-	mov	origdstin, tmp1
-#elif defined(WMEMMOVE)
+#if defined(WMEMMOVE)
 ENTRY(wmemmove)
 	lsl	count, count, #2
 #else
 ENTRY(memmove)
 #endif
-	cmp	dstin, src
-	b.lo	.Ldownwards
-	add	tmp1, src, count
-	cmp	dstin, tmp1
-	b.hs	memcpy		/* No overlap.  */
-
-	/* Upwards move with potential overlap.
-	 * Need to move from the tail backwards.  SRC and DST point one
-	 * byte beyond the remaining data to move.  */
-	add	dst, dstin, count
-	add	src, src, count
-	cmp	count, #64
-	b.ge	.Lmov_not_short_up
-
-	/* Deal with small moves quickly by dropping straight into the
-	 * exit block.  */
-.Ltail63up:
-	/* Move up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15up
-	sub	dst, dst, tmp1
-	sub	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #32]
-	stp	A_l, A_h, [dst, #32]
-1:
-	ldp	A_l, A_h, [src, #16]
-	stp	A_l, A_h, [dst, #16]
-2:
-	ldp	A_l, A_h, [src]
-	stp	A_l, A_h, [dst]
-.Ltail15up:
-	/* Move up to 15 bytes of data.  Does not assume additional data
-	 * being moved.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src, #-1]
-	strb	tmp1w, [dst, #-1]
-1:
-	ret
-
-.Lmov_not_short_up:
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Move enough data to reach alignment; unlike memcpy, we have to
-	 * be aware of the overlap, which means we can't move data twice.  */
-	tbz	tmp2, #3, 1f
-	ldr	tmp1, [src, #-8]!
-	str	tmp1, [dst, #-8]!
-1:
-	tbz	tmp2, #2, 1f
-	ldr	tmp1w, [src, #-4]!
-	str	tmp1w, [dst, #-4]!
-1:
-	tbz	tmp2, #1, 1f
-	ldrh	tmp1w, [src, #-2]!
-	strh	tmp1w, [dst, #-2]!
-1:
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src, #-1]!
-	strb	tmp1w, [dst, #-1]!
-1:
-
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63up
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.hs	memcpy
+
+	cbz	tmp1, 3f
+	add	dstend, dstin, count
+	add	srcend, src, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.	 There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.	The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+	nop
+1:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.	 The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
 2:
-	subs	count, count, #128
-	b.ge	.Lmov_body_large_up
-	/* Less than 128 bytes to move, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src, #-64]!
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst, #-64]!
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	b.ne	.Ltail63up
-	ret
-
-	/* Critical loop.  Start at a new Icache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lmov_body_large_up:
-	/* There are at least 128 bytes to move.  */
-	ldp	A_l, A_h, [src, #-16]
-	ldp	B_l, B_h, [src, #-32]
-	ldp	C_l, C_h, [src, #-48]
-	ldp	D_l, D_h, [src, #-64]!
-1:
-	stp	A_l, A_h, [dst, #-16]
-	ldp	A_l, A_h, [src, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	ldp	B_l, B_h, [src, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	ldp	C_l, C_h, [src, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	ldp	D_l, D_h, [src, #-64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #-16]
-	stp	B_l, B_h, [dst, #-32]
-	stp	C_l, C_h, [dst, #-48]
-	stp	D_l, D_h, [dst, #-64]!
-	tst	count, #0x3f
-	b.ne	.Ltail63up
-	ret
-
-
-.Ldownwards:
-	/* For a downwards move we can safely use memcpy provided that
-	 * DST is more than 16 bytes away from SRC.  */
-	sub	tmp1, src, #16
-	cmp	dstin, tmp1
-	b.ls	memcpy		/* May overlap, but not critically.  */
-
-	mov	dst, dstin	/* Preserve DSTIN for return value.  */
-	cmp	count, #64
-	b.ge	.Lmov_not_short_down
-
-	/* Deal with small moves quickly by dropping straight into the
-	 * exit block.  */
-.Ltail63down:
-	/* Move up to 48 bytes of data.  At this point we only need the
-	 * bottom 6 bits of count to be accurate.  */
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15down
-	add	dst, dst, tmp1
-	add	src, src, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src, #-48]
-	stp	A_l, A_h, [dst, #-48]
-1:
-	ldp	A_l, A_h, [src, #-32]
-	stp	A_l, A_h, [dst, #-32]
-2:
-	ldp	A_l, A_h, [src, #-16]
-	stp	A_l, A_h, [dst, #-16]
-.Ltail15down:
-	/* Move up to 15 bytes of data.  Does not assume additional data
-	   being moved.  */
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	count, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	count, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	count, #0, 1f
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
-1:
-	ret
-
-.Lmov_not_short_down:
-	/* We don't much care about the alignment of DST, but we want SRC
-	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
-	 * boundaries on both loads and stores.  */
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
-	b.eq	2f
-	sub	count, count, tmp2
-	/* Move enough data to reach alignment; unlike memcpy, we have to
-	 * be aware of the overlap, which means we can't move data twice.  */
-	tbz	tmp2, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	tmp2, #2, 1f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-1:
-	tbz	tmp2, #1, 1f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-1:
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src], #1
-	strb	tmp1w, [dst], #1
-1:
-
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63down
-2:
-	subs	count, count, #128
-	b.ge	.Lmov_body_large_down
-	/* Less than 128 bytes to move, so handle 64 here and then jump
-	 * to the tail.  */
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]
-	stp	A_l, A_h, [dst]
-	stp	B_l, B_h, [dst, #16]
-	stp	C_l, C_h, [dst, #32]
-	stp	D_l, D_h, [dst, #48]
-	tst	count, #0x3f
-	add	src, src, #64
-	add	dst, dst, #64
-	b.ne	.Ltail63down
-	ret
-
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lmov_body_large_down:
-	/* There are at least 128 bytes to move.  */
-	ldp	A_l, A_h, [src, #0]
-	sub	dst, dst, #16		/* Pre-bias.  */
-	ldp	B_l, B_h, [src, #16]
-	ldp	C_l, C_h, [src, #32]
-	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
-1:
-	stp	A_l, A_h, [dst, #16]
-	ldp	A_l, A_h, [src, #16]
-	stp	B_l, B_h, [dst, #32]
-	ldp	B_l, B_h, [src, #32]
-	stp	C_l, C_h, [dst, #48]
-	ldp	C_l, C_h, [src, #48]
-	stp	D_l, D_h, [dst, #64]!
-	ldp	D_l, D_h, [src, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst, #16]
-	stp	B_l, B_h, [dst, #32]
-	stp	C_l, C_h, [dst, #48]
-	stp	D_l, D_h, [dst, #64]
-	add	src, src, #16
-	add	dst, dst, #64 + 16
-	tst	count, #0x3f
-	b.ne	.Ltail63down
-	ret
-#ifdef BCOPY
-END(bcopy)
-#elif defined(WMEMMOVE)
+	ldp	E_l, E_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	E_l, E_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
+#if defined(WMEMMOVE)
 END(wmemmove)
 #else
 END(memmove)
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 7c204b4..4b3b17b 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -22,226 +22,207 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
  *
  */
 
 #include <private/bionic_asm.h>
 
-/* By default we assume that the DC instruction can be used to zero
-   data blocks more efficiently.  In some circumstances this might be
-   unsafe, for example in an asymmetric multiprocessor environment with
-   different DC clear lengths (neither the upper nor lower lengths are
-   safe to use).
-
-   If code may be run in a virtualized environment, then define
-   MAYBE_VIRT.  This will cause the code to cache the system register
-   values rather than re-reading them each call.  */
-
-#define dstin		x0
-#ifdef BZERO
-#define count		x1
-#else
-#define count		x2
-#endif
-#define val		w1
-#define tmp1		x3
-#define tmp1w		w3
-#define tmp2		x4
-#define tmp2w		w4
-#define zva_len_x	x5
-#define zva_len		w5
-#define zva_bits_x	x6
-
-#define A_l		x7
-#define A_lw		w7
-#define dst		x8
-#define tmp3w		w9
-
-#ifdef BZERO
-ENTRY(bzero)
-#else
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define tmp1	x5
+#define tmp1w	w5
+#define tmp2	x6
+#define tmp2w	w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
 ENTRY(memset)
-#endif
-
-	mov	dst, dstin		/* Preserve return value.  */
-#ifdef BZERO
-	b	.Lzero_mem
-#endif
-	ands	A_lw, val, #255
-	b.eq	.Lzero_mem
-	orr	A_lw, A_lw, A_lw, lsl #8
-	orr	A_lw, A_lw, A_lw, lsl #16
-	orr	A_l, A_l, A_l, lsl #32
-.Ltail_maybe_long:
-	cmp	count, #64
-	b.ge	.Lnot_short
-.Ltail_maybe_tiny:
-	cmp	count, #15
-	b.le	.Ltail15tiny
-.Ltail63:
-	ands	tmp1, count, #0x30
-	b.eq	.Ltail15
-	add	dst, dst, tmp1
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	stp	A_l, A_l, [dst, #-48]
-1:
-	stp	A_l, A_l, [dst, #-32]
-2:
-	stp	A_l, A_l, [dst, #-16]
-
-.Ltail15:
-	and	count, count, #15
-	add	dst, dst, count
-	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
-	ret
 
-.Ltail15tiny:
-	/* Set up to 15 bytes.  Does not assume earlier memory
-	   being set.  */
-	tbz	count, #3, 1f
-	str	A_l, [dst], #8
-1:
-	tbz	count, #2, 1f
-	str	A_lw, [dst], #4
-1:
-	tbz	count, #1, 1f
-	strh	A_lw, [dst], #2
-1:
-	tbz	count, #0, 1f
-	strb	A_lw, [dst]
-1:
+	dup	v0.16B, valw
+	add	dstend, dstin, count
+
+	cmp	count, 96
+	b.hi	L(set_long)
+	cmp	count, 16
+	b.hs	L(set_medium)
+	mov	val, v0.D[0]
+
+	/* Set 0..15 bytes.  */
+	tbz	count, 3, 1f
+	str	val, [dstin]
+	str	val, [dstend, -8]
+	ret
+	nop
+1:	tbz	count, 2, 2f
+	str	valw, [dstin]
+	str	valw, [dstend, -4]
+	ret
+2:	cbz	count, 3f
+	strb	valw, [dstin]
+	tbz	count, 1, 3f
+	strh	valw, [dstend, -2]
+3:	ret
+
+	/* Set 17..96 bytes.  */
+L(set_medium):
+	str	q0, [dstin]
+	tbnz	count, 6, L(set96)
+	str	q0, [dstend, -16]
+	tbz	count, 5, 1f
+	str	q0, [dstin, 16]
+	str	q0, [dstend, -32]
+1:	ret
+
+	.p2align 4
+	/* Set 64..96 bytes.  Write 64 bytes from the start and
+	   32 bytes from the end.  */
+L(set96):
+	str	q0, [dstin, 16]
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -32]
 	ret
 
-	/* Critical loop.  Start at a new cache line boundary.  Assuming
-	 * 64 bytes per line, this ensures the entire loop is in one line.  */
-	.p2align 6
-.Lnot_short:
-	neg	tmp2, dst
-	ands	tmp2, tmp2, #15
-	b.eq	2f
-	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
-	 * more than that to set, so we simply store 16 bytes and advance by
-	 * the amount required to reach alignment.  */
-	sub	count, count, tmp2
-	stp	A_l, A_l, [dst]
-	add	dst, dst, tmp2
-	/* There may be less than 63 bytes to go now.  */
-	cmp	count, #63
-	b.le	.Ltail63
-2:
-	sub	dst, dst, #16		/* Pre-bias.  */
-	sub	count, count, #64
-1:
-	stp	A_l, A_l, [dst, #16]
-	stp	A_l, A_l, [dst, #32]
-	stp	A_l, A_l, [dst, #48]
-	stp	A_l, A_l, [dst, #64]!
-	subs	count, count, #64
-	b.ge	1b
-	tst	count, #0x3f
-	add	dst, dst, #16
-	b.ne	.Ltail63
+	.p2align 3
+	nop
+L(set_long):
+	and	valw, valw, 255
+	bic	dst, dstin, 15
+	str	q0, [dstin]
+	cmp	count, 256
+	ccmp	valw, 0, 0, cs
+	b.eq	L(try_zva)
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	add	dst, dst, 16
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+L(tail64):
+	subs	count, count, 64
+	b.hi	1b
+2:	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
 
-	/* For zeroing memory, check to see if we can use the ZVA feature to
-	 * zero entire 'cache' lines.  */
-.Lzero_mem:
-	mov	A_l, #0
-	cmp	count, #63
-	b.le	.Ltail_maybe_tiny
-	neg	tmp2, dst
-	ands	tmp2, tmp2, #15
-	b.eq	1f
-	sub	count, count, tmp2
-	stp	A_l, A_l, [dst]
-	add	dst, dst, tmp2
-	cmp	count, #63
-	b.le	.Ltail63
-1:
-	/* For zeroing small amounts of memory, it's not worth setting up
-	 * the line-clear code.  */
-	cmp	count, #128
-	b.lt	.Lnot_short
-#ifdef MAYBE_VIRT
-	/* For efficiency when virtualized, we cache the ZVA capability.  */
-	adrp	tmp2, .Lcache_clear
-	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
-	tbnz	zva_len, #31, .Lnot_short
-	cbnz	zva_len, .Lzero_by_line
+	.p2align 3
+L(try_zva):
 	mrs	tmp1, dczid_el0
-	tbz	tmp1, #4, 1f
-	/* ZVA not available.  Remember this for next time.  */
-	mov	zva_len, #~0
-	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
-	b	.Lnot_short
-1:
-	mov	tmp3w, #4
-	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
-	lsl	zva_len, tmp3w, zva_len
-	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
-#else
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1, #4, .Lnot_short
-	mov	tmp3w, #4
-	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
-	lsl	zva_len, tmp3w, zva_len
-#endif
-
-.Lzero_by_line:
-	/* Compute how far we need to go to become suitably aligned.  We're
-	 * already at quad-word alignment.  */
-	cmp	count, zva_len_x
-	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
-	sub	zva_bits_x, zva_len_x, #1
-	neg	tmp2, dst
-	ands	tmp2, tmp2, zva_bits_x
-	b.eq	1f			/* Already aligned.  */
-	/* Not aligned, check that there's enough to copy after alignment.  */
-	sub	tmp1, count, tmp2
-	cmp	tmp1, #64
-	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
-	b.lt	.Lnot_short
-	/* We know that there's at least 64 bytes to zero and that it's safe
-	 * to overrun by 64 bytes.  */
-	mov	count, tmp1
-2:
-	stp	A_l, A_l, [dst]
-	stp	A_l, A_l, [dst, #16]
-	stp	A_l, A_l, [dst, #32]
-	subs	tmp2, tmp2, #64
-	stp	A_l, A_l, [dst, #48]
-	add	dst, dst, #64
-	b.ge	2b
-	/* We've overrun a bit, so adjust dst downwards.  */
-	add	dst, dst, tmp2
-1:
-	sub	count, count, zva_len_x
-3:
-	dc	zva, dst
-	add	dst, dst, zva_len_x
-	subs	count, count, zva_len_x
-	b.ge	3b
-	ands	count, count, zva_bits_x
-	b.ne	.Ltail_maybe_long
+	tbnz	tmp1w, 4, L(no_zva)
+	and	tmp1w, tmp1w, 15
+	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
+	b.ne	 L(zva_128)
+
+	/* Write the first and last 64 byte aligned block using stp rather
+	   than using DC ZVA.  This is faster on some cores.
+	 */
+L(zva_64):
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	bic	dst, dst, 63
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+	nop
+1:	dc	zva, dst
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	1b
+	stp	q0, q0, [dst, 0]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
 	ret
-#ifdef BZERO
-END(bzero)
-#else
+
+	.p2align 3
+L(zva_128):
+	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
+	b.ne	L(zva_other)
+
+	str	q0, [dst, 16]
+	stp	q0, q0, [dst, 32]
+	stp	q0, q0, [dst, 64]
+	stp	q0, q0, [dst, 96]
+	bic	dst, dst, 127
+	sub	count, dstend, dst	/* Count is now 128 too large.	*/
+	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
+	add	dst, dst, 128
+1:	dc	zva, dst
+	add	dst, dst, 128
+	subs	count, count, 128
+	b.hi	1b
+	stp	q0, q0, [dstend, -128]
+	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+L(zva_other):
+	mov	tmp2w, 4
+	lsl	zva_lenw, tmp2w, tmp1w
+	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
+	cmp	count, tmp1
+	blo	L(no_zva)
+
+	sub	tmp2, zva_len, 1
+	add	tmp1, dst, zva_len
+	add	dst, dst, 16
+	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
+	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
+	beq	2f
+1:	stp	q0, q0, [dst], 64
+	stp	q0, q0, [dst, -32]
+	subs	count, count, 64
+	b.hi	1b
+2:	mov	dst, tmp1
+	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
+	subs	count, count, zva_len
+	b.lo	4f
+3:	dc	zva, dst
+	add	dst, dst, zva_len
+	subs	count, count, zva_len
+	b.hs	3b
+4:	add	count, count, zva_len
+	b	L(tail64)
+
 END(memset)
-#endif
-
-#ifdef MAYBE_VIRT
-	.bss
-	.p2align 2
-.Lcache_clear:
-	.space 4
-#endif
diff --git a/libc/arch-arm64/generic/bionic/strlen.S b/libc/arch-arm64/generic/bionic/strlen.S
index 3bd9809..6e540fc 100644
--- a/libc/arch-arm64/generic/bionic/strlen.S
+++ b/libc/arch-arm64/generic/bionic/strlen.S
@@ -1,16 +1,16 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013-2015, Linaro Limited
    All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:
        * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
+	 notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
+	 notice, this list of conditions and the following disclaimer in the
+	 documentation and/or other materials provided with the distribution.
        * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
+	 names of its contributors may be used to endorse or promote products
+	 derived from this software without specific prior written permission.
 
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -22,16 +22,19 @@
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
  */
 
 #include <private/bionic_asm.h>
 
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.	 */
+
 /* Arguments and results.  */
 #define srcin		x0
 #define len		x0
@@ -40,87 +43,185 @@
 #define src		x1
 #define data1		x2
 #define data2		x3
-#define data2a		x4
-#define has_nul1	x5
-#define has_nul2	x6
-#define tmp1		x7
-#define tmp2		x8
-#define tmp3		x9
-#define tmp4		x10
-#define zeroones	x11
-#define pos		x12
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+#define L(l) .L ## l
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word. A faster check
+	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+	   false hits for characters 129..255.	*/
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
 #define REP8_80 0x8080808080808080
 
-	/* Start of critial section -- keep to one 64Byte cache line.  */
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+	/* Since strings are short on average, we check the first 16 bytes
+	   of the string for a NUL character.  In order to do an unaligned ldp
+	   safely we have to do a page cross check first.  If there is a NUL
+	   byte we calculate the length from the 2 8-byte words using
+	   conditional select to reduce branch mispredictions (it is unlikely
+	   strlen will be repeatedly called on strings with the same length).
+
+	   If the string is longer than 16 bytes, we align src so don't need
+	   further page cross checks, and process 32 bytes per iteration
+	   using the fast NUL check.  If we encounter non-ASCII characters,
+	   fallback to a second loop using the full NUL check.
+
+	   If the page cross check fails, we read 16 bytes from an aligned
+	   address, remove any characters before the string, and continue
+	   in the main loop using aligned loads.  Since strings crossing a
+	   page in the first 16 bytes are rare (probability of
+	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+	   AArch64 systems have a minimum page size of 4k.  We don't bother
+	   checking for larger page sizes - the cost of setting up the correct
+	   page size is just not worth the extra gain from a small reduction in
+	   the cases taking the slow path.  Note that we only care about
+	   whether the first fetch, which may be misaligned, crosses a page
+	   boundary.  */
+
 ENTRY(strlen)
-	mov	zeroones, #REP8_01
-	bic	src, srcin, #15
-	ands	tmp1, srcin, #15
-	b.ne	.Lmisaligned
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-	/* The inner loop deals with two Dwords at a time.  This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-.Lloop:
-	ldp	data1, data2, [src], #16
-.Lrealigned:
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
+	cmp	tmp1, MIN_PAGE_SIZE - 16
+	b.gt	L(page_cross)
+	ldp	data1, data2, [srcin]
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul1/2 directly.
+	   Since we expect strings to be small and early-exit,
+	   byte-swap the data now so has_null1/2 will be correct.  */
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
+	orr	tmp2, data1, REP8_7f
 	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bics	has_nul2, tmp3, tmp4
-	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
-	b.eq	.Lloop
-	/* End of critical section -- keep to one 64Byte cache line.  */
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
 
-	sub	len, src, srcin
-	cbz	has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
-	mov	data2, data1
-#endif
-	sub	len, len, #8
-	mov	has_nul2, has_nul1
-.Lnul_in_data2:
+	/* Enter with C = has_nul1 == 0.  */
+	csel	has_nul1, has_nul1, has_nul2, cc
+	mov	len, 8
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
+	add	len, len, tmp1, lsr 3
+	ret
+
+	/* The inner loop processes 32 bytes per iteration and uses the fast
+	   NUL check.  If we encounter non-ASCII characters, use a second
+	   loop with the accurate NUL check.  */
+	.p2align 4
+L(main_loop_entry):
+	bic	src, srcin, 15
+	sub	src, src, 16
+L(main_loop):
+	ldp	data1, data2, [src, 32]!
+.Lpage_cross_entry:
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	bne	1f
+	ldp	data1, data2, [src, 16]
+	sub	tmp1, data1, zeroones
+	sub	tmp3, data2, zeroones
+	orr	tmp2, tmp1, tmp3
+	tst	tmp2, zeroones, lsl 7
+	beq	L(main_loop)
+	add	src, src, 16
+1:
+	/* The fast check failed, so do the slower, accurate NUL check.	 */
+	orr	tmp2, data1, REP8_7f
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+
+	/* Enter with C = has_nul1 == 0.  */
+L(tail):
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly.  The
+	   string is 0x01) means we cannot use has_nul1/2 directly.  The
 	   easiest way to get the correct byte is to byte-swap the data
 	   and calculate the syndrome a second time.  */
-	rev	data2, data2
-	sub	tmp1, data2, zeroones
-	orr	tmp2, data2, #REP8_7f
-	bic	has_nul2, tmp1, tmp2
+	csel	data1, data1, data2, cc
+	rev	data1, data1
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	bic	has_nul1, tmp1, tmp2
+#else
+	csel	has_nul1, has_nul1, has_nul2, cc
 #endif
-	sub	len, len, #8
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	sub	len, src, srcin
+	rev	has_nul1, has_nul1
+	add	tmp2, len, 8
+	clz	tmp1, has_nul1
+	csel	len, len, tmp2, cc
+	add	len, len, tmp1, lsr 3
 	ret
 
-.Lmisaligned:
-	cmp	tmp1, #8
-	neg	tmp1, tmp1
-	ldp	data1, data2, [src], #16
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	mov	tmp2, #~0
+L(nonascii_loop):
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	bne	L(tail)
+	ldp	data1, data2, [src, 16]!
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(nonascii_loop)
+	b	L(tail)
+
+	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+	   srcin to 0x7f, so we ignore any NUL bytes before the string.
+	   Then continue in the aligned loop.  */
+L(page_cross):
+	bic	src, srcin, 15
+	ldp	data1, data2, [src]
+	lsl	tmp1, srcin, 3
+	mov	tmp4, -1
 #ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	/* Big-endian.	Early bytes are at MSB.	 */
+	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
 #else
 	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
 #endif
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
-	csinv	data1, data1, xzr, le
-	csel	data2, data2, data2a, le
-	b	.Lrealigned
+	orr	tmp1, tmp1, REP8_80
+	orn	data1, data1, tmp1
+	orn	tmp2, data2, tmp1
+	tst	srcin, 8
+	csel	data1, data1, tmp4, eq
+	csel	data2, data2, tmp2, eq
+	b	L(page_cross_entry)
 
 END(strlen)
diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S
new file mode 100644
index 0000000..46b5031
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strrchr.S
@@ -0,0 +1,171 @@
+/*
+   strrchr - find last instance of a character in a string
+
+   Copyright (c) 2014, ARM Limited
+   All rights Reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the company nor the names of its contributors
+         may be used to endorse or promote products derived from this
+         software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		w1
+
+#define result		x0
+
+#define src		x2
+#define	tmp1		x3
+#define wtmp2		w4
+#define tmp3		x5
+#define src_match	x6
+#define src_offset	x7
+#define const_m1	x8
+#define tmp4		x9
+#define nul_match	x10
+#define chr_match	x11
+
+#define vrepchr		v0
+#define vdata1		v1
+#define vdata2		v2
+#define vhas_nul1	v3
+#define vhas_nul2	v4
+#define vhas_chr1	v5
+#define vhas_chr2	v6
+#define vrepmask_0	v7
+#define vrepmask_c	v16
+#define vend1		v17
+#define vend2		v18
+
+/* Core algorithm.
+
+   For each 32-byte hunk we calculate a 64-bit syndrome value, with
+   two bits per byte (LSB is always in bits 0 and 1, for both big
+   and little-endian systems).  For each tuple, bit 0 is set iff
+   the relevant byte matched the requested character; bit 1 is set
+   iff the relevant byte matched the NUL end of string (we trigger
+   off bit0 for the special case of looking for NUL).  Since the bits
+   in the syndrome reflect exactly the order in which things occur
+   in the original string a count_trailing_zeros() operation will
+   identify exactly which byte is causing the termination, and why.  */
+
+/* Locals and temporaries.  */
+
+ENTRY(strrchr)
+	/* Magic constant 0x40100401 to allow us to identify which lane
+	   matches the requested byte.  Magic constant 0x80200802 used
+	   similarly for NUL termination.  */
+	mov	wtmp2, #0x0401
+	movk	wtmp2, #0x4010, lsl #16
+	dup	vrepchr.16b, chrin
+	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
+	dup	vrepmask_c.4s, wtmp2
+	mov	src_offset, #0
+	ands	tmp1, srcin, #31
+	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+	b.eq	.Laligned
+
+	/* Input string is not 32-byte aligned.  Rather than forcing
+	   the padding bytes to a safe value, we calculate the syndrome
+	   for all the bytes, but then mask off those bits of the
+	   syndrome that are related to the padding.  */
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	neg	tmp1, tmp1
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b	// 128->64
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
+	mov	nul_match, vhas_nul1.2d[0]
+	lsl	tmp1, tmp1, #1
+	mov	const_m1, #~0
+	mov	chr_match, vhas_chr1.2d[0]
+	lsr	tmp3, const_m1, tmp1
+
+	bic	nul_match, nul_match, tmp3	// Mask padding bits.
+	bic	chr_match, chr_match, tmp3	// Mask padding bits.
+	cbnz	nul_match, .Ltail
+
+.Lloop:
+	cmp	chr_match, #0
+	csel	src_match, src, src_match, ne
+	csel	src_offset, chr_match, src_offset, ne
+.Laligned:
+	ld1	{vdata1.16b, vdata2.16b}, [src], #32
+	cmeq	vhas_nul1.16b, vdata1.16b, #0
+	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
+	cmeq	vhas_nul2.16b, vdata2.16b, #0
+	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
+	addp	vend1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
+	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
+	addp	vend1.16b, vend1.16b, vend1.16b	// 128->64
+	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
+	mov	nul_match, vend1.2d[0]
+	mov	chr_match, vhas_chr1.2d[0]
+	cbz	nul_match, .Lloop
+
+	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+	mov	nul_match, vhas_nul1.2d[0]
+
+.Ltail:
+	/* Work out exactly where the string ends.  */
+	sub	tmp4, nul_match, #1
+	eor	tmp4, tmp4, nul_match
+	ands	chr_match, chr_match, tmp4
+	/* And pick the values corresponding to the last match.  */
+	csel	src_match, src, src_match, ne
+	csel	src_offset, chr_match, src_offset, ne
+
+	/* Count down from the top of the syndrome to find the last match.  */
+	clz	tmp3, src_offset
+	/* Src_match points beyond the word containing the match, so we can
+	   simply subtract half the bit-offset into the syndrome.  Because
+	   we are counting down, we need to go back one more character.  */
+	add	tmp3, tmp3, #2
+	sub	result, src_match, tmp3, lsr #1
+	/* But if the syndrome shows no match was found, then return NULL.  */
+	cmp	src_offset, #0
+	csel	result, result, xzr, ne
+
+	ret
+
+END(strrchr)
diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk
index 1b595aa..4512dc5 100644
--- a/libc/arch-arm64/generic/generic.mk
+++ b/libc/arch-arm64/generic/generic.mk
@@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \
     arch-arm64/generic/bionic/strlen.S \
     arch-arm64/generic/bionic/strncmp.S \
     arch-arm64/generic/bionic/strnlen.S \
+    arch-arm64/generic/bionic/strrchr.S \
     arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-arm64/kryo/bionic/memcpy.S b/libc/arch-arm64/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..87e1b3b
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ENTRY(__memcpy_chk)
+  cmp   x2, x3
+  b.hi  __memcpy_chk_fail
+
+  // Fall through to memcpy...
+  b memcpy
+END(__memcpy_chk)
+
+        .align  6
+ENTRY(memcpy)
+  #include "memcpy_base.S"
+END(memcpy)
+
+ENTRY_PRIVATE(__memcpy_chk_fail)
+  // Preserve for accurate backtrace.
+  stp  x29, x30, [sp, -16]!
+  .cfi_def_cfa_offset 16
+  .cfi_rel_offset x29, 0
+  .cfi_rel_offset x30, 8
+
+  adrp  x0, error_string
+  add   x0, x0, :lo12:error_string
+  ldr   x1, error_code
+  bl    __fortify_chk_fail
+error_code:
+  .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+END(__memcpy_chk_fail)
+
+  .data
+  .align 2
+error_string:
+  .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm64/kryo/bionic/memcpy_base.S b/libc/arch-arm64/kryo/bionic/memcpy_base.S
new file mode 100644
index 0000000..0096bb7
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy_base.S
@@ -0,0 +1,244 @@
+/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of The Linux Foundation nor the names of its contributors may
+ *       be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef PLDOFFS
+#undef PLDOFFS
+#endif
+#define PLDOFFS		(16)
+
+#ifdef PLDTHRESH
+#undef PLDTHRESH
+#endif
+#define PLDTHRESH (PLDOFFS)
+
+#ifdef BBTHRESH
+#undef BBTHRESH
+#endif
+#define BBTHRESH (2048/128)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+#ifdef PLDSIZE
+#undef PLDSIZE
+#endif
+#define PLDSIZE	(128)
+
+kryo_bb_memcpy:
+	mov	x11, x0
+	cmp	x2, #4
+	blo	kryo_bb_lt4
+	cmp	x2, #16
+	blo	kryo_bb_lt16
+	cmp	x2, #32
+	blo	kryo_bb_16
+	cmp	x2, #64
+	blo	kryo_bb_copy_32_a
+	cmp	x2, #128
+	blo	kryo_bb_copy_64_a
+
+	// we have at least 127 bytes to achieve 128-byte alignment
+	neg	x3, x1			// calculate count to get SOURCE aligned
+	ands	x3, x3, #0x7F
+	b.eq	kryo_bb_source_aligned	// already aligned
+	// alignment fixup, small to large (favorable alignment)
+	tbz	x3, #0, 1f
+	ldrb	w5, [x1], #1
+	strb	w5, [x0], #1
+1:	tbz	x3, #1, 2f
+	ldrh	w6, [x1], #2
+	strh	w6, [x0], #2
+2:	tbz	x3, #2, 3f
+	ldr	w8, [x1], #4
+	str	w8, [x0], #4
+3:	tbz	x3, #3, 4f
+	ldr	x9, [x1], #8
+	str	x9, [x0], #8
+4:	tbz	x3, #4, 5f
+	ldr	q7, [x1], #16
+	str	q7, [x0], #16
+5:	tbz	x3, #5, 55f
+	ldp	q0, q1, [x1], #32
+	stp	q0, q1, [x0], #32
+55:	tbz	x3, #6, 6f
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+6:	subs	x2, x2, x3		// fixup count after alignment
+	b.eq	kryo_bb_exit
+	cmp	x2, #128
+	blo	kryo_bb_copy_64_a
+kryo_bb_source_aligned:
+	lsr	x12, x2, #7
+	cmp	x12, #PLDTHRESH
+	bls	kryo_bb_copy_128_loop_nopld
+
+	cmp	x12, #BBTHRESH
+	bls	kryo_bb_prime_pump
+
+	add	x14, x0, #0x400
+	add	x9,  x1, #(PLDOFFS*PLDSIZE)
+	sub	x14, x14, x9
+	lsl	x14, x14, #(21+32)
+	lsr	x14, x14, #(21+32)
+	add	x14, x14, #(PLDOFFS*PLDSIZE)
+	cmp	x12, x14, lsr #7
+	bls	kryo_bb_prime_pump
+
+	mov	x9, #(PLDOFFS)
+	lsr     x13, x14, #7
+	subs    x9, x13, x9
+	bls	kryo_bb_prime_pump
+
+	add	x10, x1, x14
+	bic	x10, x10, #0x7F		// Round to multiple of PLDSIZE
+
+	sub	x12, x12, x14, lsr #7
+	cmp	x9, x12
+	sub     x13, x12, x9
+	csel    x12, x13, x12, LS
+	csel    x9, x12, x9, HI
+	csel    x12, xzr, x12, HI
+
+	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
+	prfm	PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
+kryo_bb_copy_128_loop_outer_doublepld:
+	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
+	prfm	PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
+	subs	x9, x9, #1
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	prfm	PLDL1KEEP, [x10]
+	prfm	PLDL1KEEP, [x10, #64]
+	add	x10, x10, #128
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_outer_doublepld
+	cmp	x12, #0
+	beq	kryo_bb_pop_before_nopld
+	cmp	x12, #(448*1024/128)
+	bls	kryo_bb_copy_128_loop_outer
+
+kryo_bb_copy_128_loop_ddr:
+	subs	x12, x12, #1
+	ldr	x3, [x10], #128
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_ddr
+	b	kryo_bb_pop_before_nopld
+
+kryo_bb_prime_pump:
+	mov	x14, #(PLDOFFS*PLDSIZE)
+	add	x10, x1, #(PLDOFFS*PLDSIZE)
+	bic	x10, x10, #0x7F
+	sub	x12, x12, #PLDOFFS
+	prfm	PLDL1KEEP, [x10, #(-1*PLDSIZE)]
+	prfm	PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
+	cmp	x12, #(448*1024/128)
+	bhi	kryo_bb_copy_128_loop_ddr
+
+kryo_bb_copy_128_loop_outer:
+	subs	x12, x12, #1
+	prfm	PLDL1KEEP, [x10]
+	prfm	PLDL1KEEP, [x10, #64]
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	add	x10, x10, #128
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_outer
+
+kryo_bb_pop_before_nopld:
+	lsr	x12, x14, #7
+kryo_bb_copy_128_loop_nopld:
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	ldp	q4, q5, [x1], #32
+	ldp	q6, q7, [x1], #32
+	subs	x12, x12, #1
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+	stp	q4, q5, [x0], #32
+	stp	q6, q7, [x0], #32
+	bne	kryo_bb_copy_128_loop_nopld
+	ands	x2, x2, #0x7f
+	beq	kryo_bb_exit
+
+kryo_bb_copy_64_a:
+	tbz	x2, #6, kryo_bb_copy_32_a
+	ldp	q0, q1, [x1], #32
+	ldp	q2, q3, [x1], #32
+	stp	q0, q1, [x0], #32
+	stp	q2, q3, [x0], #32
+kryo_bb_copy_32_a:
+	tbz	x2, #5, kryo_bb_16
+	ldp	q0, q1, [x1], #32
+	stp	q0, q1, [x0], #32
+kryo_bb_16:
+	tbz	x2, #4, kryo_bb_lt16
+	ldr	q7, [x1], #16
+	str	q7, [x0], #16
+	ands	x2, x2, #0x0f
+	beq	kryo_bb_exit
+kryo_bb_lt16:
+	tbz	x2, #3, kryo_bb_lt8
+	ldr	x3, [x1], #8
+	str	x3, [x0], #8
+kryo_bb_lt8:
+	tbz	x2, #2, kryo_bb_lt4
+	ldr	w3, [x1], #4
+	str	w3, [x0], #4
+kryo_bb_lt4:
+	tbz	x2, #1, kryo_bb_lt2
+	ldrh	w3, [x1], #2
+	strh	w3, [x0], #2
+kryo_bb_lt2:
+	tbz	x2, #0, kryo_bb_exit
+	ldrb	w3, [x1], #1
+	strb	w3, [x0], #1
+kryo_bb_exit:
+	mov	x0, x11
+	ret
+
diff --git a/libc/arch-arm64/kryo/kryo.mk b/libc/arch-arm64/kryo/kryo.mk
new file mode 100644
index 0000000..1d901d0
--- /dev/null
+++ b/libc/arch-arm64/kryo/kryo.mk
@@ -0,0 +1,15 @@
+libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
+    arch-arm64/generic/bionic/memcmp.S \
+    arch-arm64/kryo/bionic/memcpy.S \
+    arch-arm64/generic/bionic/memmove.S \
+    arch-arm64/generic/bionic/memset.S \
+    arch-arm64/generic/bionic/stpcpy.S \
+    arch-arm64/generic/bionic/strchr.S \
+    arch-arm64/generic/bionic/strcmp.S \
+    arch-arm64/generic/bionic/strcpy.S \
+    arch-arm64/generic/bionic/strlen.S \
+    arch-arm64/generic/bionic/strncmp.S \
+    arch-arm64/generic/bionic/strnlen.S \
+    arch-arm64/generic/bionic/strrchr.S \
+    arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 0dbffad..6a5afd6 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -91,9 +91,6 @@ name:		\
 	.section .text.sse2,"ax",@progbits
 ENTRY (MEMMOVE)
 	ENTRANCE
-#ifdef USE_AS_BCOPY
-	xchg	%rsi, %rdi
-#endif
 	mov	%rdi, %rax
 
 /* Check whether we should copy backward or forward.  */
diff --git a/libc/bionic/malloc_debug_check.cpp b/libc/bionic/malloc_debug_check.cpp
index dee03fa..ad0e613 100644
--- a/libc/bionic/malloc_debug_check.cpp
+++ b/libc/bionic/malloc_debug_check.cpp
@@ -45,6 +45,7 @@
 #include <time.h>
 #include <unistd.h>
 #include <unwind.h>
+#include <signal.h>
 
 #include "debug_mapinfo.h"
 #include "debug_stacktrace.h"
@@ -55,6 +56,14 @@
 #include "private/libc_logging.h"
 #include "private/ScopedPthreadMutexLocker.h"
 
+static unsigned int malloc_sig_enabled = 0;
+static unsigned int min_allocation_report_limit;
+static unsigned int max_allocation_limit;
+static const char* process_name;
+static size_t total_count = 0;
+static bool isDumped = false;
+static bool sigHandled = false;
+
 #define MAX_BACKTRACE_DEPTH 16
 #define ALLOCATION_TAG      0x1ee7d00d
 #define BACKLOG_TAG         0xbabecafe
@@ -63,6 +72,11 @@
 #define FRONT_GUARD_LEN     (1<<5)
 #define REAR_GUARD          0xbb
 #define REAR_GUARD_LEN      (1<<5)
+#define FRONT_GUARD_SS      0xab
+#define DEBUG_SIGNAL SIGWINCH
+
+static void malloc_sigaction(int signum, siginfo_t * sg, void * cxt);
+static struct sigaction default_sa;
 
 static void log_message(const char* format, ...) {
   va_list args;
@@ -135,9 +149,14 @@ static inline void init_front_guard(hdr_t* hdr) {
     memset(hdr->front_guard, FRONT_GUARD, FRONT_GUARD_LEN);
 }
 
+static inline void set_snapshot(hdr_t* hdr) {
+    memset(hdr->front_guard, FRONT_GUARD_SS, FRONT_GUARD_LEN);
+}
+
 static inline bool is_front_guard_valid(hdr_t* hdr) {
     for (size_t i = 0; i < FRONT_GUARD_LEN; i++) {
-        if (hdr->front_guard[i] != FRONT_GUARD) {
+        if (!((hdr->front_guard[i] == FRONT_GUARD) ||
+                    (hdr->front_guard[i] == FRONT_GUARD_SS))) {
             return false;
         }
     }
@@ -171,6 +190,9 @@ static inline bool is_rear_guard_valid(hdr_t* hdr) {
 }
 
 static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+    if (hdr->tag == ALLOCATION_TAG) {
+        total_count += hdr->size;
+    }
     hdr->prev = NULL;
     hdr->next = *head;
     if (*head)
@@ -181,6 +203,9 @@ static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
 }
 
 static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+    if (hdr->tag == ALLOCATION_TAG) {
+        total_count -= hdr->size;
+    }
     if (hdr->prev) {
         hdr->prev->next = hdr->next;
     } else {
@@ -194,6 +219,25 @@ static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
     return 0;
 }
 
+static void snapshot_report_leaked_nodes() {
+    log_message("%s: %s\n", __FILE__, __FUNCTION__);
+    hdr_t * iterator = head;
+    size_t total_size = 0;
+    do {
+        if (iterator->front_guard[0] == FRONT_GUARD &&
+                iterator->size >= min_allocation_report_limit) {
+            log_message("obj %p, size %d", iterator, iterator->size);
+            total_size += iterator->size;
+            log_backtrace(iterator->bt, iterator->bt_depth);
+            log_message("------------------------------"); // as an end marker
+            // Marking the node as we do not want to print it again.
+            set_snapshot(iterator);
+        }
+        iterator = iterator->next;
+    } while (iterator);
+    log_message("Total Pending allocations after last snapshot: %d", total_size);
+}
+
 static inline void add(hdr_t* hdr, size_t size) {
     ScopedPthreadMutexLocker locker(&lock);
     hdr->tag = ALLOCATION_TAG;
@@ -202,6 +246,11 @@ static inline void add(hdr_t* hdr, size_t size) {
     init_rear_guard(hdr);
     ++g_allocated_block_count;
     add_locked(hdr, &tail, &head);
+    if ((total_count >= max_allocation_limit) && !isDumped && malloc_sig_enabled) {
+        isDumped = true;
+        sigHandled = true; // Need to bypass the snapshot
+        kill(getpid(), DEBUG_SIGNAL);
+    }
 }
 
 static inline int del(hdr_t* hdr) {
@@ -233,7 +282,8 @@ static bool was_used_after_free(hdr_t* hdr) {
 static inline int check_guards(hdr_t* hdr, int* safe) {
     *safe = 1;
     if (!is_front_guard_valid(hdr)) {
-        if (hdr->front_guard[0] == FRONT_GUARD) {
+        if ((hdr->front_guard[0] == FRONT_GUARD) ||
+                ((hdr->front_guard[0] == FRONT_GUARD_SS))) {
             log_message("+++ ALLOCATION %p SIZE %d HAS A CORRUPTED FRONT GUARD\n",
                        user(hdr), hdr->size);
         } else {
@@ -656,6 +706,42 @@ extern "C" bool malloc_debug_initialize(HashTable* hash_table, const MallocDebug
     __libc_format_log(ANDROID_LOG_INFO, "libc", "not gathering backtrace information\n");
   }
 
+  if (__system_property_get("libc.debug.malloc", env)) {
+    if(atoi(env) == 40) malloc_sig_enabled = 1;
+  }
+
+  if (malloc_sig_enabled) {
+    char debug_proc_size[PROP_VALUE_MAX];
+    if (__system_property_get("libc.debug.malloc.maxprocsize", debug_proc_size))
+      max_allocation_limit = atoi(debug_proc_size);
+    else
+      max_allocation_limit = 30 * 1024 * 1024; // In Bytes [Default is 30 MB]
+    if (__system_property_get("libc.debug.malloc.minalloclim", debug_proc_size))
+      min_allocation_report_limit = atoi(debug_proc_size);
+    else
+      min_allocation_report_limit = 10 * 1024; // In Bytes [Default is 10 KB]
+    process_name = getprogname();
+  }
+
+/* Initializes malloc debugging framework.
+ * See comments on MallocDebugInit in malloc_debug_common.h
+ */
+  if (malloc_sig_enabled) {
+    struct sigaction sa; //local or static?
+    sa.sa_handler = NULL;
+    sa.sa_sigaction = malloc_sigaction;
+    sigemptyset(&sa.sa_mask);
+    sigaddset(&sa.sa_mask, DEBUG_SIGNAL);
+    sa.sa_flags = SA_SIGINFO;
+    sa.sa_restorer = NULL;
+    if (sigaction(DEBUG_SIGNAL, &sa, &default_sa) < 0) {
+      log_message("Failed to register signal handler w/ errno %s", strerror(errno));
+      malloc_sig_enabled = 0;
+    } else {
+      log_message("Registered signal handler");
+      sigHandled = false;
+    }
+  }
   if (g_backtrace_enabled) {
     backtrace_startup();
   }
@@ -668,9 +754,66 @@ extern "C" void malloc_debug_finalize(int malloc_debug_level) {
   if (malloc_debug_level == 10) {
     ReportMemoryLeaks();
   }
+  if (malloc_sig_enabled) {
+    log_message("Deregister %d signal handler", DEBUG_SIGNAL);
+    sigaction(DEBUG_SIGNAL, &default_sa, NULL);
+    malloc_sig_enabled = 0;
+    sigHandled = false;
+  }
   if (g_backtrace_enabled) {
     backtrace_shutdown();
   }
 
   pthread_setspecific(g_debug_calls_disabled, NULL);
 }
+
+static void snapshot_nodes_locked() {
+  log_message("%s: %s\n", __FILE__, __FUNCTION__);
+  hdr_t * iterator = head;
+  do {
+    if (iterator->front_guard[0] == FRONT_GUARD) {
+      set_snapshot(iterator);
+    }
+    iterator = iterator->next;
+  } while (iterator);
+}
+
+static void malloc_sigaction(int signum, siginfo_t * info, void * context)
+{
+  log_message("%s: %s\n", __FILE__, __FUNCTION__);
+  log_message("%s got %d signal from PID: %d (context:%x)\n",
+          __func__, signum, info->si_pid, context);
+
+  if (signum != DEBUG_SIGNAL) {
+    log_message("RECEIVED %d instead of %d\n", signum, DEBUG_SIGNAL);
+    return;
+  }
+
+  log_message("Process under observation:%s", process_name);
+  log_message("Maximum process size limit:%d Bytes", max_allocation_limit);
+  log_message("Won't print allocation below %d Bytes", min_allocation_report_limit);
+  log_message("Total count: %d\n", total_count);
+
+  if (!head) {
+    log_message("No allocations?");
+    return;
+  }
+  // If sigHandled is false, meaning it's being handled first time
+  if (!sigHandled) {
+    sigHandled = true;
+    // Marking the nodes assuming that they should not be leaked nodes.
+    snapshot_nodes_locked();
+  } else {
+    // We need to print new allocations now
+    log_message("Start dumping allocations of the process %s", process_name);
+    log_message("+++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ ***\n");
+
+    // Print allocations of the process
+    if (g_backtrace_enabled)
+        snapshot_report_leaked_nodes();
+
+    log_message("*** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++\n");
+    log_message("Completed dumping allocations of the process %s", process_name);
+  }
+  return;
+}
diff --git a/libc/bionic/malloc_debug_common.cpp b/libc/bionic/malloc_debug_common.cpp
index ee796c6..12fc6dd 100644
--- a/libc/bionic/malloc_debug_common.cpp
+++ b/libc/bionic/malloc_debug_common.cpp
@@ -396,6 +396,9 @@ static void malloc_init_impl() {
       }
       so_name = "libc_malloc_debug_qemu.so";
       break;
+    case 40:
+      so_name = "libc_malloc_debug_leak.so";
+      break;
     default:
       error_log("%s: Debug level %d is unknown\n", getprogname(), g_malloc_debug_level);
       return;
@@ -456,6 +459,9 @@ static void malloc_init_impl() {
     case 20:
       InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "qemu_instrumented");
       break;
+    case 40:
+      InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "chk");
+      break;
     default:
       break;
   }
diff --git a/libc/bionic/mmap.cpp b/libc/bionic/mmap.cpp
index 8f25a89..53e8b46 100644
--- a/libc/bionic/mmap.cpp
+++ b/libc/bionic/mmap.cpp
@@ -36,6 +36,11 @@
 extern "C" void*  __mmap2(void*, size_t, int, int, int, size_t);
 
 #define MMAP2_SHIFT 12 // 2**12 == 4096
+#ifdef LEGACY_MMAP
+#define TO_64(a) ((a) & 0x00000000ffffffff)
+#else
+#define TO_64(a) (a)
+#endif
 
 static bool kernel_has_MADV_MERGEABLE = true;
 
@@ -60,5 +65,5 @@ void* mmap64(void* addr, size_t size, int prot, int flags, int fd, off64_t offse
 }
 
 void* mmap(void* addr, size_t size, int prot, int flags, int fd, off_t offset) {
-  return mmap64(addr, size, prot, flags, fd, static_cast<off64_t>(offset));
+  return mmap64(addr, size, prot, flags, fd, TO_64(static_cast<off64_t>(offset)));
 }
diff --git a/libc/include/paths.h b/libc/include/paths.h
index 82c2804..7700cdd 100644
--- a/libc/include/paths.h
+++ b/libc/include/paths.h
@@ -33,6 +33,7 @@
 #define	_PATHS_H_
 
 #define	_PATH_BSHELL	"/system/bin/sh"
+#define	_PATH_BSHELL2	"/sbin/sh"
 #define	_PATH_CONSOLE	"/dev/console"
 #define	_PATH_DEFPATH	"/sbin:/vendor/bin:/system/sbin:/system/bin:/system/xbin"
 #define	_PATH_DEV	"/dev/"
diff --git a/libc/include/regex.h b/libc/include/regex.h
index aec38e3..b06a515 100644
--- a/libc/include/regex.h
+++ b/libc/include/regex.h
@@ -42,8 +42,9 @@
 #include <sys/cdefs.h>
 #include <sys/types.h>
 
-/* types */
-typedef off_t regoff_t;
+/* POSIX says regoff_t is at least as large as the larger of ptrdiff_t and
+ * ssize_t. BSD uses off_t, but that interacts badly with _FILE_OFFSET_BITS. */
+typedef ssize_t regoff_t;
 
 typedef struct {
 	int re_magic;
diff --git a/libc/kernel/uapi/linux/android_alarm.h b/libc/kernel/uapi/linux/android_alarm.h
index 801a01e..9f2de28 100644
--- a/libc/kernel/uapi/linux/android_alarm.h
+++ b/libc/kernel/uapi/linux/android_alarm.h
@@ -28,28 +28,31 @@ enum android_alarm_type {
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
   ANDROID_ALARM_ELAPSED_REALTIME,
   ANDROID_ALARM_SYSTEMTIME,
+  ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
   ANDROID_ALARM_TYPE_COUNT,
-};
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+};
 enum android_alarm_return_flags {
   ANDROID_ALARM_RTC_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_WAKEUP,
   ANDROID_ALARM_RTC_MASK = 1U << ANDROID_ALARM_RTC,
-  ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+  ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
   ANDROID_ALARM_ELAPSED_REALTIME_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME,
   ANDROID_ALARM_SYSTEMTIME_MASK = 1U << ANDROID_ALARM_SYSTEMTIME,
+  ANDROID_ALARM_RTC_POWEROFF_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
   ANDROID_ALARM_TIME_CHANGE_MASK = 1U << 16
 };
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_CLEAR(type) _IO('a', 0 | ((type) << 4))
 #define ANDROID_ALARM_WAIT _IO('a', 1)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ALARM_IOW(c,type,size) _IOW('a', (c) | ((type) << 4), size)
 #define ANDROID_ALARM_SET(type) ALARM_IOW(2, type, struct timespec)
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_SET_AND_WAIT(type) ALARM_IOW(3, type, struct timespec)
 #define ANDROID_ALARM_GET_TIME(type) ALARM_IOW(4, type, struct timespec)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_SET_RTC _IOW('a', 5, struct timespec)
 #define ANDROID_ALARM_BASE_CMD(cmd) (cmd & ~(_IOC(0, 0, 0xf0, 0)))
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define ANDROID_ALARM_IOCTL_TO_TYPE(cmd) (_IOC_NR(cmd) >> 4)
 #endif
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
diff --git a/libc/kernel/uapi/linux/time.h b/libc/kernel/uapi/linux/time.h
index bf245fc..5690d27 100644
--- a/libc/kernel/uapi/linux/time.h
+++ b/libc/kernel/uapi/linux/time.h
@@ -67,9 +67,10 @@ struct itimerval {
 #define CLOCK_SGI_CYCLE 10
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
 #define CLOCK_TAI 11
+#define CLOCK_POWEROFF_ALARM 12
 #define MAX_CLOCKS 16
 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC)
-#define CLOCKS_MONO CLOCK_MONOTONIC
 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+#define CLOCKS_MONO CLOCK_MONOTONIC
 #define TIMER_ABSTIME 0x01
 #endif
diff --git a/libc/upstream-netbsd/lib/libc/gen/popen.c b/libc/upstream-netbsd/lib/libc/gen/popen.c
index 593e346..b6ce47c 100644
--- a/libc/upstream-netbsd/lib/libc/gen/popen.c
+++ b/libc/upstream-netbsd/lib/libc/gen/popen.c
@@ -152,6 +152,8 @@ popen(const char *command, const char *type)
 		}
 
 		execl(_PATH_BSHELL, "sh", "-c", command, NULL);
+		if (errno == ENOENT)
+			execl(_PATH_BSHELL2, "sh", "-c", command, NULL);
 		_exit(127);
 		/* NOTREACHED */
 	}