 benchmarks/math_benchmark.cpp | 150
 libc/Android.mk | 7
 libc/arch-arm/arm.mk | 3
 libc/arch-arm/cortex-a15/bionic/__strcat_chk.S | 192
 libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S | 212
 libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S | 156
 libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S | 173
 libc/arch-arm/cortex-a15/bionic/memcpy.S | 79
 libc/arch-arm/cortex-a15/bionic/memcpy_base.S | 42
 libc/arch-arm/cortex-a15/bionic/memcpy_common.S | 103
 libc/arch-arm/cortex-a15/bionic/strcat.S | 311
 libc/arch-arm/cortex-a15/bionic/string_copy.S | 21
 libc/arch-arm/cortex-a15/bionic/strlen.S | 60
 libc/arch-arm/cortex-a15/cortex-a15.mk | 1
 libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk | 22
 libc/arch-arm/cortex-a53/bionic/__strcat_chk.S | 32
 libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S | 32
 libc/arch-arm/cortex-a53/bionic/memcpy.S | 32
 libc/arch-arm/cortex-a53/bionic/memcpy_base.S | 143
 libc/arch-arm/cortex-a53/cortex-a53.mk | 22
 libc/arch-arm/cortex-a7/bionic/memset.S | 180
 libc/arch-arm/cortex-a7/cortex-a7.mk | 20
 libc/arch-arm/cortex-a9/bionic/memcpy_base.S | 3
 libc/arch-arm/cortex-a9/bionic/memset.S | 33
 libc/arch-arm/cortex-a9/bionic/strcat.S | 261
 libc/arch-arm/cortex-a9/bionic/string_copy.S | 21
 libc/arch-arm/cortex-a9/cortex-a9.mk | 1
 libc/arch-arm/denver/denver.mk | 1
 libc/arch-arm/generic/bionic/memchr.S | 155
 libc/arch-arm/generic/bionic/memcmp.S | 3
 libc/arch-arm/generic/bionic/memcpy.S | 6
 libc/arch-arm/generic/bionic/memset.S | 6
 libc/arch-arm/generic/generic.mk | 1
 libc/arch-arm/krait/bionic/__strcat_chk.S | 19
 libc/arch-arm/krait/bionic/__strcpy_chk.S | 15
 libc/arch-arm/krait/bionic/memcpy.S | 17
 libc/arch-arm/krait/bionic/memcpy_base.S | 314
 libc/arch-arm/krait/bionic/memmove.S | 219
 libc/arch-arm/krait/bionic/memset.S | 20
 libc/arch-arm/krait/krait.mk | 16
 libc/arch-arm/scorpion/scorpion.mk | 18
 libc/arch-arm64/arm64.mk | 1
 libc/arch-arm64/denver64/denver64.mk | 1
 libc/arch-arm64/generic/bionic/memcpy_base.S | 312
 libc/arch-arm64/generic/bionic/memmove.S | 404
 libc/arch-arm64/generic/bionic/memset.S | 395
 libc/arch-arm64/generic/bionic/strlen.S | 243
 libc/arch-arm64/generic/bionic/strrchr.S | 171
 libc/arch-arm64/generic/generic.mk | 1
 libc/arch-arm64/kryo/bionic/memcpy.S | 65
 libc/arch-arm64/kryo/bionic/memcpy_base.S | 244
 libc/arch-arm64/kryo/kryo.mk | 15
 libc/arch-x86_64/string/sse2-memmove-slm.S | 3
 libc/bionic/malloc_debug_check.cpp | 147
 libc/bionic/malloc_debug_common.cpp | 6
 libc/bionic/mmap.cpp | 7
 libc/include/paths.h | 1
 libc/include/regex.h | 5
 libc/kernel/uapi/linux/android_alarm.h | 13
 libc/kernel/uapi/linux/time.h | 3
 libc/upstream-netbsd/lib/libc/gen/popen.c | 2
 libm/Android.mk | 14
 libm/arm/e_sqrtf.S | 39
 libm/arm/floor.S (renamed from libm/arm/s_floor.S) | 0
 libm/arm/sqrt.S (renamed from libm/arm/e_sqrt.S) | 7
 libm/arm64/fenv.c | 19
 libm/fabs.cpp | 46
 libm/fake_long_double.c | 1
 libm/include/math.h | 151
 libm/upstream-freebsd/lib/msun/ld128/k_expl.h | 4
 libm/upstream-freebsd/lib/msun/src/catrig.c | 66
 libm/upstream-freebsd/lib/msun/src/catrigf.c | 66
 libm/upstream-freebsd/lib/msun/src/e_j0.c | 32
 libm/upstream-freebsd/lib/msun/src/e_j0f.c | 47
 libm/upstream-freebsd/lib/msun/src/e_j1.c | 30
 libm/upstream-freebsd/lib/msun/src/e_j1f.c | 45
 libm/upstream-freebsd/lib/msun/src/e_jn.c | 12
 libm/upstream-freebsd/lib/msun/src/e_jnf.c | 13
 libm/upstream-freebsd/lib/msun/src/k_exp.c | 4
 libm/upstream-freebsd/lib/msun/src/k_expf.c | 4
 libm/upstream-freebsd/lib/msun/src/math_private.h | 20
 libm/upstream-freebsd/lib/msun/src/s_ccosh.c | 61
 libm/upstream-freebsd/lib/msun/src/s_ccoshf.c | 45
 libm/upstream-freebsd/lib/msun/src/s_cexp.c | 14
 libm/upstream-freebsd/lib/msun/src/s_cexpf.c | 14
 libm/upstream-freebsd/lib/msun/src/s_conj.c | 4
 libm/upstream-freebsd/lib/msun/src/s_conjf.c | 4
 libm/upstream-freebsd/lib/msun/src/s_conjl.c | 4
 libm/upstream-freebsd/lib/msun/src/s_cproj.c | 4
 libm/upstream-freebsd/lib/msun/src/s_cprojf.c | 4
 libm/upstream-freebsd/lib/msun/src/s_cprojl.c | 4
 libm/upstream-freebsd/lib/msun/src/s_csinh.c | 75
 libm/upstream-freebsd/lib/msun/src/s_csinhf.c | 47
 libm/upstream-freebsd/lib/msun/src/s_csqrt.c | 16
 libm/upstream-freebsd/lib/msun/src/s_csqrtf.c | 16
 libm/upstream-freebsd/lib/msun/src/s_csqrtl.c | 16
 libm/upstream-freebsd/lib/msun/src/s_ctanh.c | 45
 libm/upstream-freebsd/lib/msun/src/s_ctanhf.c | 19
 libm/upstream-freebsd/lib/msun/src/s_exp2.c | 6
 libm/upstream-freebsd/lib/msun/src/s_fabs.c | 31
 libm/upstream-freebsd/lib/msun/src/s_fabsf.c | 33
 libm/upstream-freebsd/lib/msun/src/s_scalbln.c | 42
 linker/linker.cpp | 75
 tests/buffer_tests.cpp | 16
 tests/regex_test.cpp | 10
 tests/string_test.cpp | 20
 106 files changed, 4126 insertions(+), 2280 deletions(-)
diff --git a/benchmarks/math_benchmark.cpp b/benchmarks/math_benchmark.cpp
index 4de28d1..ed5b56c 100644
--- a/benchmarks/math_benchmark.cpp
+++ b/benchmarks/math_benchmark.cpp
@@ -65,6 +65,50 @@ void BM_math_logb::Run(int iters) {
StopBenchmarkTiming();
}
+BENCHMARK_WITH_ARG(BM_math_isfinite_macro, double)->AT_COMMON_VALS;
+void BM_math_isfinite_macro::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += isfinite(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+#if defined(__BIONIC__)
+#define test_isfinite __isfinite
+#else
+#define test_isfinite __finite
+#endif
+BENCHMARK_WITH_ARG(BM_math_isfinite, double)->AT_COMMON_VALS;
+void BM_math_isfinite::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += test_isfinite(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+BENCHMARK_WITH_ARG(BM_math_isinf_macro, double)->AT_COMMON_VALS;
+void BM_math_isinf_macro::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += isinf(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
BENCHMARK_WITH_ARG(BM_math_isinf, double)->AT_COMMON_VALS;
void BM_math_isinf::Run(int iters, double value) {
StartBenchmarkTiming();
@@ -78,6 +122,60 @@ void BM_math_isinf::Run(int iters, double value) {
StopBenchmarkTiming();
}
+BENCHMARK_WITH_ARG(BM_math_isnan_macro, double)->AT_COMMON_VALS;
+void BM_math_isnan_macro::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += isnan(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+BENCHMARK_WITH_ARG(BM_math_isnan, double)->AT_COMMON_VALS;
+void BM_math_isnan::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += (isnan)(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+BENCHMARK_WITH_ARG(BM_math_isnormal_macro, double)->AT_COMMON_VALS;
+void BM_math_isnormal_macro::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += isnormal(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+#if defined(__BIONIC__)
+BENCHMARK_WITH_ARG(BM_math_isnormal, double)->AT_COMMON_VALS;
+void BM_math_isnormal::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += (__isnormal)(v);
+ }
+
+ StopBenchmarkTiming();
+}
+#endif
+
BENCHMARK_NO_ARG(BM_math_sin_fast);
void BM_math_sin_fast::Run(int iters) {
StartBenchmarkTiming();
@@ -134,3 +232,55 @@ void BM_math_fpclassify::Run(int iters, double value) {
StopBenchmarkTiming();
}
+
+BENCHMARK_WITH_ARG(BM_math_signbit_macro, double)->AT_COMMON_VALS;
+void BM_math_signbit_macro::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += signbit(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+BENCHMARK_WITH_ARG(BM_math_signbit, double)->AT_COMMON_VALS;
+void BM_math_signbit::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += (__signbit)(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+BENCHMARK_WITH_ARG(BM_math_fabs_macro, double)->AT_COMMON_VALS;
+void BM_math_fabs_macro::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += fabs(v);
+ }
+
+ StopBenchmarkTiming();
+}
+
+BENCHMARK_WITH_ARG(BM_math_fabs, double)->AT_COMMON_VALS;
+void BM_math_fabs::Run(int iters, double value) {
+ StartBenchmarkTiming();
+
+ d = 0.0;
+ v = value;
+ for (int i = 0; i < iters; ++i) {
+ d += (fabs)(v);
+ }
+
+ StopBenchmarkTiming();
+}
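
Each benchmark pair above times the <math.h> classification macro against an
out-of-line libm function, reached either by parenthesizing the name or through
a double-underscore alias such as __isfinite. The "(isnan)(v)" spelling works
because the preprocessor only expands a function-like macro when the name is
immediately followed by "(", so the parenthesized name resolves to the real
function. A minimal C illustration of the distinction (not part of this change):

    #include <math.h>

    double v = 0.0;
    int a = isnan(v);    // <math.h> macro: typically expands to inline bit tests
    int b = (isnan)(v);  // macro expansion suppressed: calls libm's isnan()
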
diff --git a/libc/Android.mk b/libc/Android.mk
index f0c5e9f..f7f2adc 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -611,6 +611,10 @@ ifneq ($(BOARD_MALLOC_ALIGNMENT),)
libc_common_cflags += -DMALLOC_ALIGNMENT=$(BOARD_MALLOC_ALIGNMENT)
endif
+ifeq ($(BOARD_USES_LEGACY_MMAP),true)
+ libc_common_cflags += -DLEGACY_MMAP
+endif
+
# Define some common conlyflags
libc_common_conlyflags := \
-std=gnu99
@@ -1394,6 +1398,9 @@ LOCAL_SRC_FILES_arm += \
LOCAL_ADDRESS_SANITIZER := false
LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
+# Allow devices to provide additional symbols
+LOCAL_WHOLE_STATIC_LIBRARIES += $(BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS)
+
include $(BUILD_SHARED_LIBRARY)
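
Both knobs are read from the device's board configuration; a typical
BoardConfig.mk fragment looks like the following sketch (the library name is
hypothetical; devices supply their own):

    BOARD_USES_LEGACY_MMAP := true
    BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS := libdevice_bionic_extras
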
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index d72a160..c2b80c5 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -20,7 +20,6 @@ libc_freebsd_src_files_arm += \
upstream-freebsd/lib/libc/string/wmemmove.c \
libc_openbsd_src_files_arm += \
- upstream-openbsd/lib/libc/string/memchr.c \
upstream-openbsd/lib/libc/string/memrchr.c \
upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strlcat.c \
@@ -52,7 +51,7 @@ ifeq ($(strip $(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)),)
endif
cpu_variant_mk := $(LOCAL_PATH)/arch-arm/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT).mk
ifeq ($(wildcard $(cpu_variant_mk)),)
-$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
+$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, scorpion, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
endif
include $(cpu_variant_mk)
libc_common_additional_dependencies += $(cpu_variant_mk)
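
The variant itself also comes from the board configuration: the build expands
TARGET_CPU_VARIANT into the path checked above, so opting into the new scorpion
routines is a one-line change (illustrative):

    TARGET_CPU_VARIANT := scorpion   # must match an arch-arm/<variant>/<variant>.mk
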
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
index a2e9c22..3692f04 100644
--- a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,191 +26,7 @@
* SUCH DAMAGE.
*/
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
- .syntax unified
-
- .thumb
- .thumb_func
-
-// Get the length of src string, then get the source of the dst string.
-// Check that the two lengths together don't exceed the threshold, then
-// do a memcpy of the data.
-ENTRY(__strcat_chk)
- pld [r0, #0]
- push {r0, lr}
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
- push {r4, r5}
- .cfi_adjust_cfa_offset 8
- .cfi_rel_offset r4, 0
- .cfi_rel_offset r5, 4
-
- mov lr, r2
-
- // Save the dst register to r5
- mov r5, r0
-
- // Zero out r4
- eor r4, r4, r4
-
- // r1 contains the address of the string to count.
-.L_strlen_start:
- mov r0, r1
- ands r3, r1, #7
- beq .L_mainloop
-
- // Align to a double word (64 bits).
- rsb r3, r3, #8
- lsls ip, r3, #31
- beq .L_align_to_32
-
- ldrb r2, [r1], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_32:
- bcc .L_align_to_64
- ands ip, r3, #2
- beq .L_align_to_64
-
- ldrb r2, [r1], #1
- cbz r2, .L_update_count_and_finish
- ldrb r2, [r1], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_64:
- tst r3, #4
- beq .L_mainloop
- ldr r3, [r1], #4
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
-
- .p2align 2
-.L_mainloop:
- ldrd r2, r3, [r1], #8
-
- pld [r1, #64]
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .L_zero_in_first_register
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
- b .L_mainloop
-
-.L_update_count_and_finish:
- sub r3, r1, r0
- sub r3, r3, #1
- b .L_finish
-
-.L_zero_in_first_register:
- sub r3, r1, r0
- lsls r2, ip, #17
- bne .L_sub8_and_finish
- bcs .L_sub7_and_finish
- lsls ip, ip, #1
- bne .L_sub6_and_finish
-
- sub r3, r3, #5
- b .L_finish
-
-.L_sub8_and_finish:
- sub r3, r3, #8
- b .L_finish
-
-.L_sub7_and_finish:
- sub r3, r3, #7
- b .L_finish
-
-.L_sub6_and_finish:
- sub r3, r3, #6
- b .L_finish
-
-.L_zero_in_second_register:
- sub r3, r1, r0
- lsls r2, ip, #17
- bne .L_sub4_and_finish
- bcs .L_sub3_and_finish
- lsls ip, ip, #1
- bne .L_sub2_and_finish
-
- sub r3, r3, #1
- b .L_finish
-
-.L_sub4_and_finish:
- sub r3, r3, #4
- b .L_finish
-
-.L_sub3_and_finish:
- sub r3, r3, #3
- b .L_finish
-
-.L_sub2_and_finish:
- sub r3, r3, #2
-
-.L_finish:
- cmp r4, #0
- bne .L_strlen_done
-
- // Time to get the dst string length.
- mov r1, r5
-
- // Save the original source address to r5.
- mov r5, r0
-
- // Save the current length (adding 1 for the terminator).
- add r4, r3, #1
- b .L_strlen_start
-
- // r0 holds the pointer to the dst string.
- // r3 holds the dst string length.
- // r4 holds the src string length + 1.
-.L_strlen_done:
- add r2, r3, r4
- cmp r2, lr
- bhi __strcat_chk_failed
-
- // Set up the registers for the memcpy code.
- mov r1, r5
- pld [r1, #64]
- mov r2, r4
- add r0, r0, r3
- pop {r4, r5}
-END(__strcat_chk)
-
-#define MEMCPY_BASE __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
-
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__strcat_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
- .cfi_adjust_cfa_offset 8
- .cfi_rel_offset r4, 0
- .cfi_rel_offset r5, 4
-
- ldr r0, error_message
- ldr r1, error_code
-1:
- add r0, pc
- bl __fortify_chk_fail
-error_code:
- .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
-error_message:
- .word error_string-(1b+4)
-END(__strcat_chk_failed)
-
- .data
-error_string:
- .string "strcat: prevented write past end of buffer"
+#include "__strcat_chk_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S
new file mode 100644
index 0000000..de66967
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ .syntax unified
+
+ .thumb
+ .thumb_func
+
+// Get the length of src string, then get the source of the dst string.
+// Check that the two lengths together don't exceed the threshold, then
+// do a memcpy of the data.
+ENTRY(__strcat_chk)
+ pld [r0, #0]
+ push {r0, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+ push {r4, r5}
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r5, 4
+
+ mov lr, r2
+
+ // Save the dst register to r5
+ mov r5, r0
+
+ // Zero out r4
+ eor r4, r4, r4
+
+ // r1 contains the address of the string to count.
+.L_strlen_start:
+ mov r0, r1
+ ands r3, r1, #7
+ beq .L_mainloop
+
+ // Align to a double word (64 bits).
+ rsb r3, r3, #8
+ lsls ip, r3, #31
+ beq .L_align_to_32
+
+ ldrb r2, [r1], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_32:
+ bcc .L_align_to_64
+ ands ip, r3, #2
+ beq .L_align_to_64
+
+ ldrb r2, [r1], #1
+ cbz r2, .L_update_count_and_finish
+ ldrb r2, [r1], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_64:
+ tst r3, #4
+ beq .L_mainloop
+ ldr r3, [r1], #4
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+
+ .p2align 2
+.L_mainloop:
+ ldrd r2, r3, [r1], #8
+
+ pld [r1, #64]
+
+ sub ip, r2, #0x01010101
+ bic ip, ip, r2
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_first_register
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+ b .L_mainloop
+
+.L_update_count_and_finish:
+ sub r3, r1, r0
+ sub r3, r3, #1
+ b .L_finish
+
+.L_zero_in_first_register:
+ sub r3, r1, r0
+ lsls r2, ip, #17
+ bne .L_sub8_and_finish
+ bcs .L_sub7_and_finish
+ lsls ip, ip, #1
+ bne .L_sub6_and_finish
+
+ sub r3, r3, #5
+ b .L_finish
+
+.L_sub8_and_finish:
+ sub r3, r3, #8
+ b .L_finish
+
+.L_sub7_and_finish:
+ sub r3, r3, #7
+ b .L_finish
+
+.L_sub6_and_finish:
+ sub r3, r3, #6
+ b .L_finish
+
+.L_zero_in_second_register:
+ sub r3, r1, r0
+ lsls r2, ip, #17
+ bne .L_sub4_and_finish
+ bcs .L_sub3_and_finish
+ lsls ip, ip, #1
+ bne .L_sub2_and_finish
+
+ sub r3, r3, #1
+ b .L_finish
+
+.L_sub4_and_finish:
+ sub r3, r3, #4
+ b .L_finish
+
+.L_sub3_and_finish:
+ sub r3, r3, #3
+ b .L_finish
+
+.L_sub2_and_finish:
+ sub r3, r3, #2
+
+.L_finish:
+ cmp r4, #0
+ bne .L_strlen_done
+
+ // Time to get the dst string length.
+ mov r1, r5
+
+ // Save the original source address to r5.
+ mov r5, r0
+
+ // Save the current length (adding 1 for the terminator).
+ add r4, r3, #1
+ b .L_strlen_start
+
+ // r0 holds the pointer to the dst string.
+ // r3 holds the dst string length.
+ // r4 holds the src string length + 1.
+.L_strlen_done:
+ add r2, r3, r4
+ cmp r2, lr
+ bhi .L_strcat_chk_failed
+
+ // Set up the registers for the memcpy code.
+ mov r1, r5
+ pld [r1, #64]
+ mov r2, r4
+ add r0, r0, r3
+ pop {r4, r5}
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
+
+#include MEMCPY_BASE
+
+ // Undo the above cfi directives
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r5, 4
+.L_strcat_chk_failed:
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+4)
+END(__strcat_chk)
+
+ .data
+error_string:
+ .string "strcat: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
index db76686..d8cb3d9 100644
--- a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,155 +26,7 @@
* SUCH DAMAGE.
*/
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
- .syntax unified
-
- .thumb
- .thumb_func
-
-// Get the length of the source string first, then do a memcpy of the data
-// instead of a strcpy.
-ENTRY(__strcpy_chk)
- pld [r0, #0]
- push {r0, lr}
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
- mov lr, r2
- mov r0, r1
-
- ands r3, r1, #7
- beq .L_mainloop
-
- // Align to a double word (64 bits).
- rsb r3, r3, #8
- lsls ip, r3, #31
- beq .L_align_to_32
-
- ldrb r2, [r0], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_32:
- bcc .L_align_to_64
- ands ip, r3, #2
- beq .L_align_to_64
-
- ldrb r2, [r0], #1
- cbz r2, .L_update_count_and_finish
- ldrb r2, [r0], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_64:
- tst r3, #4
- beq .L_mainloop
- ldr r3, [r0], #4
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
-
- .p2align 2
-.L_mainloop:
- ldrd r2, r3, [r0], #8
-
- pld [r0, #64]
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .L_zero_in_first_register
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
- b .L_mainloop
-
-.L_update_count_and_finish:
- sub r3, r0, r1
- sub r3, r3, #1
- b .L_check_size
-
-.L_zero_in_first_register:
- sub r3, r0, r1
- lsls r2, ip, #17
- bne .L_sub8_and_finish
- bcs .L_sub7_and_finish
- lsls ip, ip, #1
- bne .L_sub6_and_finish
-
- sub r3, r3, #5
- b .L_check_size
-
-.L_sub8_and_finish:
- sub r3, r3, #8
- b .L_check_size
-
-.L_sub7_and_finish:
- sub r3, r3, #7
- b .L_check_size
-
-.L_sub6_and_finish:
- sub r3, r3, #6
- b .L_check_size
-
-.L_zero_in_second_register:
- sub r3, r0, r1
- lsls r2, ip, #17
- bne .L_sub4_and_finish
- bcs .L_sub3_and_finish
- lsls ip, ip, #1
- bne .L_sub2_and_finish
-
- sub r3, r3, #1
- b .L_check_size
-
-.L_sub4_and_finish:
- sub r3, r3, #4
- b .L_check_size
-
-.L_sub3_and_finish:
- sub r3, r3, #3
- b .L_check_size
-
-.L_sub2_and_finish:
- sub r3, r3, #2
-
-.L_check_size:
- pld [r1, #0]
- pld [r1, #64]
- ldr r0, [sp]
- cmp r3, lr
- bhs __strcpy_chk_failed
-
- // Add 1 for copy length to get the string terminator.
- add r2, r3, #1
-END(__strcpy_chk)
-
-#define MEMCPY_BASE __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__strcpy_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
- ldr r0, error_message
- ldr r1, error_code
-1:
- add r0, pc
- bl __fortify_chk_fail
-error_code:
- .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
-error_message:
- .word error_string-(1b+4)
-END(__strcpy_chk_failed)
-
- .data
-error_string:
- .string "strcpy: prevented write past end of buffer"
+#include "__strcpy_chk_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S
new file mode 100644
index 0000000..69ebcb4
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ .syntax unified
+
+ .thumb
+ .thumb_func
+
+// Get the length of the source string first, then do a memcpy of the data
+// instead of a strcpy.
+ENTRY(__strcpy_chk)
+ pld [r0, #0]
+ push {r0, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+
+ mov lr, r2
+ mov r0, r1
+
+ ands r3, r1, #7
+ beq .L_mainloop
+
+ // Align to a double word (64 bits).
+ rsb r3, r3, #8
+ lsls ip, r3, #31
+ beq .L_align_to_32
+
+ ldrb r2, [r0], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_32:
+ bcc .L_align_to_64
+ ands ip, r3, #2
+ beq .L_align_to_64
+
+ ldrb r2, [r0], #1
+ cbz r2, .L_update_count_and_finish
+ ldrb r2, [r0], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_64:
+ tst r3, #4
+ beq .L_mainloop
+ ldr r3, [r0], #4
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+
+ .p2align 2
+.L_mainloop:
+ ldrd r2, r3, [r0], #8
+
+ pld [r0, #64]
+
+ sub ip, r2, #0x01010101
+ bic ip, ip, r2
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_first_register
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+ b .L_mainloop
+
+.L_update_count_and_finish:
+ sub r3, r0, r1
+ sub r3, r3, #1
+ b .L_check_size
+
+.L_zero_in_first_register:
+ sub r3, r0, r1
+ lsls r2, ip, #17
+ bne .L_sub8_and_finish
+ bcs .L_sub7_and_finish
+ lsls ip, ip, #1
+ bne .L_sub6_and_finish
+
+ sub r3, r3, #5
+ b .L_check_size
+
+.L_sub8_and_finish:
+ sub r3, r3, #8
+ b .L_check_size
+
+.L_sub7_and_finish:
+ sub r3, r3, #7
+ b .L_check_size
+
+.L_sub6_and_finish:
+ sub r3, r3, #6
+ b .L_check_size
+
+.L_zero_in_second_register:
+ sub r3, r0, r1
+ lsls r2, ip, #17
+ bne .L_sub4_and_finish
+ bcs .L_sub3_and_finish
+ lsls ip, ip, #1
+ bne .L_sub2_and_finish
+
+ sub r3, r3, #1
+ b .L_check_size
+
+.L_sub4_and_finish:
+ sub r3, r3, #4
+ b .L_check_size
+
+.L_sub3_and_finish:
+ sub r3, r3, #3
+ b .L_check_size
+
+.L_sub2_and_finish:
+ sub r3, r3, #2
+
+.L_check_size:
+ pld [r1, #0]
+ pld [r1, #64]
+ ldr r0, [sp]
+ cmp r3, lr
+ bhs .L_strcpy_chk_failed
+
+ // Add 1 for copy length to get the string terminator.
+ add r2, r3, #1
+
+#include MEMCPY_BASE
+
+.L_strcpy_chk_failed:
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+4)
+END(__strcpy_chk)
+
+ .data
+error_string:
+ .string "strcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S
index 410b663..537f3de 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -25,79 +25,8 @@
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
-/*
- * Copyright (c) 2013 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-// Prototype: void *memcpy (void *dst, const void *src, size_t count).
-
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
-
- .text
- .syntax unified
- .fpu neon
-
-ENTRY(__memcpy_chk)
- cmp r2, r3
- bhi __memcpy_chk_fail
-
- // Fall through to memcpy...
-END(__memcpy_chk)
-
-ENTRY(memcpy)
- pld [r1, #64]
- push {r0, lr}
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-END(memcpy)
-
-#define MEMCPY_BASE __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__memcpy_chk_fail)
- // Preserve lr for backtrace.
- push {lr}
- .cfi_def_cfa_offset 4
- .cfi_rel_offset lr, 0
- ldr r0, error_message
- ldr r1, error_code
-1:
- add r0, pc
- bl __fortify_chk_fail
-error_code:
- .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
-error_message:
- .word error_string-(1b+8)
-END(__memcpy_chk_fail)
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
- .data
-error_string:
- .string "memcpy: prevented write past end of buffer"
+#include "memcpy_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
index 2a73852..aac737d 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
@@ -53,11 +53,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-ENTRY_PRIVATE(MEMCPY_BASE)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
+.L_memcpy_base:
// Assumes that n >= 0, and dst, src are valid pointers.
// For any sizes less than 832 use the neon code that doesn't
// care about the src alignment. This avoids any checks
@@ -168,12 +164,6 @@ ENTRY_PRIVATE(MEMCPY_BASE)
eor r3, r0, r1
ands r3, r3, #0x3
bne .L_copy_unknown_alignment
-END(MEMCPY_BASE)
-
-ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
// To try and improve performance, stack layout changed,
// i.e., not keeping the stack looking like users expect
@@ -185,7 +175,7 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
strd r6, r7, [sp, #-8]!
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r6, 0
- .cfi_rel_offset r7, 0
+ .cfi_rel_offset r7, 4
strd r8, r9, [sp, #-8]!
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r8, 0
@@ -291,10 +281,28 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
// Restore registers: optimized pop {r0, pc}
ldrd r8, r9, [sp], #8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r8
+ .cfi_restore r9
ldrd r6, r7, [sp], #8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r6
+ .cfi_restore r7
ldrd r4, r5, [sp], #8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
pop {r0, pc}
+ // Put the cfi directives back for the below instructions.
+ .cfi_adjust_cfa_offset 24
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r5, 4
+ .cfi_rel_offset r6, 8
+ .cfi_rel_offset r7, 12
+ .cfi_rel_offset r8, 16
+ .cfi_rel_offset r9, 20
+
.L_dst_not_word_aligned:
// Align dst to word.
rsb ip, ip, #4
@@ -315,4 +323,12 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
// Src is guaranteed to be at least word aligned by this point.
b .L_word_aligned
-END(MEMCPY_BASE_ALIGNED)
+
+ // Undo any cfi directives from above.
+ .cfi_adjust_cfa_offset -24
+ .cfi_restore r4
+ .cfi_restore r5
+ .cfi_restore r6
+ .cfi_restore r7
+ .cfi_restore r8
+ .cfi_restore r9
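
The new .cfi_* churn above follows from CFI state being positional in the
assembly stream: once the fast path pops its callee-saved registers and
returns, the directives must be rewound so unwind info stays accurate for the
code after the pop, which is only reached from branches taken while those
registers were still on the stack. The pattern in miniature (illustrative):

    strd r4, r5, [sp, #-8]!
    .cfi_adjust_cfa_offset 8    // stack frame grew by 8 bytes
    .cfi_rel_offset r4, 0       // r4 saved at sp + 0
    .cfi_rel_offset r5, 4       // r5 saved at sp + 4
    ...
    ldrd r4, r5, [sp], #8       // fast path restores and returns
    .cfi_adjust_cfa_offset -8   // rewind state for the code that follows
    .cfi_restore r4
    .cfi_restore r5
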
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_common.S b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S
new file mode 100644
index 0000000..464fb46
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ .text
+ .syntax unified
+ .fpu neon
+
+ENTRY(__memcpy_chk)
+ cmp r2, r3
+ bhi .L_memcpy_chk_fail
+
+ // Fall through to memcpy...
+END(__memcpy_chk)
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+ENTRY(memcpy)
+ pld [r1, #64]
+ push {r0, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+
+#include MEMCPY_BASE
+
+ // Undo the cfi instructions from above.
+ .cfi_def_cfa_offset 0
+ .cfi_restore r0
+ .cfi_restore lr
+.L_memcpy_chk_fail:
+ // Preserve lr for backtrace.
+ push {lr}
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset lr, 0
+
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+8)
+END(memcpy)
+
+ .data
+error_string:
+ .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/strcat.S b/libc/arch-arm/cortex-a15/bionic/strcat.S
index b95be94..157cc9f 100644
--- a/libc/arch-arm/cortex-a15/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a15/bionic/strcat.S
@@ -70,7 +70,7 @@
.macro m_scan_byte
ldrb r3, [r0]
- cbz r3, strcat_r0_scan_done
+ cbz r3, .L_strcat_r0_scan_done
add r0, #1
.endm // m_scan_byte
@@ -84,10 +84,10 @@ ENTRY(strcat)
// Quick check to see if src is empty.
ldrb r2, [r1]
pld [r1, #0]
- cbnz r2, strcat_continue
+ cbnz r2, .L_strcat_continue
bx lr
-strcat_continue:
+.L_strcat_continue:
// To speed up really small dst strings, unroll checking the first 4 bytes.
m_push
m_scan_byte
@@ -96,95 +96,102 @@ strcat_continue:
m_scan_byte
ands r3, r0, #7
- beq strcat_mainloop
+ beq .L_strcat_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcat_align_to_32
+ beq .L_strcat_align_to_32
ldrb r5, [r0]
- cbz r5, strcat_r0_scan_done
+ cbz r5, .L_strcat_r0_scan_done
add r0, r0, #1
-strcat_align_to_32:
- bcc strcat_align_to_64
+.L_strcat_align_to_32:
+ bcc .L_strcat_align_to_64
ldrb r2, [r0]
- cbz r2, strcat_r0_scan_done
+ cbz r2, .L_strcat_r0_scan_done
add r0, r0, #1
ldrb r4, [r0]
- cbz r4, strcat_r0_scan_done
+ cbz r4, .L_strcat_r0_scan_done
add r0, r0, #1
-strcat_align_to_64:
+.L_strcat_align_to_64:
tst r3, #4
- beq strcat_mainloop
+ beq .L_strcat_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .L_strcat_zero_in_second_register
+ b .L_strcat_mainloop
-strcat_r0_scan_done:
+.L_strcat_r0_scan_done:
// For short copies, hard-code checking the first 8 bytes since this
// new code doesn't win until after about 8 bytes.
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
-
-strcpy_finish:
+ m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r5, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r5, cmd=cbnz, label=.L_strcpy_continue
+
+.L_strcpy_finish:
m_pop
-strcpy_continue:
+.L_strcpy_continue:
ands r3, r0, #7
- beq strcpy_check_src_align
+ beq .L_strcpy_check_src_align
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcpy_align_to_32
+ beq .L_strcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .L_strcpy_complete
-strcpy_align_to_32:
- bcc strcpy_align_to_64
+.L_strcpy_align_to_32:
+ bcc .L_strcpy_align_to_64
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .L_strcpy_complete
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .L_strcpy_complete
-strcpy_align_to_64:
+.L_strcpy_align_to_64:
tst r3, #4
- beq strcpy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
- str r2, [r0], #4
+ beq .L_strcpy_check_src_align
+ // Read one byte at a time since we don't know the src alignment
+ // and we don't want to read into a different page.
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
-strcpy_check_src_align:
+.L_strcpy_check_src_align:
// At this point dst is aligned to a double word, check if src
// is also aligned to a double word.
ands r3, r1, #7
- bne strcpy_unaligned_copy
+ bne .L_strcpy_unaligned_copy
.p2align 2
-strcpy_mainloop:
+.L_strcpy_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
@@ -192,128 +199,128 @@ strcpy_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_mainloop
+ b .L_strcpy_mainloop
-strcpy_complete:
+.L_strcpy_complete:
m_pop
-strcpy_zero_in_first_register:
+.L_strcpy_zero_in_first_register:
lsls lr, ip, #17
- bne strcpy_copy1byte
- bcs strcpy_copy2bytes
+ bne .L_strcpy_copy1byte
+ bcs .L_strcpy_copy2bytes
lsls ip, ip, #1
- bne strcpy_copy3bytes
+ bne .L_strcpy_copy3bytes
-strcpy_copy4bytes:
+.L_strcpy_copy4bytes:
 	// Copy 4 bytes to the destination.
str r2, [r0]
m_pop
-strcpy_copy1byte:
+.L_strcpy_copy1byte:
strb r2, [r0]
m_pop
-strcpy_copy2bytes:
+.L_strcpy_copy2bytes:
strh r2, [r0]
m_pop
-strcpy_copy3bytes:
+.L_strcpy_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_pop
-strcpy_zero_in_second_register:
+.L_strcpy_zero_in_second_register:
lsls lr, ip, #17
- bne strcpy_copy5bytes
- bcs strcpy_copy6bytes
+ bne .L_strcpy_copy5bytes
+ bcs .L_strcpy_copy6bytes
lsls ip, ip, #1
- bne strcpy_copy7bytes
+ bne .L_strcpy_copy7bytes
// Copy 8 bytes to the destination.
strd r2, r3, [r0]
m_pop
-strcpy_copy5bytes:
+.L_strcpy_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
m_pop
-strcpy_copy6bytes:
+.L_strcpy_copy6bytes:
str r2, [r0], #4
strh r3, [r0]
m_pop
-strcpy_copy7bytes:
+.L_strcpy_copy7bytes:
str r2, [r0], #4
strh r3, [r0], #2
lsr r3, #16
strb r3, [r0]
m_pop
-strcpy_unaligned_copy:
+.L_strcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
tbb [pc, r3]
-strcpy_unaligned_branchtable:
+.L_strcpy_unaligned_branchtable:
.byte 0
- .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign7 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign6 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign5 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign4 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign3 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign2 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign1 - .L_strcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.L_strcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldrb r3, [r1]
- cbz r3, strcpy_unalign7_copy5bytes
+ cbz r3, .L_strcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
- cbz r4, strcpy_unalign7_copy6bytes
+ cbz r4, .L_strcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
- cbz r5, strcpy_unalign7_copy7bytes
+ cbz r5, .L_strcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
strd r2, r3, [r0], #8
- beq strcpy_unalign_return
- b strcpy_unalign7
+ beq .L_strcpy_unalign_return
+ b .L_strcpy_unalign7
-strcpy_unalign7_copy5bytes:
+.L_strcpy_unalign7_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
-strcpy_unalign_return:
+.L_strcpy_unalign_return:
m_pop
-strcpy_unalign7_copy6bytes:
+.L_strcpy_unalign7_copy6bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
m_pop
-strcpy_unalign7_copy7bytes:
+.L_strcpy_unalign7_copy7bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
@@ -322,41 +329,41 @@ strcpy_unalign7_copy7bytes:
.p2align 2
// Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.L_strcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .L_strcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
- cbz r5, strcpy_unalign_copy6bytes
+ cbz r5, .L_strcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
- beq strcpy_copy7bytes
+ beq .L_strcpy_copy7bytes
lsrs ip, r3, #24
strd r2, r3, [r0], #8
- beq strcpy_unalign_return
- b strcpy_unalign6
+ beq .L_strcpy_unalign_return
+ b .L_strcpy_unalign6
.p2align 2
// Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.L_strcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .L_strcpy_unalign_copy5bytes
ldr r3, [r1], #4
@@ -365,17 +372,17 @@ strcpy_unalign5:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign5
+ b .L_strcpy_unalign5
-strcpy_unalign_copy5bytes:
+.L_strcpy_unalign_copy5bytes:
str r2, [r0], #4
strb r4, [r0]
m_pop
-strcpy_unalign_copy6bytes:
+.L_strcpy_unalign_copy6bytes:
str r2, [r0], #4
strb r4, [r0], #1
strb r5, [r0]
@@ -383,13 +390,13 @@ strcpy_unalign_copy6bytes:
.p2align 2
// Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.L_strcpy_unalign4:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldr r3, [r1], #4
pld [r1, #64]
@@ -397,20 +404,20 @@ strcpy_unalign4:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign4
+ b .L_strcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.L_strcpy_unalign3:
ldrb r2, [r1]
- cbz r2, strcpy_unalign3_copy1byte
+ cbz r2, .L_strcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
- cbz r3, strcpy_unalign3_copy2bytes
+ cbz r3, .L_strcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
- cbz r4, strcpy_unalign3_copy3bytes
+ cbz r4, .L_strcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -418,26 +425,26 @@ strcpy_unalign3:
pld [r1, #64]
lsrs lr, r2, #24
- beq strcpy_copy4bytes
+ beq .L_strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign3
+ b .L_strcpy_unalign3
-strcpy_unalign3_copy1byte:
+.L_strcpy_unalign3_copy1byte:
strb r2, [r0]
m_pop
-strcpy_unalign3_copy2bytes:
+.L_strcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_pop
-strcpy_unalign3_copy3bytes:
+.L_strcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
@@ -445,34 +452,34 @@ strcpy_unalign3_copy3bytes:
.p2align 2
// Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.L_strcpy_unalign2:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .L_strcpy_unalign_copy1byte
ldrb r4, [r1, #1]
- cbz r4, strcpy_unalign_copy2bytes
+ cbz r4, .L_strcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
- beq strcpy_copy3bytes
+ beq .L_strcpy_copy3bytes
lsrs ip, r2, #24
- beq strcpy_copy4bytes
+ beq .L_strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign2
+ b .L_strcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.L_strcpy_unalign1:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .L_strcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -482,27 +489,27 @@ strcpy_unalign1:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign1
+ b .L_strcpy_unalign1
-strcpy_unalign_copy1byte:
+.L_strcpy_unalign_copy1byte:
strb r2, [r0]
m_pop
-strcpy_unalign_copy2bytes:
+.L_strcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r4, [r0]
m_pop
.p2align 2
-strcat_mainloop:
+.L_strcat_mainloop:
ldrd r2, r3, [r0], #8
pld [r0, #64]
@@ -510,59 +517,59 @@ strcat_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcat_zero_in_first_register
+ bne .L_strcat_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .L_strcat_zero_in_second_register
+ b .L_strcat_mainloop
-strcat_zero_in_first_register:
+.L_strcat_zero_in_first_register:
// Prefetch the src now, it's going to be used soon.
pld [r1, #0]
lsls lr, ip, #17
- bne strcat_sub8
- bcs strcat_sub7
+ bne .L_strcat_sub8
+ bcs .L_strcat_sub7
lsls ip, ip, #1
- bne strcat_sub6
+ bne .L_strcat_sub6
sub r0, r0, #5
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub8:
+.L_strcat_sub8:
sub r0, r0, #8
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub7:
+.L_strcat_sub7:
sub r0, r0, #7
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub6:
+.L_strcat_sub6:
sub r0, r0, #6
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_zero_in_second_register:
+.L_strcat_zero_in_second_register:
// Prefetch the src now, it's going to be used soon.
pld [r1, #0]
lsls lr, ip, #17
- bne strcat_sub4
- bcs strcat_sub3
+ bne .L_strcat_sub4
+ bcs .L_strcat_sub3
lsls ip, ip, #1
- bne strcat_sub2
+ bne .L_strcat_sub2
sub r0, r0, #1
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub4:
+.L_strcat_sub4:
sub r0, r0, #4
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub3:
+.L_strcat_sub3:
sub r0, r0, #3
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub2:
+.L_strcat_sub2:
sub r0, r0, #2
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
END(strcat)
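
Most of this file's diff is a mechanical rename of branch targets from
"strcat_..." to ".L_strcat_...". On ELF targets the assembler treats names
beginning with ".L" as local labels and omits them from the symbol table, so
profilers, unwinders, and backtraces attribute all of this code to the strcat
symbol rather than to dozens of internal labels. The convention in outline
(illustrative):

    ENTRY(strcat)           // public symbol, visible to tools
    .L_strcat_mainloop:     // ".L" prefix: assembler-local, no symbol emitted
        ...
        b .L_strcat_mainloop
    END(strcat)
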
diff --git a/libc/arch-arm/cortex-a15/bionic/string_copy.S b/libc/arch-arm/cortex-a15/bionic/string_copy.S
index 20f0e91..92d1c98 100644
--- a/libc/arch-arm/cortex-a15/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a15/bionic/string_copy.S
@@ -149,13 +149,20 @@ ENTRY(strcpy)
.Lstringcopy_align_to_64:
tst r3, #4
beq .Lstringcopy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .Lstringcopy_zero_in_first_register
- str r2, [r0], #4
+ // Read one byte at a time since we don't have any idea about the alignment
+ // of the source and we don't want to read into a different page.
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
.Lstringcopy_check_src_align:
// At this point dst is aligned to a double word, check if src
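
This hunk makes the same fix as the strcat.S change above: during the phase
that aligns dst, src's alignment is still unknown, so the old 4-byte load could
begin before the terminating NUL yet extend into the next page, which may be
unmapped, and fault. Byte loads stop exactly at the terminator. The equivalent
logic in C (a sketch, not the actual implementation):

    #include <stdint.h>

    // Copy single bytes until dst is 8-byte aligned, never loading past the
    // string's terminating NUL. Returns 1 if the terminator was copied.
    static int copy_until_dst_aligned(char **dst, const char **src) {
      while (((uintptr_t)*dst & 7) != 0) {
        if ((*(*dst)++ = *(*src)++) == '\0') return 1;
      }
      return 0;
    }
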
diff --git a/libc/arch-arm/cortex-a15/bionic/strlen.S b/libc/arch-arm/cortex-a15/bionic/strlen.S
index 9a0ce62..4fd6284 100644
--- a/libc/arch-arm/cortex-a15/bionic/strlen.S
+++ b/libc/arch-arm/cortex-a15/bionic/strlen.S
@@ -65,38 +65,38 @@ ENTRY(strlen)
mov r1, r0
ands r3, r0, #7
- beq mainloop
+ beq .L_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq align_to_32
+ beq .L_align_to_32
ldrb r2, [r1], #1
- cbz r2, update_count_and_return
+ cbz r2, .L_update_count_and_return
-align_to_32:
- bcc align_to_64
+.L_align_to_32:
+ bcc .L_align_to_64
ands ip, r3, #2
- beq align_to_64
+ beq .L_align_to_64
ldrb r2, [r1], #1
- cbz r2, update_count_and_return
+ cbz r2, .L_update_count_and_return
ldrb r2, [r1], #1
- cbz r2, update_count_and_return
+ cbz r2, .L_update_count_and_return
-align_to_64:
+.L_align_to_64:
tst r3, #4
- beq mainloop
+ beq .L_mainloop
ldr r3, [r1], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne zero_in_second_register
+ bne .L_zero_in_second_register
.p2align 2
-mainloop:
+.L_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
@@ -104,62 +104,62 @@ mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne zero_in_first_register
+ bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne zero_in_second_register
- b mainloop
+ bne .L_zero_in_second_register
+ b .L_mainloop
-update_count_and_return:
+.L_update_count_and_return:
sub r0, r1, r0
sub r0, r0, #1
bx lr
-zero_in_first_register:
+.L_zero_in_first_register:
sub r0, r1, r0
lsls r3, ip, #17
- bne sub8_and_return
- bcs sub7_and_return
+ bne .L_sub8_and_return
+ bcs .L_sub7_and_return
lsls ip, ip, #1
- bne sub6_and_return
+ bne .L_sub6_and_return
sub r0, r0, #5
bx lr
-sub8_and_return:
+.L_sub8_and_return:
sub r0, r0, #8
bx lr
-sub7_and_return:
+.L_sub7_and_return:
sub r0, r0, #7
bx lr
-sub6_and_return:
+.L_sub6_and_return:
sub r0, r0, #6
bx lr
-zero_in_second_register:
+.L_zero_in_second_register:
sub r0, r1, r0
lsls r3, ip, #17
- bne sub4_and_return
- bcs sub3_and_return
+ bne .L_sub4_and_return
+ bcs .L_sub3_and_return
lsls ip, ip, #1
- bne sub2_and_return
+ bne .L_sub2_and_return
sub r0, r0, #1
bx lr
-sub4_and_return:
+.L_sub4_and_return:
sub r0, r0, #4
bx lr
-sub3_and_return:
+.L_sub3_and_return:
sub r0, r0, #3
bx lr
-sub2_and_return:
+.L_sub2_and_return:
sub r0, r0, #2
bx lr
END(strlen)
diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk
index 6fa3270..202a3bf 100644
--- a/libc/arch-arm/cortex-a15/cortex-a15.mk
+++ b/libc/arch-arm/cortex-a15/cortex-a15.mk
@@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \
arch-arm/cortex-a15/bionic/strlen.S \
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
libc_bionic_src_files_arm += \
diff --git a/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk
new file mode 100644
index 0000000..5d7efc6
--- /dev/null
+++ b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk
@@ -0,0 +1,22 @@
+# This file represents the best optimized routines that are the middle
+# ground when running on a big/little system that is cortex-a57/cortex-a53.
+# The cortex-a7 optimized routines and the cortex-a53 optimized routines
+# decrease performance on cortex-a57 processors by as much as 20%.
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S \
+ arch-arm/cortex-a15/bionic/memset.S \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/__strcat_chk.S \
+ arch-arm/cortex-a15/bionic/strcmp.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+ arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memcmp.S \
+ arch-arm/generic/bionic/memchr.S
+
+libc_bionic_src_files_arm += \
+ arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S
new file mode 100644
index 0000000..c5bc98a
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/__strcat_chk_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S
new file mode 100644
index 0000000..1f8945d
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/__strcpy_chk_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy.S b/libc/arch-arm/cortex-a53/bionic/memcpy.S
new file mode 100644
index 0000000..664f574
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/memcpy.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/memcpy_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy_base.S b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S
new file mode 100644
index 0000000..2749fc8
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+.L_memcpy_base:
+ // Assumes that n >= 0, and dst, src are valid pointers.
+ cmp r2, #16
+ blo .L_copy_less_than_16_unknown_align
+
+.L_copy_unknown_alignment:
+ // Unknown alignment of src and dst.
+ // Assumes that the first few bytes have already been prefetched.
+
+ // Align destination to 128 bits. The mainloop store instructions
+ // require this alignment or they will throw an exception.
+ rsb r3, r0, #0
+ ands r3, r3, #0xF
+ beq 2f
+
+ // Copy up to 15 bytes (count in r3).
+ sub r2, r2, r3
+ movs ip, r3, lsl #31
+
+ itt mi
+ ldrbmi lr, [r1], #1
+ strbmi lr, [r0], #1
+ itttt cs
+ ldrbcs ip, [r1], #1
+ ldrbcs lr, [r1], #1
+ strbcs ip, [r0], #1
+ strbcs lr, [r0], #1
+
+ movs ip, r3, lsl #29
+ bge 1f
+ // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1: bcc 2f
+ // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0, :64]!
+
+2: // Make sure we have at least 64 bytes to copy.
+ subs r2, r2, #64
+ blo 2f
+
+1: // The main loop copies 64 bytes at a time.
+ vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d4 - d7}, [r1]!
+ subs r2, r2, #64
+ vstmia r0!, {d0 - d7}
+ pld [r1, #(64*10)]
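+ // Stays ten 64-byte cache lines (640 bytes) ahead of the loads so the
+ // stream is prefetched before the next iterations need it.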
+ bhs 1b
+
+2: // Fix-up the remaining count and make sure we have >= 32 bytes left.
+ adds r2, r2, #32
+ blo 3f
+
+ // 32 bytes. These cache lines were already preloaded.
+ vld1.8 {d0 - d3}, [r1]!
+ sub r2, r2, #32
+ vst1.8 {d0 - d3}, [r0, :128]!
+3: // Less than 32 left.
+ add r2, r2, #32
+ tst r2, #0x10
+ beq .L_copy_less_than_16_unknown_align
+ // Copies 16 bytes, destination 128 bits aligned.
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [r0, :128]!
+
+.L_copy_less_than_16_unknown_align:
+ // Copy up to 15 bytes (count in r2).
+ movs ip, r2, lsl #29
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1: bge 2f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+2: // Copy 0 to 4 bytes.
+ lsls r2, r2, #31
+ itt ne
+ ldrbne lr, [r1], #1
+ strbne lr, [r0], #1
+ itttt cs
+ ldrbcs ip, [r1], #1
+ ldrbcs lr, [r1]
+ strbcs ip, [r0], #1
+ strbcs lr, [r0]
+
+ pop {r0, pc}
diff --git a/libc/arch-arm/cortex-a53/cortex-a53.mk b/libc/arch-arm/cortex-a53/cortex-a53.mk
index b5c337c..14aaa71 100644
--- a/libc/arch-arm/cortex-a53/cortex-a53.mk
+++ b/libc/arch-arm/cortex-a53/cortex-a53.mk
@@ -1 +1,21 @@
-include bionic/libc/arch-arm/cortex-a7/cortex-a7.mk
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a53/bionic/memcpy.S \
+ arch-arm/cortex-a53/bionic/__strcat_chk.S \
+ arch-arm/cortex-a53/bionic/__strcpy_chk.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a7/bionic/memset.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/strcmp.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
+ arch-arm/generic/bionic/memcmp.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a7/bionic/memset.S b/libc/arch-arm/cortex-a7/bionic/memset.S
new file mode 100644
index 0000000..6365b06
--- /dev/null
+++ b/libc/arch-arm/cortex-a7/bionic/memset.S
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ /*
+ * Optimized memset() for ARM.
+ *
+ * memset() returns its first argument.
+ */
+
+ .fpu neon
+ .syntax unified
+
+ENTRY(__memset_chk)
+ cmp r2, r3
+ bls .L_done
+
+ // Preserve lr for backtrace.
+ push {lr}
+ .cfi_def_cfa_offset 4
+ .cfi_rel_offset lr, 0
+
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+8)
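+ // The 8 in (1b+8) matches the ARM-state PC read-ahead: at the add above,
+ // pc reads as that instruction's address + 8, so r0 ends up at error_string.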
+END(__memset_chk)
+
+ENTRY(bzero)
+ mov r2, r1
+ mov r1, #0
+.L_done:
+ // Fall through to memset...
+END(bzero)
+
+ENTRY(memset)
+ mov r3, r0
+ // At this point only d0, d1 are going to be used below.
+ vdup.8 q0, r1
+ cmp r2, #16
+ blo .L_set_less_than_16_unknown_align
+
+.L_check_alignment:
+ // Align destination to a double word to avoid the store crossing
+ // a cache line boundary.
+ ands ip, r3, #7
+ bne .L_do_double_word_align
+
+.L_double_word_aligned:
+ // Duplicate so the less-than-64 path can use d2, d3.
+ vmov q1, q0
+ subs r2, #64
+ blo .L_set_less_than_64
+
+ // Duplicate the copy value so that we can store 64 bytes at a time.
+ vmov q2, q0
+ vmov q3, q0
+
+1: // Main loop stores 64 bytes at a time.
+ subs r2, #64
+ vstmia r3!, {d0 - d7}
+ bge 1b
+
+.L_set_less_than_64:
+ // Restore r2 to the count of bytes left to set.
+ add r2, #64
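+ // lsl #27 moves the 32s bit of the count into C and the 16s bit into N,
+ // so the branches below select the remaining sizes without extra compares.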
+ lsls ip, r2, #27
+ bcc .L_set_less_than_32
+ // Set 32 bytes.
+ vstmia r3!, {d0 - d3}
+
+.L_set_less_than_32:
+ bpl .L_set_less_than_16
+ // Set 16 bytes.
+ vstmia r3!, {d0, d1}
+
+.L_set_less_than_16:
+ // Less than 16 bytes to set.
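+ // lsl #29 puts the 8s bit of the count into C and the 4s bit into N.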
+ lsls ip, r2, #29
+ bcc .L_set_less_than_8
+
+ // Set 8 bytes.
+ vstmia r3!, {d0}
+
+.L_set_less_than_8:
+ bpl .L_set_less_than_4
+ // Set 4 bytes
+ vst1.32 {d0[0]}, [r3]!
+
+.L_set_less_than_4:
+ lsls ip, r2, #31
+ it ne
+ strbne r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3]
+ bx lr
+
+.L_do_double_word_align:
+ rsb ip, ip, #8
+ sub r2, r2, ip
+
+ // Do this comparison now, otherwise we'll need to save a
+ // register to the stack since we've used all available
+ // registers.
+ cmp ip, #4
+ blo 1f
+
+ // Need to do a four byte copy.
+ movs ip, ip, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+ vst1.32 {d0[0]}, [r3]!
+ b .L_double_word_aligned
+
+1:
+ // No four byte copy.
+ movs ip, ip, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+ b .L_double_word_aligned
+
+.L_set_less_than_16_unknown_align:
+ // Set up to 15 bytes.
+ movs ip, r2, lsl #29
+ bcc 1f
+ vst1.8 {d0}, [r3]!
+1: bge 2f
+ vst1.32 {d0[0]}, [r3]!
+2: movs ip, r2, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+ bx lr
+END(memset)
+
+ .data
+error_string:
+ .string "memset: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a7/cortex-a7.mk b/libc/arch-arm/cortex-a7/cortex-a7.mk
index 9af03d9..3629a57 100644
--- a/libc/arch-arm/cortex-a7/cortex-a7.mk
+++ b/libc/arch-arm/cortex-a7/cortex-a7.mk
@@ -1 +1,19 @@
-include bionic/libc/arch-arm/cortex-a15/cortex-a15.mk
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a7/bionic/memset.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/__strcat_chk.S \
+ arch-arm/cortex-a15/bionic/strcmp.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+ arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
+ arch-arm/generic/bionic/memcmp.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
index 5e81305..6ab5a69 100644
--- a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
@@ -133,8 +133,7 @@ ENTRY_PRIVATE(MEMCPY_BASE)
strbcs ip, [r0], #1
strbcs lr, [r0], #1
- ldmfd sp!, {r0, lr}
- bx lr
+ ldmfd sp!, {r0, pc}
END(MEMCPY_BASE)
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S
index 8ee6ac2..b39fcc4 100644
--- a/libc/arch-arm/cortex-a9/bionic/memset.S
+++ b/libc/arch-arm/cortex-a9/bionic/memset.S
@@ -69,12 +69,9 @@ END(bzero)
ENTRY(memset)
// The neon memset only wins for less than 132.
cmp r2, #132
- bhi __memset_large_copy
-
- stmfd sp!, {r0}
- .cfi_def_cfa_offset 4
- .cfi_rel_offset r0, 0
+ bhi .L_memset_large_copy
+ mov r3, r0
vdup.8 q0, r1
/* make sure we have at least 32 bytes to write */
@@ -84,7 +81,7 @@ ENTRY(memset)
1: /* The main loop writes 32 bytes at a time */
subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0]!
+ vst1.8 {d0 - d3}, [r3]!
bhs 1b
2: /* less than 32 left */
@@ -93,22 +90,20 @@ ENTRY(memset)
beq 3f
// writes 16 bytes, 128-bits aligned
- vst1.8 {d0, d1}, [r0]!
+ vst1.8 {d0, d1}, [r3]!
3: /* write up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
- vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r3]!
1: bge 2f
- vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d0[0]}, [r3]!
2: movs ip, r2, lsl #31
- strbmi r1, [r0], #1
- strbcs r1, [r0], #1
- strbcs r1, [r0], #1
- ldmfd sp!, {r0}
+ strbmi r1, [r3], #1
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
bx lr
-END(memset)
-ENTRY_PRIVATE(__memset_large_copy)
+.L_memset_large_copy:
/* compute the offset to align the destination
* offset = (4-(src&3))&3 = -src & 3
*/
@@ -136,8 +131,7 @@ ENTRY_PRIVATE(__memset_large_copy)
strbcs r1, [r0], #1
strbmi r1, [r0], #1
subs r2, r2, r3
- popls {r0, r4-r7, lr} /* return */
- bxls lr
+ popls {r0, r4-r7, pc} /* return */
/* align the destination to a cache-line */
mov r12, r1
@@ -180,9 +174,8 @@ ENTRY_PRIVATE(__memset_large_copy)
strhmi r1, [r0], #2
movs r2, r2, lsl #2
strbcs r1, [r0]
- ldmfd sp!, {r0, r4-r7, lr}
- bx lr
-END(__memset_large_copy)
+ ldmfd sp!, {r0, r4-r7, pc}
+END(memset)
.data
error_string:
diff --git a/libc/arch-arm/cortex-a9/bionic/strcat.S b/libc/arch-arm/cortex-a9/bionic/strcat.S
index f5a855e..9077a74 100644
--- a/libc/arch-arm/cortex-a9/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a9/bionic/strcat.S
@@ -70,7 +70,7 @@
.macro m_scan_byte
ldrb r3, [r0]
- cbz r3, strcat_r0_scan_done
+ cbz r3, .Lstrcat_r0_scan_done
add r0, #1
.endm // m_scan_byte
@@ -84,10 +84,10 @@ ENTRY(strcat)
// Quick check to see if src is empty.
ldrb r2, [r1]
pld [r1, #0]
- cbnz r2, strcat_continue
+ cbnz r2, .Lstrcat_continue
bx lr
-strcat_continue:
+.Lstrcat_continue:
// To speed up really small dst strings, unroll checking the first 4 bytes.
m_push
m_scan_byte
@@ -96,10 +96,10 @@ strcat_continue:
m_scan_byte
ands r3, r0, #7
- bne strcat_align_src
+ bne .Lstrcat_align_src
.p2align 2
-strcat_mainloop:
+.Lstrcat_mainloop:
ldmia r0!, {r2, r3}
pld [r0, #64]
@@ -107,28 +107,28 @@ strcat_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcat_zero_in_first_register
+ bne .Lstrcat_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .Lstrcat_zero_in_second_register
+ b .Lstrcat_mainloop
-strcat_zero_in_first_register:
+.Lstrcat_zero_in_first_register:
sub r0, r0, #4
-strcat_zero_in_second_register:
+.Lstrcat_zero_in_second_register:
// Check for zero in byte 0.
tst ip, #0x80
it ne
subne r0, r0, #4
- bne strcat_r0_scan_done
+ bne .Lstrcat_r0_scan_done
// Check for zero in byte 1.
tst ip, #0x8000
it ne
subne r0, r0, #3
- bne strcat_r0_scan_done
+ bne .Lstrcat_r0_scan_done
// Check for zero in byte 2.
tst ip, #0x800000
it ne
@@ -137,33 +137,33 @@ strcat_zero_in_second_register:
// Zero is in byte 3.
subeq r0, r0, #1
-strcat_r0_scan_done:
+.Lstrcat_r0_scan_done:
// Unroll the first 8 bytes that will be copied.
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
-
-strcpy_finish:
+ m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r5, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r5, cmd=cbnz, label=.Lstrcpy_continue
+
+.Lstrcpy_finish:
m_ret inst=pop
-strcpy_continue:
+.Lstrcpy_continue:
pld [r1, #0]
ands r3, r0, #7
- bne strcpy_align_dst
+ bne .Lstrcpy_align_dst
-strcpy_check_src_align:
+.Lstrcpy_check_src_align:
// At this point dst is aligned to a double word, check if src
// is also aligned to a double word.
ands r3, r1, #7
- bne strcpy_unaligned_copy
+ bne .Lstrcpy_unaligned_copy
.p2align 2
-strcpy_mainloop:
+.Lstrcpy_mainloop:
ldmia r1!, {r2, r3}
pld [r1, #64]
@@ -171,17 +171,17 @@ strcpy_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_mainloop
+ b .Lstrcpy_mainloop
-strcpy_zero_in_first_register:
+.Lstrcpy_zero_in_first_register:
lsls lr, ip, #17
itt ne
strbne r2, [r0]
@@ -198,7 +198,7 @@ strcpy_zero_in_first_register:
strb r3, [r0]
m_ret inst=pop
-strcpy_zero_in_second_register:
+.Lstrcpy_zero_in_second_register:
lsls lr, ip, #17
ittt ne
stmiane r0!, {r2}
@@ -218,18 +218,18 @@ strcpy_zero_in_second_register:
strb r4, [r0]
m_ret inst=pop
-strcpy_align_dst:
+.Lstrcpy_align_dst:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcpy_align_to_32
+ beq .Lstrcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .Lstrcpy_complete
-strcpy_align_to_32:
- bcc strcpy_align_to_64
+.Lstrcpy_align_to_32:
+ bcc .Lstrcpy_align_to_64
ldrb r4, [r1], #1
strb r4, [r0], #1
@@ -242,76 +242,83 @@ strcpy_align_to_32:
it eq
m_ret inst=popeq
-strcpy_align_to_64:
+.Lstrcpy_align_to_64:
tst r3, #4
- beq strcpy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
- stmia r0!, {r2}
- b strcpy_check_src_align
+ beq .Lstrcpy_check_src_align
+ // Read one byte at a time since we don't know the src alignment
+ // and we don't want to read into a different page.
+ ldrb r4, [r1], #1
+ strb r4, [r0], #1
+ cbz r4, .Lstrcpy_complete
+ ldrb r5, [r1], #1
+ strb r5, [r0], #1
+ cbz r5, .Lstrcpy_complete
+ ldrb r4, [r1], #1
+ strb r4, [r0], #1
+ cbz r4, .Lstrcpy_complete
+ ldrb r5, [r1], #1
+ strb r5, [r0], #1
+ cbz r5, .Lstrcpy_complete
+ b .Lstrcpy_check_src_align
-strcpy_complete:
+.Lstrcpy_complete:
m_ret inst=pop
-strcpy_unaligned_copy:
+.Lstrcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
tbb [pc, r3]
-strcpy_unaligned_branchtable:
+.Lstrcpy_unaligned_branchtable:
.byte 0
- .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign7 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign6 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign5 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign4 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign3 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign2 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign1 - .Lstrcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.Lstrcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldrb r3, [r1]
- cbz r3, strcpy_unalign7_copy5bytes
+ cbz r3, .Lstrcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
- cbz r4, strcpy_unalign7_copy6bytes
+ cbz r4, .Lstrcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
- cbz r5, strcpy_unalign7_copy7bytes
+ cbz r5, .Lstrcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
stmia r0!, {r2, r3}
- beq strcpy_unalign_return
- b strcpy_unalign7
+ beq .Lstrcpy_unalign_return
+ b .Lstrcpy_unalign7
-strcpy_unalign7_copy5bytes:
+.Lstrcpy_unalign7_copy5bytes:
stmia r0!, {r2}
strb r3, [r0]
-strcpy_unalign_return:
+.Lstrcpy_unalign_return:
m_ret inst=pop
-strcpy_unalign7_copy6bytes:
+.Lstrcpy_unalign7_copy6bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
m_ret inst=pop
-strcpy_unalign7_copy7bytes:
+.Lstrcpy_unalign7_copy7bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
@@ -320,30 +327,30 @@ strcpy_unalign7_copy7bytes:
.p2align 2
// Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.Lstrcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .Lstrcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
- cbz r5, strcpy_unalign_copy6bytes
+ cbz r5, .Lstrcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
- beq strcpy_unalign6_copy7bytes
+ beq .Lstrcpy_unalign6_copy7bytes
lsrs ip, r3, #24
stmia r0!, {r2, r3}
- beq strcpy_unalign_return
- b strcpy_unalign6
+ beq .Lstrcpy_unalign_return
+ b .Lstrcpy_unalign6
-strcpy_unalign6_copy7bytes:
+.Lstrcpy_unalign6_copy7bytes:
stmia r0!, {r2}
strh r3, [r0], #2
lsr r3, #16
@@ -352,16 +359,16 @@ strcpy_unalign6_copy7bytes:
.p2align 2
// Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.Lstrcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .Lstrcpy_unalign_copy5bytes
ldr r3, [r1], #4
@@ -370,17 +377,17 @@ strcpy_unalign5:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign5
+ b .Lstrcpy_unalign5
-strcpy_unalign_copy5bytes:
+.Lstrcpy_unalign_copy5bytes:
stmia r0!, {r2}
strb r4, [r0]
m_ret inst=pop
-strcpy_unalign_copy6bytes:
+.Lstrcpy_unalign_copy6bytes:
stmia r0!, {r2}
strb r4, [r0], #1
strb r5, [r0]
@@ -388,13 +395,13 @@ strcpy_unalign_copy6bytes:
.p2align 2
// Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.Lstrcpy_unalign4:
ldmia r1!, {r2}
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldmia r1!, {r3}
pld [r1, #64]
@@ -402,20 +409,20 @@ strcpy_unalign4:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign4
+ b .Lstrcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.Lstrcpy_unalign3:
ldrb r2, [r1]
- cbz r2, strcpy_unalign3_copy1byte
+ cbz r2, .Lstrcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
- cbz r3, strcpy_unalign3_copy2bytes
+ cbz r3, .Lstrcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
- cbz r4, strcpy_unalign3_copy3bytes
+ cbz r4, .Lstrcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -423,26 +430,26 @@ strcpy_unalign3:
pld [r1, #64]
lsrs lr, r2, #24
- beq strcpy_unalign_copy4bytes
+ beq .Lstrcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign3
+ b .Lstrcpy_unalign3
-strcpy_unalign3_copy1byte:
+.Lstrcpy_unalign3_copy1byte:
strb r2, [r0]
m_ret inst=pop
-strcpy_unalign3_copy2bytes:
+.Lstrcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
-strcpy_unalign3_copy3bytes:
+.Lstrcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
@@ -450,34 +457,34 @@ strcpy_unalign3_copy3bytes:
.p2align 2
// Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.Lstrcpy_unalign2:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .Lstrcpy_unalign_copy1byte
ldrb r3, [r1, #1]
- cbz r3, strcpy_unalign_copy2bytes
+ cbz r3, .Lstrcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
- beq strcpy_unalign_copy3bytes
+ beq .Lstrcpy_unalign_copy3bytes
lsrs ip, r2, #24
- beq strcpy_unalign_copy4bytes
+ beq .Lstrcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign2
+ b .Lstrcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.Lstrcpy_unalign1:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .Lstrcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -487,62 +494,62 @@ strcpy_unalign1:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign1
+ b .Lstrcpy_unalign1
-strcpy_unalign_copy1byte:
+.Lstrcpy_unalign_copy1byte:
strb r2, [r0]
m_ret inst=pop
-strcpy_unalign_copy2bytes:
+.Lstrcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
-strcpy_unalign_copy3bytes:
+.Lstrcpy_unalign_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_ret inst=pop
-strcpy_unalign_copy4bytes:
+.Lstrcpy_unalign_copy4bytes:
stmia r0, {r2}
m_ret inst=pop
-strcat_align_src:
+.Lstrcat_align_src:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcat_align_to_32
+ beq .Lstrcat_align_to_32
ldrb r2, [r0], #1
- cbz r2, strcat_r0_update
+ cbz r2, .Lstrcat_r0_update
-strcat_align_to_32:
- bcc strcat_align_to_64
+.Lstrcat_align_to_32:
+ bcc .Lstrcat_align_to_64
ldrb r2, [r0], #1
- cbz r2, strcat_r0_update
+ cbz r2, .Lstrcat_r0_update
ldrb r2, [r0], #1
- cbz r2, strcat_r0_update
+ cbz r2, .Lstrcat_r0_update
-strcat_align_to_64:
+.Lstrcat_align_to_64:
tst r3, #4
- beq strcat_mainloop
+ beq .Lstrcat_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .Lstrcat_zero_in_second_register
+ b .Lstrcat_mainloop
-strcat_r0_update:
+.Lstrcat_r0_update:
sub r0, r0, #1
- b strcat_r0_scan_done
+ b .Lstrcat_r0_scan_done
END(strcat)
diff --git a/libc/arch-arm/cortex-a9/bionic/string_copy.S b/libc/arch-arm/cortex-a9/bionic/string_copy.S
index caf5a11..642db0f 100644
--- a/libc/arch-arm/cortex-a9/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a9/bionic/string_copy.S
@@ -244,13 +244,20 @@ ENTRY(strcpy)
.Lstringcopy_align_to_64:
tst r3, #4
beq .Lstringcopy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .Lstringcopy_zero_in_first_register
- stmia r0!, {r2}
+ // Read one byte at a time since we don't have any idea about the alignment
+ // of the source and we don't want to read into a different page.
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
b .Lstringcopy_check_src_align
.Lstringcopy_complete:
diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk
index 7b38de1..db4bcc7 100644
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk
@@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \
arch-arm/cortex-a9/bionic/strlen.S \
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
libc_bionic_src_files_arm += \
diff --git a/libc/arch-arm/denver/denver.mk b/libc/arch-arm/denver/denver.mk
index 5fddf95..e81f8c7 100644
--- a/libc/arch-arm/denver/denver.mk
+++ b/libc/arch-arm/denver/denver.mk
@@ -1,4 +1,5 @@
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
arch-arm/denver/bionic/memcpy.S \
arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/generic/bionic/memchr.S b/libc/arch-arm/generic/bionic/memchr.S
new file mode 100644
index 0000000..cb00d82
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/memchr.S
@@ -0,0 +1,155 @@
+/* Copyright (c) 2010-2015, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memchr routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors. It has a fast path for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+#include <private/bionic_asm.h>
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ENTRY(memchr)
+ .p2align 4,,15
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+ @ At this point we are aligned and know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
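+ @ Each pass XORs 8 bytes against the broadcast target so matching bytes
+ @ become 0x00, then uadd8/sel turns those bytes into 0xff and everything
+ @ else into 0x00, letting a single cbnz detect a hit in either word.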
+15:
+ ldrd r5,r6,[r0],#8
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 @ chained... bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (flags from the subs above) if we haven't run out of bytes, go around again
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ r2 = bytes remaining after the double words were handled
+
+20:
+ cbz r2, 40f @ zero length, or we already hit the end: not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the hit is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
+END(memchr)
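The fast path above broadcasts the target byte, XORs it against each loaded word, and then needs a per-byte zero test; uadd8/sel does that in two instructions. A portable C sketch of the equivalent logic, building on the zero-byte idiom shown after strlen.S (names illustrative, not from this patch):

    #include <stdint.h>

    // Non-zero iff some byte of w equals c: XOR against the broadcast value
    // turns matching bytes into 0x00, then the usual zero-byte test finds them.
    static inline uint32_t word_has_byte(uint32_t w, uint8_t c) {
        uint32_t x = w ^ (0x01010101u * c);   // broadcast c to all 4 bytes
        return (x - 0x01010101u) & ~x & 0x80808080u;
    }
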
diff --git a/libc/arch-arm/generic/bionic/memcmp.S b/libc/arch-arm/generic/bionic/memcmp.S
index c78dbd4..6643d55 100644
--- a/libc/arch-arm/generic/bionic/memcmp.S
+++ b/libc/arch-arm/generic/bionic/memcmp.S
@@ -221,8 +221,7 @@ ENTRY(memcmp)
bne 8b
9: /* restore registers and return */
- ldmfd sp!, {r4, lr}
- bx lr
+ ldmfd sp!, {r4, pc}
10: /* process less than 12 bytes */
cmp r2, #0
diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S
index ea5a399..65cba4c 100644
--- a/libc/arch-arm/generic/bionic/memcpy.S
+++ b/libc/arch-arm/generic/bionic/memcpy.S
@@ -194,8 +194,7 @@ ENTRY(memcpy)
/* we're done! restore everything and return */
1: ldmfd sp!, {r5-r11}
- ldmfd sp!, {r0, r4, lr}
- bx lr
+ ldmfd sp!, {r0, r4, pc}
/********************************************************************/
@@ -385,8 +384,7 @@ ENTRY(memcpy)
/* we're done! restore sp and spilled registers and return */
add sp, sp, #28
- ldmfd sp!, {r0, r4, lr}
- bx lr
+ ldmfd sp!, {r0, r4, pc}
END(memcpy)
// Only reached when the __memcpy_chk check fails.
diff --git a/libc/arch-arm/generic/bionic/memset.S b/libc/arch-arm/generic/bionic/memset.S
index d17a9c4..b8eabbf 100644
--- a/libc/arch-arm/generic/bionic/memset.S
+++ b/libc/arch-arm/generic/bionic/memset.S
@@ -82,8 +82,7 @@ ENTRY(memset)
strbcs r1, [r0], #1
strbmi r1, [r0], #1
subs r2, r2, r3
- popls {r0, r4-r7, lr} /* return */
- bxls lr
+ popls {r0, r4-r7, pc} /* return */
/* align the destination to a cache-line */
mov r12, r1
@@ -126,8 +125,7 @@ ENTRY(memset)
strhmi r1, [r0], #2
movs r2, r2, lsl #2
strbcs r1, [r0]
- ldmfd sp!, {r0, r4-r7, lr}
- bx lr
+ ldmfd sp!, {r0, r4-r7, pc}
END(memset)
.data
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index e49d6d2..016c882 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,5 @@
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
arch-arm/generic/bionic/memcpy.S \
arch-arm/generic/bionic/memset.S \
diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S
index 246f159..1a39c5b 100644
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -40,7 +40,7 @@
ENTRY(__strcat_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
@@ -177,7 +177,7 @@ ENTRY(__strcat_chk)
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
- bhi __strcat_chk_failed
+ bhi .L_strcat_chk_failed
// Set up the registers for the memcpy code.
mov r1, r5
@@ -185,20 +185,17 @@ ENTRY(__strcat_chk)
mov r2, r4
add r0, r0, r3
pop {r4, r5}
-END(__strcat_chk)
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
-#define MEMCPY_BASE __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcat_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
+ // Undo the above cfi directives.
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
-
+.L_strcat_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -208,7 +205,7 @@ error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcat_chk_failed)
+END(__strcat_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S
index db76686..00202f3 100644
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -39,7 +39,7 @@
ENTRY(__strcpy_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
@@ -149,21 +149,14 @@ ENTRY(__strcpy_chk)
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
- bhs __strcpy_chk_failed
+ bhs .L_strcpy_chk_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
-END(__strcpy_chk)
-#define MEMCPY_BASE __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcpy_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
+.L_strcpy_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -173,7 +166,7 @@ error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcpy_chk_failed)
+END(__strcpy_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 9ff46a8..5d27b57 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -45,7 +45,7 @@
ENTRY(__memcpy_chk)
cmp r2, r3
- bhi __memcpy_chk_fail
+ bhi .L_memcpy_chk_fail
// Fall through to memcpy...
END(__memcpy_chk)
@@ -53,19 +53,20 @@ END(__memcpy_chk)
ENTRY(memcpy)
pld [r1, #64]
stmfd sp!, {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
-END(memcpy)
-#define MEMCPY_BASE __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__memcpy_chk_fail)
+ // Undo the cfi directives from above.
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r0
+ .cfi_restore lr
+.L_memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
- .cfi_def_cfa_offset 4
+ .cfi_adjust_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
@@ -77,7 +78,7 @@ error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__memcpy_chk_fail)
+END(memcpy)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 035dcf1..76c5a84 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -1,123 +1,191 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 32 byte
- * cache line.
- */
-
-// Assumes neon instructions and a cache line size of 32 bytes.
-
-ENTRY_PRIVATE(MEMCPY_BASE)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
- /* do we have at least 16-bytes to copy (needed for alignment below) */
- cmp r2, #16
- blo 5f
-
- /* align destination to cache-line for the write-buffer */
- rsb r3, r0, #0
- ands r3, r3, #0xF
- beq 2f
-
- /* copy up to 15-bytes (count in r3) */
- sub r2, r2, r3
- movs ip, r3, lsl #31
- itt mi
- ldrbmi lr, [r1], #1
- strbmi lr, [r0], #1
- itttt cs
- ldrbcs ip, [r1], #1
- ldrbcs lr, [r1], #1
- strbcs ip, [r0], #1
- strbcs lr, [r0], #1
- movs ip, r3, lsl #29
- bge 1f
- // copies 4 bytes, destination 32-bits aligned
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1: bcc 2f
- // copies 8 bytes, destination 64-bits aligned
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0, :64]!
-
-2: /* make sure we have at least 64 bytes to copy */
- subs r2, r2, #64
- blo 2f
-
-1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
- pld [r1, #(32*8)]
- subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0, :128]!
- vst1.8 {d4 - d7}, [r0, :128]!
- bhs 1b
-
-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
- adds r2, r2, #32
- blo 4f
-
- /* Copy 32 bytes. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
- sub r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
-
-4: /* less than 32 left */
- add r2, r2, #32
- tst r2, #0x10
- beq 5f
- // copies 16 bytes, 128-bits aligned
- vld1.8 {d0, d1}, [r1]!
- vst1.8 {d0, d1}, [r0, :128]!
-
-5: /* copy up to 15-bytes (count in r2) */
- movs ip, r2, lsl #29
- bcc 1f
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0]!
-1: bge 2f
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2: movs ip, r2, lsl #31
- itt mi
- ldrbmi r3, [r1], #1
- strbmi r3, [r0], #1
- itttt cs
- ldrbcs ip, [r1], #1
- ldrbcs lr, [r1], #1
- strbcs ip, [r0], #1
- strbcs lr, [r0], #1
-
- ldmfd sp!, {r0, lr}
- bx lr
-END(MEMCPY_BASE)
+/***************************************************************************
+ Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/* Assumes neon instructions and a cache line size of 64 bytes. */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define PLDOFFS (10)
+#define PLDTHRESH (PLDOFFS)
+#define BBTHRESH (4096/64)
+#define PLDSIZE (64)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+ .text
+ .fpu neon
+
+.L_memcpy_base:
+ cmp r2, #4
+ blt .L_neon_lt4
+ cmp r2, #16
+ blt .L_neon_lt16
+ cmp r2, #32
+ blt .L_neon_16
+ cmp r2, #64
+ blt .L_neon_copy_32_a
+
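+ // r12 = number of 64-byte blocks to copy.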
+ mov r12, r2, lsr #6
+ cmp r12, #PLDTHRESH
+ ble .L_neon_copy_64_loop_nopld
+
+ push {r9, r10}
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r9, 0
+ .cfi_rel_offset r10, 4
+
+ cmp r12, #BBTHRESH
+ ble .L_neon_prime_pump
+
+ add lr, r0, #0x400
+ add r9, r1, #(PLDOFFS*PLDSIZE)
+ sub lr, lr, r9
+ lsl lr, lr, #21
+ lsr lr, lr, #21
+ add lr, lr, #(PLDOFFS*PLDSIZE)
+ cmp r12, lr, lsr #6
+ ble .L_neon_prime_pump
+
+ itt gt
+ movgt r9, #(PLDOFFS)
+ rsbsgt r9, r9, lr, lsr #6
+ ble .L_neon_prime_pump
+
+ add r10, r1, lr
+ bic r10, #0x3F
+
+ sub r12, r12, lr, lsr #6
+
+ cmp r9, r12
+ itee le
+ suble r12, r12, r9
+ movgt r9, r12
+ movgt r12, #0
+
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)]
+.L_neon_copy_64_loop_outer_doublepld:
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .L_neon_copy_64_loop_outer_doublepld
+ cmp r12, #0
+ beq .L_neon_pop_before_nopld
+
+ cmp r12, #(512*1024/64)
+ blt .L_neon_copy_64_loop_outer
+
+.L_neon_copy_64_loop_ddr:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ pld [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .L_neon_copy_64_loop_ddr
+ b .L_neon_pop_before_nopld
+
+.L_neon_prime_pump:
+ mov lr, #(PLDOFFS*PLDSIZE)
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F
+ sub r12, r12, #PLDOFFS
+ ldr r3, [r10, #(-1*PLDSIZE)]
+
+.L_neon_copy_64_loop_outer:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .L_neon_copy_64_loop_outer
+
+.L_neon_pop_before_nopld:
+ mov r12, lr, lsr #6
+ pop {r9, r10}
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r9
+ .cfi_restore r10
+
+.L_neon_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .L_neon_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ beq .L_neon_exit
+
+.L_neon_copy_32_a:
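+ // lsl #27: C = the 32s bit of the remaining count, N = the 16s bit.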
+ movs r3, r2, lsl #27
+ bcc .L_neon_16
+ vld1.32 {q0,q1}, [r1]!
+ vst1.32 {q0,q1}, [r0]!
+
+.L_neon_16:
+ bpl .L_neon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f
+ beq .L_neon_exit
+
+.L_neon_lt16:
+ movs r3, r2, lsl #29
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1:
+ bge .L_neon_lt4
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4:
+ movs r2, r2, lsl #31
+ itt cs
+ ldrhcs r3, [r1], #2
+ strhcs r3, [r0], #2
+ itt mi
+ ldrbmi r3, [r1]
+ strbmi r3, [r0]
+
+.L_neon_exit:
+ pop {r0, pc}
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S
new file mode 100644
index 0000000..aea7315
--- /dev/null
+++ b/libc/arch-arm/krait/bionic/memmove.S
@@ -0,0 +1,219 @@
+/***************************************************************************
+ Copyright (c) 2009-2014 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: attempts to do the move with Neon registers if possible.
+ * Inputs:
+ * dest: The destination buffer
+ * src: The source buffer
+ * n: The size of the buffer to transfer
+ * Outputs:
+ *
+ ***************************************************************************/
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+/*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+
+ .text
+ .syntax unified
+ .fpu neon
+ .thumb
+ .thumb_func
+
+//ENTRY(bcopy)
+// //.cfi_startproc
+// mov r12, r0
+// mov r0, r1
+// mov r1, r12
+// // Fall through to memmove
+// //.cfi_endproc
+//END(bcopy)
+
+ENTRY(memmove)
+_memmove_words:
+ //.cfi_startproc
+ .save {r0, lr}
+ cmp r2, #0
+ it ne
+ subsne r12, r0, r1 // Warning: do not combine these "it" blocks
+ it eq
+ bxeq lr
+// Use the backward copy only when r1 < r0 < r1+r2; otherwise memcpy is safe.
+ cmp r0, r1
+ itt ge
+ addge r12, r1, r2
+ cmpge r12, r0
+ it le
+ ble memcpy
+ cmp r2, #4
+ it le
+ ble .Lneon_b2f_smallcopy_loop
+ push {r0, lr}
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #64
+ it ge
+ bge .Lneon_b2f_copy_64
+ cmp r2, #32
+ it ge
+ bge .Lneon_b2f_copy_32
+ cmp r2, #8
+ it ge
+ bge .Lneon_b2f_copy_8
+ b .Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+ mov r12, r2, lsr #6
+ add r0, r0, #32
+ add r1, r1, #32
+ cmp r12, #PLDTHRESH
+ it le
+ ble .Lneon_b2f_copy_64_loop_nopld
+ sub r12, #PLDOFFS
+ sub lr, r1, #(PLDOFFS)*PLDSIZE
+.Lneon_b2f_copy_64_loop_outer:
+ pld [lr]
+ sub r1, r1, #96
+ sub r0, r0, #96
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]
+ sub lr, lr, #64
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]
+ it ne
+ bne .Lneon_b2f_copy_64_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_b2f_copy_64_loop_nopld:
+ sub r1, r1, #96
+ sub r0, r0, #96
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]
+ it ne
+ bne .Lneon_b2f_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ it eq
+ beq .Lneon_memmove_done
+ sub r1, r1, #32
+ sub r0, r0, #32
+ cmp r2, #32
+ it lt
+ blt .Lneon_b2f_copy_8
+.Lneon_b2f_copy_32:
+ sub r1, r1, #32
+ sub r0, r0, #32
+ vld1.32 {q0, q1}, [r1]
+ vst1.32 {q0, q1}, [r0]
+ ands r2, r2, #0x1f
+ it eq
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ it eq
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ it ne
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_1:
+ movs r12, r2, lsl #29
+ itttt mi
+ submi r1, r1, #4
+ submi r0, r0, #4
+ ldrmi r3, [r1]
+ strmi r3, [r0]
+ movs r2, r2, lsl #31
+ itttt cs
+ subcs r1, r1, #2
+ subcs r0, r0, #2
+ ldrhcs r3, [r1]
+ strhcs r3, [r0]
+ itttt mi
+ submi r1, r1, #1
+ submi r0, r0, #1
+ ldrbmi r12, [r1]
+ strbmi r12, [r0]
+.Lneon_memmove_done:
+ pop {r0, pc}
+.Lneon_b2f_smallcopy_loop:
+ // 4 bytes or less
+ add r1, r1, r2
+ add r0, r0, r2
+ movs r12, r2, lsl #29
+ itttt mi
+ submi r1, r1, #4
+ submi r0, r0, #4
+ ldrmi r3, [r1]
+ strmi r3, [r0]
+ movs r2, r2, lsl #31
+ itttt cs
+ subcs r1, r1, #2
+ subcs r0, r0, #2
+ ldrhcs r3, [r1]
+ strhcs r3, [r0]
+ itttt mi
+ submi r1, r1, #1
+ submi r0, r0, #1
+ ldrbmi r12, [r1]
+ strbmi r12, [r0]
+ bx lr
+// .cfi_endproc
+END(memmove)
+
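For reference, the overlap test at the top of this routine corresponds to the
classic memmove dispatch below, shown as a minimal C sketch (names are
illustrative; the real routine copies backwards in 64/32/8/1-byte chunks
rather than a byte loop):

    #include <stddef.h>
    #include <string.h>

    /* Sketch: only a forward overlap (src < dst < src + n) forces a
       backward, end-to-start copy; every other case is safe for memcpy. */
    static void *memmove_sketch(void *dst, const void *src, size_t n) {
        unsigned char *d = dst;
        const unsigned char *s = src;
        if (n == 0 || d == s)
            return dst;
        if (d < s || d >= s + n)
            return memcpy(dst, src, n);
        while (n--)                 /* copy from the tail backwards */
            d[n] = s[n];
        return dst;
    }
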
diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S
index a4fbe17..ae05965 100644
--- a/libc/arch-arm/krait/bionic/memset.S
+++ b/libc/arch-arm/krait/bionic/memset.S
@@ -69,10 +69,7 @@ END(bzero)
/* memset() returns its first argument. */
ENTRY(memset)
- stmfd sp!, {r0}
- .cfi_def_cfa_offset 4
- .cfi_rel_offset r0, 0
-
+ mov r3, r0
vdup.8 q0, r1
/* make sure we have at least 32 bytes to write */
@@ -82,7 +79,7 @@ ENTRY(memset)
1: /* The main loop writes 32 bytes at a time */
subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0]!
+ vst1.8 {d0 - d3}, [r3]!
bhs 1b
2: /* less than 32 left */
@@ -91,18 +88,17 @@ ENTRY(memset)
beq 3f
// writes 16 bytes, 128-bits aligned
- vst1.8 {d0, d1}, [r0]!
+ vst1.8 {d0, d1}, [r3]!
3: /* write up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
- vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r3]!
1: bge 2f
- vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d0[0]}, [r3]!
2: movs ip, r2, lsl #31
- strbmi r1, [r0], #1
- strbcs r1, [r0], #1
- strbcs r1, [r0], #1
- ldmfd sp!, {r0}
+ strbmi r1, [r3], #1
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
bx lr
END(memset)
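
The memset change above drops the stack save/restore of r0 and instead
advances a scratch copy in r3, so the required return value never leaves r0.
In C terms, a sketch:

    #include <stddef.h>

    /* Sketch: write through a working cursor so the original dst can be
       returned unchanged, mirroring the r0 -> r3 copy above. */
    static void *memset_sketch(void *dst, int c, size_t n) {
        unsigned char *p = dst;     /* scratch cursor, like r3 */
        while (n--)
            *p++ = (unsigned char)c;
        return dst;                 /* return value untouched, like r0 */
    }
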
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 88b4d66..5f5b414 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -1,9 +1,19 @@
libc_bionic_src_files_arm += \
- arch-arm/krait/bionic/memcpy.S \
arch-arm/krait/bionic/memset.S \
arch-arm/krait/bionic/strcmp.S \
arch-arm/krait/bionic/__strcat_chk.S \
arch-arm/krait/bionic/__strcpy_chk.S \
+ arch-arm/krait/bionic/memmove.S
+
+# For some targets we don't need this optimization.
+# The corresponding flag is defined in the device-specific folder.
+ifeq ($(TARGET_CPU_MEMCPY_BASE_OPT_DISABLE),true)
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S
+else
+libc_bionic_src_files_arm += \
+ arch-arm/krait/bionic/memcpy.S
+endif
# Use cortex-a15 versions of strcat/strcpy/strlen and standard memmove
libc_bionic_src_files_arm += \
@@ -13,7 +23,7 @@ libc_bionic_src_files_arm += \
arch-arm/cortex-a15/bionic/strlen.S \
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
-libc_bionic_src_files_arm += \
- arch-arm/denver/bionic/memmove.S \
+
diff --git a/libc/arch-arm/scorpion/scorpion.mk b/libc/arch-arm/scorpion/scorpion.mk
new file mode 100644
index 0000000..ce18a7e
--- /dev/null
+++ b/libc/arch-arm/scorpion/scorpion.mk
@@ -0,0 +1,18 @@
+# Use krait versions of memset/strcmp/memmove
+libc_bionic_src_files_arm += \
+ arch-arm/krait/bionic/memset.S \
+ arch-arm/krait/bionic/strcmp.S \
+ arch-arm/krait/bionic/memmove.S
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/__strcat_chk.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+ arch-arm/cortex-a15/bionic/strlen.S
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
+ arch-arm/generic/bionic/memcmp.S
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 470a038..1b8d534 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -8,7 +8,6 @@ libc_bionic_src_files_arm64 += \
bionic/__memset_chk.cpp \
bionic/__strcpy_chk.cpp \
bionic/__strcat_chk.cpp \
- bionic/strrchr.cpp \
libc_freebsd_src_files_arm64 += \
upstream-freebsd/lib/libc/string/wcscat.c \
diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk
index d619c11..3c453bb 100644
--- a/libc/arch-arm64/denver64/denver64.mk
+++ b/libc/arch-arm64/denver64/denver64.mk
@@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \
arch-arm64/generic/bionic/strlen.S \
arch-arm64/generic/bionic/strncmp.S \
arch-arm64/generic/bionic/strnlen.S \
+ arch-arm64/generic/bionic/strrchr.S \
arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index c5d42ce..f850624 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,158 +22,196 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses.
*
*/
+#include <private/bionic_asm.h>
+
#define dstin x0
#define src x1
#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
-
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
-
- mov dst, dstin
- cmp count, #64
- b.ge .Lcpy_not_short
- cmp count, #15
- b.le .Ltail15tiny
-
- /* Deal with small copies quickly by dropping straight into the
- * exit block. */
-.Ltail63:
- /* Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-
-.Ltail15:
- ands count, count, #15
- beq 1f
- add src, src, count
- ldp A_l, A_h, [src, #-16]
- add dst, dst, count
- stp A_l, A_h, [dst, #-16]
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x9
+
+#define L(l) .L ## l
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes, which are fully unrolled, and large
+   copies of more than 96 bytes, which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+   Small and medium copies read all data before writing, allowing any
+   kind of overlap, and memmove tailcalls memcpy for these cases as
+   well as for non-overlapping copies.
+*/
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
-.Ltail15tiny:
- /* Copy up to 15 bytes of data. Does not assume additional data
- being copied. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
+ .p2align 4
+
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
ret
-.Lcpy_not_short:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Copy more data than needed; it's faster than jumping
- * around copying sub-Quadword quantities. We know that
- * it can't overrun. */
- ldp A_l, A_h, [src]
- add src, src, tmp2
- stp A_l, A_h, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63
-2:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /* Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne .Ltail63
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
ret
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lcpy_body_large:
- /* There are at least 128 bytes to copy. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls 2f
1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne .Ltail63
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
ret
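
The medium-size path above loads every byte before storing any, taking the
trailing chunks relative to the end of the buffer; that is what makes these
copies overlap-safe and lets memmove tail-call memcpy for them. A C sketch of
the pattern for a 16..32 byte copy (names are illustrative):

    #include <stddef.h>
    #include <string.h>

    /* Sketch: copy 16..32 bytes as two possibly-overlapping 16-byte
       chunks, one anchored at each end of the buffer, loading both
       before storing either (mirrors the ldp/ldp ... stp/stp pattern
       above). */
    static void copy16_32(unsigned char *dst, const unsigned char *src,
                          size_t n) {
        unsigned char head[16], tail[16];
        memcpy(head, src, 16);            /* first 16 bytes */
        memcpy(tail, src + n - 16, 16);   /* last 16 bytes, may overlap */
        memcpy(dst, head, 16);
        memcpy(dst + n - 16, tail, 16);
    }
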
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 8b366a3..c50112d 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,319 +22,131 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
- * wchar_t is 4 bytes
+ * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
*/
#include <private/bionic_asm.h>
/* Parameters and result. */
-#ifdef BCOPY
-#define origdstin x1
-#define origsrc x0
-#endif
#define dstin x0
#define src x1
#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
-
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
+#define srcend x3
+#define dstend x4
+#define tmp1 x5
+#define A_l x6
+#define A_h x7
+#define B_l x8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l count
+#define E_h tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+   Larger backwards copies are also handled by memcpy. The only remaining
+   case is large forward-overlapping copies, which are done backwards from
+   the end. The destination is aligned, and an unrolled loop processes
+   64 bytes per iteration.
+*/
-#ifdef BCOPY
-ENTRY(bcopy)
- /* Swap src and dst so that a branch to memcpy doesn't cause issues. */
- mov tmp1, origsrc
- mov origsrc, origdstin
- mov origdstin, tmp1
-#elif defined(WMEMMOVE)
+#if defined(WMEMMOVE)
ENTRY(wmemmove)
lsl count, count, #2
#else
ENTRY(memmove)
#endif
- cmp dstin, src
- b.lo .Ldownwards
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs memcpy /* No overlap. */
-
- /* Upwards move with potential overlap.
- * Need to move from the tail backwards. SRC and DST point one
- * byte beyond the remaining data to move. */
- add dst, dstin, count
- add src, src, count
- cmp count, #64
- b.ge .Lmov_not_short_up
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-.Ltail63up:
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15up
- sub dst, dst, tmp1
- sub src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #32]
- stp A_l, A_h, [dst, #32]
-1:
- ldp A_l, A_h, [src, #16]
- stp A_l, A_h, [dst, #16]
-2:
- ldp A_l, A_h, [src]
- stp A_l, A_h, [dst]
-.Ltail15up:
- /* Move up to 15 bytes of data. Does not assume additional data
- * being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
-1:
- ret
-
-.Lmov_not_short_up:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63up
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.hs memcpy
+
+ cbz tmp1, 3f
+ add dstend, dstin, count
+ add srcend, src, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
2:
- subs count, count, #128
- b.ge .Lmov_body_large_up
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src, #-64]!
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst, #-64]!
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- b.ne .Ltail63up
- ret
-
- /* Critical loop. Start at a new Icache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lmov_body_large_up:
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
-1:
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
- tst count, #0x3f
- b.ne .Ltail63up
- ret
-
-
-.Ldownwards:
- /* For a downwards move we can safely use memcpy provided that
- * DST is more than 16 bytes away from SRC. */
- sub tmp1, src, #16
- cmp dstin, tmp1
- b.ls memcpy /* May overlap, but not critically. */
-
- mov dst, dstin /* Preserve DSTIN for return value. */
- cmp count, #64
- b.ge .Lmov_not_short_down
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-.Ltail63down:
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15down
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-.Ltail15down:
- /* Move up to 15 bytes of data. Does not assume additional data
- being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
-1:
- ret
-
-.Lmov_not_short_down:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src], #1
- strb tmp1w, [dst], #1
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63down
-2:
- subs count, count, #128
- b.ge .Lmov_body_large_down
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne .Ltail63down
- ret
-
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lmov_body_large_down:
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
-1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne .Ltail63down
- ret
-#ifdef BCOPY
-END(bcopy)
-#elif defined(WMEMMOVE)
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
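
The three-instruction entry above (sub/ccmp/b.hs) packs the whole dispatch
into one flag chain. Its meaning is easiest to see in C; the key point is
that an unsigned pointer difference below the length detects exactly the
forward-overlap case. A sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the dispatch above: (uintptr_t)(dst - src) < n holds
       exactly when dst lies inside [src, src + n), i.e. a forward
       overlap that a front-to-back memcpy would clobber. Small moves
       (<= 96 bytes) always go to memcpy, which reads before writing. */
    static int needs_backward_copy(const char *dst, const char *src,
                                   size_t n) {
        return n > 96 && (uintptr_t)(dst - src) < n;
    }
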
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 7c204b4..4b3b17b 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,226 +22,207 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
*
*/
#include <private/bionic_asm.h>
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use).
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
-#define dstin x0
-#ifdef BZERO
-#define count x1
-#else
-#define count x2
-#endif
-#define val w1
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define zva_len_x x5
-#define zva_len w5
-#define zva_bits_x x6
-
-#define A_l x7
-#define A_lw w7
-#define dst x8
-#define tmp3w w9
-
-#ifdef BZERO
-ENTRY(bzero)
-#else
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
ENTRY(memset)
-#endif
-
- mov dst, dstin /* Preserve return value. */
-#ifdef BZERO
- b .Lzero_mem
-#endif
- ands A_lw, val, #255
- b.eq .Lzero_mem
- orr A_lw, A_lw, A_lw, lsl #8
- orr A_lw, A_lw, A_lw, lsl #16
- orr A_l, A_l, A_l, lsl #32
-.Ltail_maybe_long:
- cmp count, #64
- b.ge .Lnot_short
-.Ltail_maybe_tiny:
- cmp count, #15
- b.le .Ltail15tiny
-.Ltail63:
- ands tmp1, count, #0x30
- b.eq .Ltail15
- add dst, dst, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- stp A_l, A_l, [dst, #-48]
-1:
- stp A_l, A_l, [dst, #-32]
-2:
- stp A_l, A_l, [dst, #-16]
-
-.Ltail15:
- and count, count, #15
- add dst, dst, count
- stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
- ret
-.Ltail15tiny:
- /* Set up to 15 bytes. Does not assume earlier memory
- being set. */
- tbz count, #3, 1f
- str A_l, [dst], #8
-1:
- tbz count, #2, 1f
- str A_lw, [dst], #4
-1:
- tbz count, #1, 1f
- strh A_lw, [dst], #2
-1:
- tbz count, #0, 1f
- strb A_lw, [dst]
-1:
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+	/* Set 16..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
ret
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line, this ensures the entire loop is in one line. */
- .p2align 6
-.Lnot_short:
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 2f
- /* Bring DST to 128-bit (16-byte) alignment. We know that there's
- * more than that to set, so we simply store 16 bytes and advance by
- * the amount required to reach alignment. */
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63
-2:
- sub dst, dst, #16 /* Pre-bias. */
- sub count, count, #64
-1:
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- stp A_l, A_l, [dst, #48]
- stp A_l, A_l, [dst, #64]!
- subs count, count, #64
- b.ge 1b
- tst count, #0x3f
- add dst, dst, #16
- b.ne .Ltail63
+ .p2align 3
+ nop
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
- /* For zeroing memory, check to see if we can use the ZVA feature to
- * zero entire 'cache' lines. */
-.Lzero_mem:
- mov A_l, #0
- cmp count, #63
- b.le .Ltail_maybe_tiny
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 1f
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- cmp count, #63
- b.le .Ltail63
-1:
- /* For zeroing small amounts of memory, it's not worth setting up
- * the line-clear code. */
- cmp count, #128
- b.lt .Lnot_short
-#ifdef MAYBE_VIRT
- /* For efficiency when virtualized, we cache the ZVA capability. */
- adrp tmp2, .Lcache_clear
- ldr zva_len, [tmp2, #:lo12:.Lcache_clear]
- tbnz zva_len, #31, .Lnot_short
- cbnz zva_len, .Lzero_by_line
+ .p2align 3
+L(try_zva):
mrs tmp1, dczid_el0
- tbz tmp1, #4, 1f
- /* ZVA not available. Remember this for next time. */
- mov zva_len, #~0
- str zva_len, [tmp2, #:lo12:.Lcache_clear]
- b .Lnot_short
-1:
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
- str zva_len, [tmp2, #:lo12:.Lcache_clear]
-#else
- mrs tmp1, dczid_el0
- tbnz tmp1, #4, .Lnot_short
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
-#endif
-
-.Lzero_by_line:
- /* Compute how far we need to go to become suitably aligned. We're
- * already at quad-word alignment. */
- cmp count, zva_len_x
- b.lt .Lnot_short /* Not enough to reach alignment. */
- sub zva_bits_x, zva_len_x, #1
- neg tmp2, dst
- ands tmp2, tmp2, zva_bits_x
- b.eq 1f /* Already aligned. */
- /* Not aligned, check that there's enough to copy after alignment. */
- sub tmp1, count, tmp2
- cmp tmp1, #64
- ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
- b.lt .Lnot_short
- /* We know that there's at least 64 bytes to zero and that it's safe
- * to overrun by 64 bytes. */
- mov count, tmp1
-2:
- stp A_l, A_l, [dst]
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- subs tmp2, tmp2, #64
- stp A_l, A_l, [dst, #48]
- add dst, dst, #64
- b.ge 2b
- /* We've overrun a bit, so adjust dst downwards. */
- add dst, dst, tmp2
-1:
- sub count, count, zva_len_x
-3:
- dc zva, dst
- add dst, dst, zva_len_x
- subs count, count, zva_len_x
- b.ge 3b
- ands count, count, zva_bits_x
- b.ne .Ltail_maybe_long
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
-#ifdef BZERO
-END(bzero)
-#else
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
+
END(memset)
-#endif
-
-#ifdef MAYBE_VIRT
- .bss
- .p2align 2
-.Lcache_clear:
- .space 4
-#endif
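
The try_zva path above keys off DCZID_EL0: bit 4 set means DC ZVA is
prohibited, and the low four bits give the zero block size as a power of two
in 4-byte words, so 4 means 64 bytes and 5 means 128. A sketch of that
decoding (the register itself is read with mrs, which C cannot express
portably):

    #include <stdint.h>

    /* Sketch: decode DCZID_EL0 the way the code above does.
       Returns 0 if DC ZVA must not be used. */
    static unsigned zva_block_bytes(uint64_t dczid_el0) {
        if (dczid_el0 & (1u << 4))         /* DZP bit: ZVA prohibited */
            return 0;
        return 4u << (dczid_el0 & 15);     /* BS field: 4 -> 64, 5 -> 128 */
    }
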
diff --git a/libc/arch-arm64/generic/bionic/strlen.S b/libc/arch-arm64/generic/bionic/strlen.S
index 3bd9809..6e540fc 100644
--- a/libc/arch-arm64/generic/bionic/strlen.S
+++ b/libc/arch-arm64/generic/bionic/strlen.S
@@ -1,16 +1,16 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013-2015, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
+ notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -22,16 +22,19 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
#include <private/bionic_asm.h>
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
/* Arguments and results. */
#define srcin x0
#define len x0
@@ -40,87 +43,185 @@
#define src x1
#define data1 x2
#define data2 x3
-#define data2a x4
-#define has_nul1 x5
-#define has_nul2 x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define tmp4 x10
-#define zeroones x11
-#define pos x12
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define L(l) .L ## l
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
- /* Start of critial section -- keep to one 64Byte cache line. */
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 16 bytes, we align src so we don't need
+   further page cross checks, and process 32 bytes per iteration
+   using the fast NUL check. If we encounter non-ASCII characters,
+   we fall back to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
ENTRY(strlen)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+      Since we expect strings to be small and to exit early,
+      byte-swap the data now so has_nul1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+ orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lloop
- /* End of critical section -- keep to one 64Byte cache line. */
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
- sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
- mov data2, data1
-#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+L(page_cross_entry):
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
+ string is 0x01) means we cannot use has_nul1/2 directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
+ sub len, src, srcin
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
ret
-.Lmisaligned:
- cmp tmp1, #8
- neg tmp1, tmp1
- ldp data1, data2, [src], #16
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- mov tmp2, #~0
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#endif
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
END(strlen)
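
Both NUL tests described in the comments above are easy to state in C. The
exact form is non-zero iff some byte of the word is zero; the fast form saves
one operation per word but can fire on bytes 0x81..0xff, which is why the
code switches loops when it sees non-ASCII data. A sketch:

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_80 0x8080808080808080ULL

    /* Exact test: non-zero iff some byte of x is zero. */
    static uint64_t has_zero_byte(uint64_t x) {
        return (x - REP8_01) & ~x & REP8_80;
    }

    /* Fast test used in the main loop above: zero for non-NUL ASCII
       bytes, but may give false hits for bytes 0x81..0xff. */
    static uint64_t has_zero_byte_fast(uint64_t x) {
        return (x - REP8_01) & REP8_80;
    }
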
diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S
new file mode 100644
index 0000000..46b5031
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strrchr.S
@@ -0,0 +1,171 @@
+/*
+ strrchr - find last instance of a character in a string
+
+ Copyright (c) 2014, ARM Limited
+ All rights Reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+   off bit 0 for the special case of looking for NUL). Since the bits
+   in the syndrome reflect exactly the order in which things occur
+   in the original string, a count_trailing_zeros() operation will
+   identify exactly which byte caused the termination, and why. */
+
+/* Locals and temporaries. */
+
+ENTRY(strrchr)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq .Laligned
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vhas_nul1.2d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ mov chr_match, vhas_chr1.2d[0]
+ lsr tmp3, const_m1, tmp1
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, .Ltail
+
+.Lloop:
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+.Laligned:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.2d[0]
+ mov chr_match, vhas_chr1.2d[0]
+ cbz nul_match, .Lloop
+
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.2d[0]
+
+.Ltail:
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END(strrchr)
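
Stripped of the syndrome machinery, the control flow above is the classic
single-pass strrchr: remember the most recent match and stop at the
terminator. A scalar C sketch (the NEON code does the same per 32-byte hunk,
using clz on the syndrome to pick the last matching byte):

    #include <stddef.h>

    /* Sketch: single forward pass, tracking the last match; also covers
       the strrchr(s, '\0') case, which must return the terminator. */
    static char *strrchr_sketch(const char *s, int c) {
        const char *last = NULL;
        do {
            if (*s == (char)c)
                last = s;
        } while (*s++ != '\0');
        return (char *)last;
    }
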
diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk
index 1b595aa..4512dc5 100644
--- a/libc/arch-arm64/generic/generic.mk
+++ b/libc/arch-arm64/generic/generic.mk
@@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \
arch-arm64/generic/bionic/strlen.S \
arch-arm64/generic/bionic/strncmp.S \
arch-arm64/generic/bionic/strnlen.S \
+ arch-arm64/generic/bionic/strrchr.S \
arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-arm64/kryo/bionic/memcpy.S b/libc/arch-arm64/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..87e1b3b
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ENTRY(__memcpy_chk)
+ cmp x2, x3
+ b.hi __memcpy_chk_fail
+
+ // Fall through to memcpy...
+ b memcpy
+END(__memcpy_chk)
+
+ .align 6
+ENTRY(memcpy)
+ #include "memcpy_base.S"
+END(memcpy)
+
+ENTRY_PRIVATE(__memcpy_chk_fail)
+ // Preserve for accurate backtrace.
+ stp x29, x30, [sp, -16]!
+ .cfi_def_cfa_offset 16
+ .cfi_rel_offset x29, 0
+ .cfi_rel_offset x30, 8
+
+ adrp x0, error_string
+ add x0, x0, :lo12:error_string
+ ldr x1, error_code
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+END(__memcpy_chk_fail)
+
+ .data
+ .align 2
+error_string:
+ .string "memcpy: prevented write past end of buffer"
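
In C terms the fortify wrapper above amounts to the following sketch. The
wrapper name and dst_len parameter are illustrative; __fortify_chk_fail is the
bionic diagnostic the code actually branches to, with its signature
approximated from the call site.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    extern void __fortify_chk_fail(const char* msg, uint32_t event)
        __attribute__((noreturn));

    /* dst_len is the destination size the compiler passes in x3. */
    void* checked_memcpy(void* dst, const void* src, size_t count, size_t dst_len) {
        if (count > dst_len) {                    /* cmp x2, x3; b.hi fail */
            __fortify_chk_fail("memcpy: prevented write past end of buffer",
                               0 /* stands in for BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW */);
        }
        return memcpy(dst, src, count);           /* fall through to memcpy */
    }
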
diff --git a/libc/arch-arm64/kryo/bionic/memcpy_base.S b/libc/arch-arm64/kryo/bionic/memcpy_base.S
new file mode 100644
index 0000000..0096bb7
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy_base.S
@@ -0,0 +1,244 @@
+/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of The Linux Foundation nor the names of its contributors may
+ * be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef PLDOFFS
+#undef PLDOFFS
+#endif
+#define PLDOFFS (16)
+
+#ifdef PLDTHRESH
+#undef PLDTHRESH
+#endif
+#define PLDTHRESH (PLDOFFS)
+
+#ifdef BBTHRESH
+#undef BBTHRESH
+#endif
+#define BBTHRESH (2048/128)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+#ifdef PLDSIZE
+#undef PLDSIZE
+#endif
+#define PLDSIZE (128)
+
+kryo_bb_memcpy:
+ mov x11, x0
+ cmp x2, #4
+ blo kryo_bb_lt4
+ cmp x2, #16
+ blo kryo_bb_lt16
+ cmp x2, #32
+ blo kryo_bb_16
+ cmp x2, #64
+ blo kryo_bb_copy_32_a
+ cmp x2, #128
+ blo kryo_bb_copy_64_a
+
+	// At least 128 bytes remain; reaching 128-byte source alignment consumes at most 127.
+ neg x3, x1 // calculate count to get SOURCE aligned
+ ands x3, x3, #0x7F
+ b.eq kryo_bb_source_aligned // already aligned
+ // alignment fixup, small to large (favorable alignment)
+ tbz x3, #0, 1f
+ ldrb w5, [x1], #1
+ strb w5, [x0], #1
+1: tbz x3, #1, 2f
+ ldrh w6, [x1], #2
+ strh w6, [x0], #2
+2: tbz x3, #2, 3f
+ ldr w8, [x1], #4
+ str w8, [x0], #4
+3: tbz x3, #3, 4f
+ ldr x9, [x1], #8
+ str x9, [x0], #8
+4: tbz x3, #4, 5f
+ ldr q7, [x1], #16
+ str q7, [x0], #16
+5: tbz x3, #5, 55f
+ ldp q0, q1, [x1], #32
+ stp q0, q1, [x0], #32
+55: tbz x3, #6, 6f
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+6: subs x2, x2, x3 // fixup count after alignment
+ b.eq kryo_bb_exit
+ cmp x2, #128
+ blo kryo_bb_copy_64_a
+kryo_bb_source_aligned:
+ lsr x12, x2, #7
+ cmp x12, #PLDTHRESH
+ bls kryo_bb_copy_128_loop_nopld
+
+ cmp x12, #BBTHRESH
+ bls kryo_bb_prime_pump
+
+ add x14, x0, #0x400
+ add x9, x1, #(PLDOFFS*PLDSIZE)
+ sub x14, x14, x9
+ lsl x14, x14, #(21+32)
+ lsr x14, x14, #(21+32)
+ add x14, x14, #(PLDOFFS*PLDSIZE)
+ cmp x12, x14, lsr #7
+ bls kryo_bb_prime_pump
+
+ mov x9, #(PLDOFFS)
+ lsr x13, x14, #7
+ subs x9, x13, x9
+ bls kryo_bb_prime_pump
+
+ add x10, x1, x14
+ bic x10, x10, #0x7F // Round to multiple of PLDSIZE
+
+ sub x12, x12, x14, lsr #7
+ cmp x9, x12
+ sub x13, x12, x9
+ csel x12, x13, x12, LS
+ csel x9, x12, x9, HI
+ csel x12, xzr, x12, HI
+
+ prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
+ prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
+kryo_bb_copy_128_loop_outer_doublepld:
+ prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
+ prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
+ subs x9, x9, #1
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ prfm PLDL1KEEP, [x10]
+ prfm PLDL1KEEP, [x10, #64]
+ add x10, x10, #128
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_outer_doublepld
+ cmp x12, #0
+ beq kryo_bb_pop_before_nopld
+ cmp x12, #(448*1024/128)
+ bls kryo_bb_copy_128_loop_outer
+
+kryo_bb_copy_128_loop_ddr:
+ subs x12, x12, #1
+ ldr x3, [x10], #128
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_ddr
+ b kryo_bb_pop_before_nopld
+
+kryo_bb_prime_pump:
+ mov x14, #(PLDOFFS*PLDSIZE)
+ add x10, x1, #(PLDOFFS*PLDSIZE)
+ bic x10, x10, #0x7F
+ sub x12, x12, #PLDOFFS
+ prfm PLDL1KEEP, [x10, #(-1*PLDSIZE)]
+ prfm PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
+ cmp x12, #(448*1024/128)
+ bhi kryo_bb_copy_128_loop_ddr
+
+kryo_bb_copy_128_loop_outer:
+ subs x12, x12, #1
+ prfm PLDL1KEEP, [x10]
+ prfm PLDL1KEEP, [x10, #64]
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ add x10, x10, #128
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_outer
+
+kryo_bb_pop_before_nopld:
+ lsr x12, x14, #7
+kryo_bb_copy_128_loop_nopld:
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ subs x12, x12, #1
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_nopld
+ ands x2, x2, #0x7f
+ beq kryo_bb_exit
+
+kryo_bb_copy_64_a:
+ tbz x2, #6, kryo_bb_copy_32_a
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+kryo_bb_copy_32_a:
+ tbz x2, #5, kryo_bb_16
+ ldp q0, q1, [x1], #32
+ stp q0, q1, [x0], #32
+kryo_bb_16:
+ tbz x2, #4, kryo_bb_lt16
+ ldr q7, [x1], #16
+ str q7, [x0], #16
+ ands x2, x2, #0x0f
+ beq kryo_bb_exit
+kryo_bb_lt16:
+ tbz x2, #3, kryo_bb_lt8
+ ldr x3, [x1], #8
+ str x3, [x0], #8
+kryo_bb_lt8:
+ tbz x2, #2, kryo_bb_lt4
+ ldr w3, [x1], #4
+ str w3, [x0], #4
+kryo_bb_lt4:
+ tbz x2, #1, kryo_bb_lt2
+ ldrh w3, [x1], #2
+ strh w3, [x0], #2
+kryo_bb_lt2:
+ tbz x2, #0, kryo_bb_exit
+ ldrb w3, [x1], #1
+ strb w3, [x0], #1
+kryo_bb_exit:
+ mov x0, x11
+ ret
+
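
The sub-128-byte tail (kryo_bb_copy_64_a down to kryo_bb_lt2) is a bit-test
cascade: each set bit of the remaining count triggers exactly one fixed-size
copy, so the tail finishes in at most seven copies with no loop. A minimal C
sketch of the same idea (illustrative only):

    #include <stddef.h>
    #include <string.h>

    /* n < 128; each (n & chunk) test corresponds to one tbz in the assembly. */
    static void tail_copy(char* dst, const char* src, size_t n) {
        for (size_t chunk = 64; chunk != 0; chunk >>= 1) {
            if (n & chunk) {
                memcpy(dst, src, chunk);
                dst += chunk;
                src += chunk;
            }
        }
    }
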
diff --git a/libc/arch-arm64/kryo/kryo.mk b/libc/arch-arm64/kryo/kryo.mk
new file mode 100644
index 0000000..1d901d0
--- /dev/null
+++ b/libc/arch-arm64/kryo/kryo.mk
@@ -0,0 +1,15 @@
+libc_bionic_src_files_arm64 += \
+ arch-arm64/generic/bionic/memchr.S \
+ arch-arm64/generic/bionic/memcmp.S \
+ arch-arm64/kryo/bionic/memcpy.S \
+ arch-arm64/generic/bionic/memmove.S \
+ arch-arm64/generic/bionic/memset.S \
+ arch-arm64/generic/bionic/stpcpy.S \
+ arch-arm64/generic/bionic/strchr.S \
+ arch-arm64/generic/bionic/strcmp.S \
+ arch-arm64/generic/bionic/strcpy.S \
+ arch-arm64/generic/bionic/strlen.S \
+ arch-arm64/generic/bionic/strncmp.S \
+ arch-arm64/generic/bionic/strnlen.S \
+ arch-arm64/generic/bionic/strrchr.S \
+ arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 0dbffad..6a5afd6 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -91,9 +91,6 @@ name: \
.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
ENTRANCE
-#ifdef USE_AS_BCOPY
- xchg %rsi, %rdi
-#endif
mov %rdi, %rax
/* Check whether we should copy backward or forward. */
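
The deleted xchg compensated for bcopy's reversed argument order; with bcopy no
longer assembled from this file, the swap is dead code. For reference, the
relationship it encoded (an illustrative definition, not bionic's):

    #include <string.h>

    /* bcopy takes (src, dst, n); memmove takes (dst, src, n). */
    void my_bcopy(const void* src, void* dst, size_t n) {
        memmove(dst, src, n);
    }
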
diff --git a/libc/bionic/malloc_debug_check.cpp b/libc/bionic/malloc_debug_check.cpp
index dee03fa..ad0e613 100644
--- a/libc/bionic/malloc_debug_check.cpp
+++ b/libc/bionic/malloc_debug_check.cpp
@@ -45,6 +45,7 @@
#include <time.h>
#include <unistd.h>
#include <unwind.h>
+#include <signal.h>
#include "debug_mapinfo.h"
#include "debug_stacktrace.h"
@@ -55,6 +56,14 @@
#include "private/libc_logging.h"
#include "private/ScopedPthreadMutexLocker.h"
+static unsigned int malloc_sig_enabled = 0;
+static unsigned int min_allocation_report_limit;
+static unsigned int max_allocation_limit;
+static const char* process_name;
+static size_t total_count = 0;
+static bool isDumped = false;
+static bool sigHandled = false;
+
#define MAX_BACKTRACE_DEPTH 16
#define ALLOCATION_TAG 0x1ee7d00d
#define BACKLOG_TAG 0xbabecafe
@@ -63,6 +72,11 @@
#define FRONT_GUARD_LEN (1<<5)
#define REAR_GUARD 0xbb
#define REAR_GUARD_LEN (1<<5)
+#define FRONT_GUARD_SS 0xab
+#define DEBUG_SIGNAL SIGWINCH
+
+static void malloc_sigaction(int signum, siginfo_t * sg, void * cxt);
+static struct sigaction default_sa;
static void log_message(const char* format, ...) {
va_list args;
@@ -135,9 +149,14 @@ static inline void init_front_guard(hdr_t* hdr) {
memset(hdr->front_guard, FRONT_GUARD, FRONT_GUARD_LEN);
}
+static inline void set_snapshot(hdr_t* hdr) {
+ memset(hdr->front_guard, FRONT_GUARD_SS, FRONT_GUARD_LEN);
+}
+
static inline bool is_front_guard_valid(hdr_t* hdr) {
for (size_t i = 0; i < FRONT_GUARD_LEN; i++) {
- if (hdr->front_guard[i] != FRONT_GUARD) {
+ if (!((hdr->front_guard[i] == FRONT_GUARD) ||
+ (hdr->front_guard[i] == FRONT_GUARD_SS))) {
return false;
}
}
@@ -171,6 +190,9 @@ static inline bool is_rear_guard_valid(hdr_t* hdr) {
}
static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+ if (hdr->tag == ALLOCATION_TAG) {
+ total_count += hdr->size;
+ }
hdr->prev = NULL;
hdr->next = *head;
if (*head)
@@ -181,6 +203,9 @@ static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
}
static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+ if (hdr->tag == ALLOCATION_TAG) {
+ total_count -= hdr->size;
+ }
if (hdr->prev) {
hdr->prev->next = hdr->next;
} else {
@@ -194,6 +219,25 @@ static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
return 0;
}
+static void snapshot_report_leaked_nodes() {
+ log_message("%s: %s\n", __FILE__, __FUNCTION__);
+ hdr_t * iterator = head;
+ size_t total_size = 0;
+ do {
+ if (iterator->front_guard[0] == FRONT_GUARD &&
+ iterator->size >= min_allocation_report_limit) {
+      log_message("obj %p, size %zu", iterator, iterator->size);
+      total_size += iterator->size;
+      log_backtrace(iterator->bt, iterator->bt_depth);
+      log_message("------------------------------"); // end-of-record marker
+      // Mark the node so it is not reported again.
+      set_snapshot(iterator);
+    }
+    iterator = iterator->next;
+  } while (iterator);
+  log_message("Total pending allocations since last snapshot: %zu", total_size);
+}
+
static inline void add(hdr_t* hdr, size_t size) {
ScopedPthreadMutexLocker locker(&lock);
hdr->tag = ALLOCATION_TAG;
@@ -202,6 +246,11 @@ static inline void add(hdr_t* hdr, size_t size) {
init_rear_guard(hdr);
++g_allocated_block_count;
add_locked(hdr, &tail, &head);
+ if ((total_count >= max_allocation_limit) && !isDumped && malloc_sig_enabled) {
+ isDumped = true;
+    sigHandled = true; // Skip the snapshot phase so this dump reports immediately.
+ kill(getpid(), DEBUG_SIGNAL);
+ }
}
static inline int del(hdr_t* hdr) {
@@ -233,7 +282,8 @@ static bool was_used_after_free(hdr_t* hdr) {
static inline int check_guards(hdr_t* hdr, int* safe) {
*safe = 1;
if (!is_front_guard_valid(hdr)) {
- if (hdr->front_guard[0] == FRONT_GUARD) {
+ if ((hdr->front_guard[0] == FRONT_GUARD) ||
+ ((hdr->front_guard[0] == FRONT_GUARD_SS))) {
log_message("+++ ALLOCATION %p SIZE %d HAS A CORRUPTED FRONT GUARD\n",
user(hdr), hdr->size);
} else {
@@ -656,6 +706,42 @@ extern "C" bool malloc_debug_initialize(HashTable* hash_table, const MallocDebug
__libc_format_log(ANDROID_LOG_INFO, "libc", "not gathering backtrace information\n");
}
+ if (__system_property_get("libc.debug.malloc", env)) {
+    if (atoi(env) == 40) malloc_sig_enabled = 1;
+ }
+
+ if (malloc_sig_enabled) {
+ char debug_proc_size[PROP_VALUE_MAX];
+ if (__system_property_get("libc.debug.malloc.maxprocsize", debug_proc_size))
+ max_allocation_limit = atoi(debug_proc_size);
+ else
+      max_allocation_limit = 30 * 1024 * 1024; // in bytes (default: 30 MB)
+    if (__system_property_get("libc.debug.malloc.minalloclim", debug_proc_size))
+      min_allocation_report_limit = atoi(debug_proc_size);
+    else
+      min_allocation_report_limit = 10 * 1024; // in bytes (default: 10 KB)
+ process_name = getprogname();
+ }
+
+  /* Register the snapshot signal handler (see the comments on
+   * MallocDebugInit in malloc_debug_common.h). */
+  if (malloc_sig_enabled) {
+    struct sigaction sa; // Stack-local is fine: sigaction(2) copies it.
+ sa.sa_handler = NULL;
+ sa.sa_sigaction = malloc_sigaction;
+ sigemptyset(&sa.sa_mask);
+ sigaddset(&sa.sa_mask, DEBUG_SIGNAL);
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_restorer = NULL;
+ if (sigaction(DEBUG_SIGNAL, &sa, &default_sa) < 0) {
+      log_message("Failed to register signal handler: %s", strerror(errno));
+ malloc_sig_enabled = 0;
+ } else {
+ log_message("Registered signal handler");
+ sigHandled = false;
+ }
+ }
if (g_backtrace_enabled) {
backtrace_startup();
}
@@ -668,9 +754,66 @@ extern "C" void malloc_debug_finalize(int malloc_debug_level) {
if (malloc_debug_level == 10) {
ReportMemoryLeaks();
}
+ if (malloc_sig_enabled) {
+    log_message("Deregistering signal handler for signal %d", DEBUG_SIGNAL);
+ sigaction(DEBUG_SIGNAL, &default_sa, NULL);
+ malloc_sig_enabled = 0;
+ sigHandled = false;
+ }
if (g_backtrace_enabled) {
backtrace_shutdown();
}
pthread_setspecific(g_debug_calls_disabled, NULL);
}
+
+static void snapshot_nodes_locked() {
+ log_message("%s: %s\n", __FILE__, __FUNCTION__);
+ hdr_t * iterator = head;
+ do {
+ if (iterator->front_guard[0] == FRONT_GUARD) {
+ set_snapshot(iterator);
+ }
+ iterator = iterator->next;
+ } while (iterator);
+}
+
+static void malloc_sigaction(int signum, siginfo_t * info, void * context)
+{
+ log_message("%s: %s\n", __FILE__, __FUNCTION__);
+  log_message("%s got %d signal from PID: %d (context: %p)\n",
+              __func__, signum, info->si_pid, context);
+
+ if (signum != DEBUG_SIGNAL) {
+ log_message("RECEIVED %d instead of %d\n", signum, DEBUG_SIGNAL);
+ return;
+ }
+
+  log_message("Process under observation: %s", process_name);
+  log_message("Maximum process size limit: %u bytes", max_allocation_limit);
+  log_message("Not reporting allocations below %u bytes", min_allocation_report_limit);
+  log_message("Total count: %zu\n", total_count);
+
+ if (!head) {
+ log_message("No allocations?");
+ return;
+ }
+  // If sigHandled is false, this is the first time the signal has been handled.
+  if (!sigHandled) {
+    sigHandled = true;
+    // Snapshot the existing nodes; they are assumed not to be leaks.
+    snapshot_nodes_locked();
+  } else {
+    // Report only the allocations made since the snapshot.
+ log_message("Start dumping allocations of the process %s", process_name);
+ log_message("+++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ ***\n");
+
+ // Print allocations of the process
+ if (g_backtrace_enabled)
+ snapshot_report_leaked_nodes();
+
+ log_message("*** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++\n");
+ log_message("Completed dumping allocations of the process %s", process_name);
+ }
+ return;
+}
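
Operationally this is a two-phase protocol: the first DEBUG_SIGNAL snapshots
every live allocation, and each later signal reports only allocations made
since the snapshot that are at or above minalloclim. A hedged usage sketch from
a controlling process (assumes libc.debug.malloc is already 40 in the target):

    #include <signal.h>
    #include <unistd.h>

    void request_leak_report(pid_t target) {
        kill(target, SIGWINCH);   /* phase 1: snapshot current allocations */
        sleep(30);                /* let the suspected leak accumulate */
        kill(target, SIGWINCH);   /* phase 2: dump allocations since snapshot */
    }
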
diff --git a/libc/bionic/malloc_debug_common.cpp b/libc/bionic/malloc_debug_common.cpp
index ee796c6..12fc6dd 100644
--- a/libc/bionic/malloc_debug_common.cpp
+++ b/libc/bionic/malloc_debug_common.cpp
@@ -396,6 +396,9 @@ static void malloc_init_impl() {
}
so_name = "libc_malloc_debug_qemu.so";
break;
+ case 40:
+ so_name = "libc_malloc_debug_leak.so";
+ break;
default:
error_log("%s: Debug level %d is unknown\n", getprogname(), g_malloc_debug_level);
return;
@@ -456,6 +459,9 @@ static void malloc_init_impl() {
case 20:
InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "qemu_instrumented");
break;
+ case 40:
+ InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "chk");
+ break;
default:
break;
}
diff --git a/libc/bionic/mmap.cpp b/libc/bionic/mmap.cpp
index 8f25a89..53e8b46 100644
--- a/libc/bionic/mmap.cpp
+++ b/libc/bionic/mmap.cpp
@@ -36,6 +36,11 @@
extern "C" void* __mmap2(void*, size_t, int, int, int, size_t);
#define MMAP2_SHIFT 12 // 2**12 == 4096
+#ifdef LEGACY_MMAP
+#define TO_64(a) ((a) & 0x00000000ffffffff)
+#else
+#define TO_64(a) (a)
+#endif
static bool kernel_has_MADV_MERGEABLE = true;
@@ -60,5 +65,5 @@ void* mmap64(void* addr, size_t size, int prot, int flags, int fd, off64_t offse
}
void* mmap(void* addr, size_t size, int prot, int flags, int fd, off_t offset) {
- return mmap64(addr, size, prot, flags, fd, static_cast<off64_t>(offset));
+ return mmap64(addr, size, prot, flags, fd, TO_64(static_cast<off64_t>(offset)));
}
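
With LEGACY_MMAP defined, TO_64 clears the upper 32 bits so that an off_t that
was sign-extended into off64_t cannot reach the kernel as an enormous 64-bit
offset. A worked example of the masking (illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int64_t widened = (int64_t) (int32_t) -4096;        /* 0xfffffffffffff000 */
        int64_t masked  = widened & 0x00000000ffffffffLL;   /* 0x00000000fffff000 */
        printf("%016llx -> %016llx\n",
               (unsigned long long) widened, (unsigned long long) masked);
        return 0;
    }
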
diff --git a/libc/include/paths.h b/libc/include/paths.h
index 82c2804..7700cdd 100644
--- a/libc/include/paths.h
+++ b/libc/include/paths.h
@@ -33,6 +33,7 @@
#define _PATHS_H_
#define _PATH_BSHELL "/system/bin/sh"
+#define _PATH_BSHELL2 "/sbin/sh"
#define _PATH_CONSOLE "/dev/console"
#define _PATH_DEFPATH "/sbin:/vendor/bin:/system/sbin:/system/bin:/system/xbin"
#define _PATH_DEV "/dev/"
diff --git a/libc/include/regex.h b/libc/include/regex.h
index aec38e3..b06a515 100644
--- a/libc/include/regex.h
+++ b/libc/include/regex.h
@@ -42,8 +42,9 @@
#include <sys/cdefs.h>
#include <sys/types.h>
-/* types */
-typedef off_t regoff_t;
+/* POSIX says regoff_t is at least as large as the larger of ptrdiff_t and
+ * ssize_t. BSD uses off_t, but that interacts badly with _FILE_OFFSET_BITS. */
+typedef ssize_t regoff_t;
typedef struct {
int re_magic;
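
regoff_t is the type behind regmatch_t's rm_so/rm_eo fields, so tying it to
off_t would have let _FILE_OFFSET_BITS=64 silently change that struct's layout.
Illustrative use of the type:

    #include <regex.h>
    #include <stdio.h>
    #include <sys/types.h>

    int main(void) {
        regex_t re;
        regmatch_t m;
        if (regcomp(&re, "b+", REG_EXTENDED) != 0) return 1;
        if (regexec(&re, "abbbc", 1, &m, 0) == 0) {
            /* rm_so/rm_eo are regoff_t: ssize_t-sized regardless of file-offset mode. */
            printf("match at [%zd, %zd)\n", (ssize_t) m.rm_so, (ssize_t) m.rm_eo);
        }
        regfree(&re);
        return 0;
    }
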
diff --git a/libc/kernel/uapi/linux/android_alarm.h b/libc/kernel/uapi/linux/android_alarm.h
index 801a01e..9f2de28 100644
--- a/libc/kernel/uapi/linux/android_alarm.h
+++ b/libc/kernel/uapi/linux/android_alarm.h
@@ -28,28 +28,31 @@ enum android_alarm_type {
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
ANDROID_ALARM_ELAPSED_REALTIME,
ANDROID_ALARM_SYSTEMTIME,
+ ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
ANDROID_ALARM_TYPE_COUNT,
-};
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+};
enum android_alarm_return_flags {
ANDROID_ALARM_RTC_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_WAKEUP,
ANDROID_ALARM_RTC_MASK = 1U << ANDROID_ALARM_RTC,
- ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+ ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
ANDROID_ALARM_ELAPSED_REALTIME_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME,
ANDROID_ALARM_SYSTEMTIME_MASK = 1U << ANDROID_ALARM_SYSTEMTIME,
+ ANDROID_ALARM_RTC_POWEROFF_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
ANDROID_ALARM_TIME_CHANGE_MASK = 1U << 16
};
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_CLEAR(type) _IO('a', 0 | ((type) << 4))
#define ANDROID_ALARM_WAIT _IO('a', 1)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ALARM_IOW(c,type,size) _IOW('a', (c) | ((type) << 4), size)
#define ANDROID_ALARM_SET(type) ALARM_IOW(2, type, struct timespec)
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_SET_AND_WAIT(type) ALARM_IOW(3, type, struct timespec)
#define ANDROID_ALARM_GET_TIME(type) ALARM_IOW(4, type, struct timespec)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_SET_RTC _IOW('a', 5, struct timespec)
#define ANDROID_ALARM_BASE_CMD(cmd) (cmd & ~(_IOC(0, 0, 0xf0, 0)))
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_IOCTL_TO_TYPE(cmd) (_IOC_NR(cmd) >> 4)
#endif
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
diff --git a/libc/kernel/uapi/linux/time.h b/libc/kernel/uapi/linux/time.h
index bf245fc..5690d27 100644
--- a/libc/kernel/uapi/linux/time.h
+++ b/libc/kernel/uapi/linux/time.h
@@ -67,9 +67,10 @@ struct itimerval {
#define CLOCK_SGI_CYCLE 10
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define CLOCK_TAI 11
+#define CLOCK_POWEROFF_ALARM 12
#define MAX_CLOCKS 16
#define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC)
-#define CLOCKS_MONO CLOCK_MONOTONIC
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+#define CLOCKS_MONO CLOCK_MONOTONIC
#define TIMER_ABSTIME 0x01
#endif
diff --git a/libc/upstream-netbsd/lib/libc/gen/popen.c b/libc/upstream-netbsd/lib/libc/gen/popen.c
index 593e346..b6ce47c 100644
--- a/libc/upstream-netbsd/lib/libc/gen/popen.c
+++ b/libc/upstream-netbsd/lib/libc/gen/popen.c
@@ -152,6 +152,8 @@ popen(const char *command, const char *type)
}
execl(_PATH_BSHELL, "sh", "-c", command, NULL);
+ if (errno == ENOENT)
+ execl(_PATH_BSHELL2, "sh", "-c", command, NULL);
_exit(127);
/* NOTREACHED */
}
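
The fallback fires only when the primary shell is missing (ENOENT), leaving
every other failure on the original _exit(127) path. The chain, as a
self-contained sketch:

    #include <errno.h>
    #include <unistd.h>

    static void exec_shell(const char* command) {
        execl("/system/bin/sh", "sh", "-c", command, (char*) NULL);
        if (errno == ENOENT)               /* primary shell absent: try /sbin/sh */
            execl("/sbin/sh", "sh", "-c", command, (char*) NULL);
        _exit(127);                        /* both execs failed */
    }
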
diff --git a/libm/Android.mk b/libm/Android.mk
index e919129..f053e25 100644
--- a/libm/Android.mk
+++ b/libm/Android.mk
@@ -107,8 +107,6 @@ LOCAL_SRC_FILES := \
upstream-freebsd/lib/msun/src/s_exp2.c \
upstream-freebsd/lib/msun/src/s_exp2f.c \
upstream-freebsd/lib/msun/src/s_expm1f.c \
- upstream-freebsd/lib/msun/src/s_fabs.c \
- upstream-freebsd/lib/msun/src/s_fabsf.c \
upstream-freebsd/lib/msun/src/s_fdim.c \
upstream-freebsd/lib/msun/src/s_finite.c \
upstream-freebsd/lib/msun/src/s_finitef.c \
@@ -174,7 +172,6 @@ LOCAL_SRC_FILES_64 := \
upstream-freebsd/lib/msun/src/s_copysignl.c \
upstream-freebsd/lib/msun/src/e_coshl.c \
upstream-freebsd/lib/msun/src/s_cosl.c \
- upstream-freebsd/lib/msun/src/s_fabsl.c \
upstream-freebsd/lib/msun/src/s_floorl.c \
upstream-freebsd/lib/msun/src/s_fmal.c \
upstream-freebsd/lib/msun/src/s_fmaxl.c \
@@ -227,6 +224,10 @@ LOCAL_SRC_FILES += \
LOCAL_SRC_FILES += \
signbit.c \
+# Home-grown stuff.
+LOCAL_SRC_FILES += \
+ fabs.cpp \
+
# Arch specific optimizations.
# -----------------------------------------------------------------------------
@@ -282,9 +283,8 @@ LOCAL_SRC_FILES_arm += \
else
LOCAL_SRC_FILES_arm += \
- arm/e_sqrt.S \
- arm/e_sqrtf.S \
- arm/s_floor.S \
+ arm/sqrt.S \
+ arm/floor.S \
endif
@@ -481,8 +481,10 @@ LOCAL_C_INCLUDES_64 += $(LOCAL_PATH)/upstream-freebsd/lib/msun/ld128/
LOCAL_CLANG := $(libm_clang)
LOCAL_ARM_MODE := arm
LOCAL_CFLAGS := \
+ -D__BIONIC_NO_MATH_INLINES \
-DFLT_EVAL_METHOD=0 \
-include $(LOCAL_PATH)/freebsd-compat.h \
+ -Werror \
-Wno-missing-braces \
-Wno-parentheses \
-Wno-sign-compare \
diff --git a/libm/arm/e_sqrtf.S b/libm/arm/e_sqrtf.S
deleted file mode 100644
index ddefb22..0000000
--- a/libm/arm/e_sqrtf.S
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
- * Johhnny Qiu <joqiu@nvidia.com>
- * Shu Zhang <chazhang@nvidia.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials provided
- * with the distribution.
- * * Neither the name of The Linux Foundation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
- * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
- * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <private/bionic_asm.h>
-
-ENTRY(sqrtf)
- vmov.f32 s0, r0
- vsqrt.f32 s0, s0
- vmov.f32 r0, s0
- bx lr
-END(sqrtf)
diff --git a/libm/arm/s_floor.S b/libm/arm/floor.S
index 3af8f76..3af8f76 100644
--- a/libm/arm/s_floor.S
+++ b/libm/arm/floor.S
diff --git a/libm/arm/e_sqrt.S b/libm/arm/sqrt.S
index 17312f5..f2981f4 100644
--- a/libm/arm/e_sqrt.S
+++ b/libm/arm/sqrt.S
@@ -39,4 +39,11 @@ ENTRY(sqrt)
bx lr
END(sqrt)
+ENTRY(sqrtf)
+ vmov.f32 s0, r0
+ vsqrt.f32 s0, s0
+ vmov.f32 r0, s0
+ bx lr
+END(sqrtf)
+
ALIAS_SYMBOL(sqrtl, sqrt);
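
On LP32 ARM, long double and double share one representation, so
ALIAS_SYMBOL(sqrtl, sqrt) just emits sqrtl as a second name for sqrt. Roughly
equivalent C, with stand-in names (a sketch; GCC/Clang alias support assumed,
and the cross-type alias may draw a warning):

    double my_sqrt(double x) { return __builtin_sqrt(x); }
    long double my_sqrtl(long double) __attribute__((alias("my_sqrt")));
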
diff --git a/libm/arm64/fenv.c b/libm/arm64/fenv.c
index ce560a7..19a2393 100644
--- a/libm/arm64/fenv.c
+++ b/libm/arm64/fenv.c
@@ -26,6 +26,7 @@
* $FreeBSD: libm/aarch64/fenv.c $
*/
+#include <stdint.h>
#include <fenv.h>
#define FPCR_EXCEPT_SHIFT 8
@@ -38,10 +39,20 @@ const fenv_t __fe_dfl_env = { 0 /* control */, 0 /* status */};
typedef __uint32_t fpu_control_t; // FPCR, Floating-point Control Register.
typedef __uint32_t fpu_status_t; // FPSR, Floating-point Status Register.
-#define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r" (__fpcr))
-#define __get_fpsr(__fpsr) __asm__ __volatile__("mrs %0,fpsr" : "=r" (__fpsr))
-#define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : :"ri" (__fpcr))
-#define __set_fpsr(__fpsr) __asm__ __volatile__("msr fpsr,%0" : :"ri" (__fpsr))
+#define __get(REGISTER, __value) { \
+ uint64_t __value64; \
+ __asm__ __volatile__("mrs %0," REGISTER : "=r" (__value64)); \
+ __value = (__uint32_t) __value64; \
+}
+#define __get_fpcr(__fpcr) __get("fpcr", __fpcr)
+#define __get_fpsr(__fpsr) __get("fpsr", __fpsr)
+
+#define __set(REGISTER, __value) { \
+ uint64_t __value64 = __value; \
+ __asm__ __volatile__("msr " REGISTER ",%0" : : "ri" (__value64)); \
+}
+#define __set_fpcr(__fpcr) __set("fpcr", __fpcr)
+#define __set_fpsr(__fpsr) __set("fpsr", __fpsr)
int fegetenv(fenv_t* envp) {
__get_fpcr(envp->__control);
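
AArch64 mrs/msr always move a full 64-bit X register even though FPCR and FPSR
architect only 32 bits, hence the staging through uint64_t before truncating. A
standalone equivalent of the getter (a sketch; aarch64-only):

    #include <stdint.h>

    static inline uint32_t read_fpcr(void) {
        uint64_t value;                               /* mrs needs an X register */
        __asm__ __volatile__("mrs %0, fpcr" : "=r"(value));
        return (uint32_t) value;                      /* keep the low 32 bits */
    }
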
diff --git a/libm/fabs.cpp b/libm/fabs.cpp
new file mode 100644
index 0000000..add73fe
--- /dev/null
+++ b/libm/fabs.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <math.h>
+
+#include "fpmath.h"
+
+double fabs(double x) {
+#if __arm__
+ // Both Clang and GCC insist on moving r0/r1 into a double register
+ // and using fabs where bit-twiddling would be a better choice.
+ // They get fabsf right, but we need to be careful in fabsl too.
+ IEEEd2bits u;
+ u.d = x;
+ u.bits.sign = 0;
+ return u.d;
+#else
+ return __builtin_fabs(x);
+#endif
+}
+
+float fabsf(float x) {
+ return __builtin_fabsf(x);
+}
+
+#if defined(__LP64__)
+long double fabsl(long double x) { return __builtin_fabsl(x); }
+#else
+long double fabsl(long double x) {
+ // Don't use __builtin_fabs here because of ARM. (See fabs above.)
+ return fabs(x);
+}
+#endif
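
The union write clears just the IEEE-754 sign bit. The same effect with
explicit bit operations (an illustrative alternative, not the patch's code):

    #include <stdint.h>
    #include <string.h>

    static double fabs_bits(double x) {
        uint64_t bits;
        memcpy(&bits, &x, sizeof bits);     /* type-pun without aliasing UB */
        bits &= ~(UINT64_C(1) << 63);       /* clear the sign bit */
        memcpy(&x, &bits, sizeof bits);
        return x;
    }
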
diff --git a/libm/fake_long_double.c b/libm/fake_long_double.c
index 317a115..5edf839 100644
--- a/libm/fake_long_double.c
+++ b/libm/fake_long_double.c
@@ -25,7 +25,6 @@
*/
long double copysignl(long double a1, long double a2) { return copysign(a1, a2); }
-long double fabsl(long double a1) { return fabs(a1); }
long double fmaxl(long double a1, long double a2) { return fmax(a1, a2); }
long double fmodl(long double a1, long double a2) { return fmod(a1, a2); }
long double fminl(long double a1, long double a2) { return fmin(a1, a2); }
diff --git a/libm/include/math.h b/libm/include/math.h
index 1542374..ce8e3b2 100644
--- a/libm/include/math.h
+++ b/libm/include/math.h
@@ -15,116 +15,70 @@
*/
#ifndef _MATH_H_
-#define _MATH_H_
+#define _MATH_H_
#include <sys/cdefs.h>
#include <limits.h>
+#if !defined(__BIONIC_NO_MATH_INLINES)
+#define __BIONIC_MATH_INLINE(__def) extern __inline__ __always_inline __attribute__((gnu_inline)) __attribute__((__artificial__)) __def
+#else
+#define __BIONIC_MATH_INLINE(__def)
+#endif
+
__BEGIN_DECLS
#pragma GCC visibility push(default)
-/*
- * ANSI/POSIX
- */
-extern const union __infinity_un {
- unsigned char __uc[8];
- double __ud;
-} __infinity;
-
-extern const union __nan_un {
- unsigned char __uc[sizeof(float)];
- float __uf;
-} __nan;
-
-#if __GNUC_PREREQ(3, 3) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 800)
-#define __MATH_BUILTIN_CONSTANTS
-#endif
+#define HUGE_VAL __builtin_huge_val()
-#if __GNUC_PREREQ(3, 0) && !defined(__INTEL_COMPILER)
-#define __MATH_BUILTIN_RELOPS
-#endif
+#if __ISO_C_VISIBLE >= 1999
+#define FP_ILOGB0 (-INT_MAX)
+#define FP_ILOGBNAN INT_MAX
-#ifdef __MATH_BUILTIN_CONSTANTS
-#define HUGE_VAL __builtin_huge_val()
-#else
-#define HUGE_VAL (__infinity.__ud)
-#endif
+#define HUGE_VALF __builtin_huge_valf()
+#define HUGE_VALL __builtin_huge_vall()
+#define INFINITY __builtin_inff()
+#define NAN __builtin_nanf("")
-#if __ISO_C_VISIBLE >= 1999
-#define FP_ILOGB0 (-INT_MAX) /* Android-changed */
-#define FP_ILOGBNAN INT_MAX /* Android-changed */
-
-#ifdef __MATH_BUILTIN_CONSTANTS
-#define HUGE_VALF __builtin_huge_valf()
-#define HUGE_VALL __builtin_huge_vall()
-#define INFINITY __builtin_inff()
-#define NAN __builtin_nanf("")
-#else
-#define HUGE_VALF (float)HUGE_VAL
-#define HUGE_VALL (long double)HUGE_VAL
-#define INFINITY HUGE_VALF
-#define NAN (__nan.__uf)
-#endif /* __MATH_BUILTIN_CONSTANTS */
-
-#define MATH_ERRNO 1
-#define MATH_ERREXCEPT 2
-#define math_errhandling MATH_ERREXCEPT
-
-#define FP_FAST_FMAF 1
-#ifdef __ia64__
-#define FP_FAST_FMA 1
-#define FP_FAST_FMAL 1
+#define MATH_ERRNO 1
+#define MATH_ERREXCEPT 2
+#define math_errhandling MATH_ERREXCEPT
+
+#if defined(__FP_FAST_FMA)
+#define FP_FAST_FMA 1
+#endif
+#if defined(__FP_FAST_FMAF)
+#define FP_FAST_FMAF 1
+#endif
+#if defined(__FP_FAST_FMAL)
+#define FP_FAST_FMAL 1
#endif
/* Symbolic constants to classify floating point numbers. */
-#define FP_INFINITE 0x01
-#define FP_NAN 0x02
-#define FP_NORMAL 0x04
-#define FP_SUBNORMAL 0x08
-#define FP_ZERO 0x10
-#define fpclassify(x) \
- ((sizeof (x) == sizeof (float)) ? __fpclassifyf(x) \
- : (sizeof (x) == sizeof (double)) ? __fpclassifyd(x) \
- : __fpclassifyl(x))
-
-#define isfinite(x) \
- ((sizeof (x) == sizeof (float)) ? __isfinitef(x) \
- : (sizeof (x) == sizeof (double)) ? __isfinite(x) \
- : __isfinitel(x))
-#define isinf(x) \
- ((sizeof (x) == sizeof (float)) ? __isinff(x) \
- : (sizeof (x) == sizeof (double)) ? isinf(x) \
- : __isinfl(x))
-#define isnan(x) \
- ((sizeof (x) == sizeof (float)) ? __isnanf(x) \
- : (sizeof (x) == sizeof (double)) ? isnan(x) \
- : __isnanl(x))
-#define isnormal(x) \
- ((sizeof (x) == sizeof (float)) ? __isnormalf(x) \
- : (sizeof (x) == sizeof (double)) ? __isnormal(x) \
- : __isnormall(x))
-
-#ifdef __MATH_BUILTIN_RELOPS
-#define isgreater(x, y) __builtin_isgreater((x), (y))
-#define isgreaterequal(x, y) __builtin_isgreaterequal((x), (y))
-#define isless(x, y) __builtin_isless((x), (y))
-#define islessequal(x, y) __builtin_islessequal((x), (y))
-#define islessgreater(x, y) __builtin_islessgreater((x), (y))
-#define isunordered(x, y) __builtin_isunordered((x), (y))
-#else
-#define isgreater(x, y) (!isunordered((x), (y)) && (x) > (y))
-#define isgreaterequal(x, y) (!isunordered((x), (y)) && (x) >= (y))
-#define isless(x, y) (!isunordered((x), (y)) && (x) < (y))
-#define islessequal(x, y) (!isunordered((x), (y)) && (x) <= (y))
-#define islessgreater(x, y) (!isunordered((x), (y)) && \
- ((x) > (y) || (y) > (x)))
-#define isunordered(x, y) (isnan(x) || isnan(y))
-#endif /* __MATH_BUILTIN_RELOPS */
-
-#define signbit(x) \
- ((sizeof (x) == sizeof (float)) ? __signbitf(x) \
- : (sizeof (x) == sizeof (double)) ? __signbit(x) \
- : __signbitl(x))
+#define FP_INFINITE 0x01
+#define FP_NAN 0x02
+#define FP_NORMAL 0x04
+#define FP_SUBNORMAL 0x08
+#define FP_ZERO 0x10
+#define fpclassify(x) \
+ __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, x)
+
+#define isfinite(x) __builtin_isfinite(x)
+#define isinf(x) __builtin_isinf(x)
+#define isnan(x) __builtin_isnan(x)
+#define isnormal(x) __builtin_isnormal(x)
+
+#define isgreater(x, y) __builtin_isgreater((x), (y))
+#define isgreaterequal(x, y) __builtin_isgreaterequal((x), (y))
+#define isless(x, y) __builtin_isless((x), (y))
+#define islessequal(x, y) __builtin_islessequal((x), (y))
+#define islessgreater(x, y) __builtin_islessgreater((x), (y))
+#define isunordered(x, y) __builtin_isunordered((x), (y))
+
+#define signbit(x) \
+ ((sizeof(x) == sizeof(float)) ? __builtin_signbitf(x) \
+ : (sizeof(x) == sizeof(double)) ? __builtin_signbit(x) \
+ : __builtin_signbitl(x))
typedef double __double_t;
typedef __double_t double_t;
@@ -213,6 +167,7 @@ double sqrt(double);
double ceil(double);
double fabs(double) __pure2;
+__BIONIC_MATH_INLINE(double fabs(double x) { return __builtin_fabs(x); })
double floor(double);
double fmod(double, double);
@@ -331,6 +286,7 @@ float sqrtf(float);
float ceilf(float);
float fabsf(float) __pure2;
+__BIONIC_MATH_INLINE(float fabsf(float x) { return __builtin_fabsf(x); })
float floorf(float);
float fmodf(float, float);
float roundf(float);
@@ -418,6 +374,7 @@ long double exp2l(long double);
long double expl(long double);
long double expm1l(long double);
long double fabsl(long double) __pure2;
+__BIONIC_MATH_INLINE(long double fabsl(long double x) { return __builtin_fabsl(x); })
long double fdiml(long double, long double);
long double floorl(long double);
long double fmal(long double, long double, long double);
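
With the classification macros now builtin-backed, fpclassify and friends
compile to inline code for all three floating types instead of libcalls.
Illustrative use (1e-320 is subnormal as a double):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const double vals[] = { 0.0, 1.0, 1e-320, INFINITY, NAN };
        for (int i = 0; i < 5; ++i) {
            printf("%-8g class=%d isfinite=%d isnan=%d\n",
                   vals[i], fpclassify(vals[i]),
                   isfinite(vals[i]), isnan(vals[i]));
        }
        return 0;
    }
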
diff --git a/libm/upstream-freebsd/lib/msun/ld128/k_expl.h b/libm/upstream-freebsd/lib/msun/ld128/k_expl.h
index a5668fd..e843d43 100644
--- a/libm/upstream-freebsd/lib/msun/ld128/k_expl.h
+++ b/libm/upstream-freebsd/lib/msun/ld128/k_expl.h
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/ld128/k_expl.h 275819 2014-12-16 09:21:56Z ed $");
/*
* ld128 version of k_expl.h. See ../ld80/s_expl.c for most comments.
@@ -322,7 +322,7 @@ __ldexp_cexpl(long double complex z, int expt)
scale2 = 1;
SET_LDBL_EXPSIGN(scale1, BIAS + expt - half_expt);
- return (cpackl(cos(y) * exp_x * scale1 * scale2,
+ return (CMPLXL(cos(y) * exp_x * scale1 * scale2,
sinl(y) * exp_x * scale1 * scale2));
}
#endif /* _COMPLEX_H */
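
cpack{,f,l} were FreeBSD's pre-C11 complex constructors; CMPLX{,F,L} are the
C11 macros that assemble a complex value from its parts with no arithmetic, so
NaN or -0.0 components survive construction. A small demonstration (needs a
C11 toolchain):

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        double complex a = 1.0 + NAN * I;    /* the multiply smears NaN into both parts */
        double complex b = CMPLX(1.0, NAN);  /* real part stays exactly 1.0 */
        printf("a = (%g, %g)\n", creal(a), cimag(a));
        printf("b = (%g, %g)\n", creal(b), cimag(b));
        return 0;
    }
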
diff --git a/libm/upstream-freebsd/lib/msun/src/catrig.c b/libm/upstream-freebsd/lib/msun/src/catrig.c
index 200977c..050a88b 100644
--- a/libm/upstream-freebsd/lib/msun/src/catrig.c
+++ b/libm/upstream-freebsd/lib/msun/src/catrig.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/catrig.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <float.h>
@@ -286,19 +286,19 @@ casinh(double complex z)
if (isnan(x) || isnan(y)) {
/* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
if (isinf(x))
- return (cpack(x, y + y));
+ return (CMPLX(x, y + y));
/* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
if (isinf(y))
- return (cpack(y, x + x));
+ return (CMPLX(y, x + x));
/* casinh(NaN + I*0) = NaN + I*0 */
if (y == 0)
- return (cpack(x + x, y));
+ return (CMPLX(x + x, y));
/*
* All other cases involving NaN return NaN + I*NaN.
* C99 leaves it optional whether to raise invalid if one of
* the arguments is not NaN, so we opt not to raise it.
*/
- return (cpack(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
+ return (CMPLX(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
}
if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
@@ -307,7 +307,7 @@ casinh(double complex z)
w = clog_for_large_values(z) + m_ln2;
else
w = clog_for_large_values(-z) + m_ln2;
- return (cpack(copysign(creal(w), x), copysign(cimag(w), y)));
+ return (CMPLX(copysign(creal(w), x), copysign(cimag(w), y)));
}
/* Avoid spuriously raising inexact for z = 0. */
@@ -325,7 +325,7 @@ casinh(double complex z)
ry = asin(B);
else
ry = atan2(new_y, sqrt_A2my2);
- return (cpack(copysign(rx, x), copysign(ry, y)));
+ return (CMPLX(copysign(rx, x), copysign(ry, y)));
}
/*
@@ -335,9 +335,9 @@ casinh(double complex z)
double complex
casin(double complex z)
{
- double complex w = casinh(cpack(cimag(z), creal(z)));
+ double complex w = casinh(CMPLX(cimag(z), creal(z)));
- return (cpack(cimag(w), creal(w)));
+ return (CMPLX(cimag(w), creal(w)));
}
/*
@@ -370,19 +370,19 @@ cacos(double complex z)
if (isnan(x) || isnan(y)) {
/* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
if (isinf(x))
- return (cpack(y + y, -INFINITY));
+ return (CMPLX(y + y, -INFINITY));
/* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
if (isinf(y))
- return (cpack(x + x, -y));
+ return (CMPLX(x + x, -y));
/* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */
if (x == 0)
- return (cpack(pio2_hi + pio2_lo, y + y));
+ return (CMPLX(pio2_hi + pio2_lo, y + y));
/*
* All other cases involving NaN return NaN + I*NaN.
* C99 leaves it optional whether to raise invalid if one of
* the arguments is not NaN, so we opt not to raise it.
*/
- return (cpack(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
+ return (CMPLX(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
}
if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
@@ -392,18 +392,18 @@ cacos(double complex z)
ry = creal(w) + m_ln2;
if (sy == 0)
ry = -ry;
- return (cpack(rx, ry));
+ return (CMPLX(rx, ry));
}
/* Avoid spuriously raising inexact for z = 1. */
if (x == 1 && y == 0)
- return (cpack(0, -y));
+ return (CMPLX(0, -y));
/* All remaining cases are inexact. */
raise_inexact();
if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4)
- return (cpack(pio2_hi - (x - pio2_lo), -y));
+ return (CMPLX(pio2_hi - (x - pio2_lo), -y));
do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
if (B_is_usable) {
@@ -419,7 +419,7 @@ cacos(double complex z)
}
if (sy == 0)
ry = -ry;
- return (cpack(rx, ry));
+ return (CMPLX(rx, ry));
}
/*
@@ -437,15 +437,15 @@ cacosh(double complex z)
ry = cimag(w);
/* cacosh(NaN + I*NaN) = NaN + I*NaN */
if (isnan(rx) && isnan(ry))
- return (cpack(ry, rx));
+ return (CMPLX(ry, rx));
/* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */
/* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */
if (isnan(rx))
- return (cpack(fabs(ry), rx));
+ return (CMPLX(fabs(ry), rx));
/* cacosh(0 + I*NaN) = NaN + I*NaN */
if (isnan(ry))
- return (cpack(ry, ry));
- return (cpack(fabs(ry), copysign(rx, cimag(z))));
+ return (CMPLX(ry, ry));
+ return (CMPLX(fabs(ry), copysign(rx, cimag(z))));
}
/*
@@ -475,16 +475,16 @@ clog_for_large_values(double complex z)
* this method is still poor since it is uneccessarily slow.
*/
if (ax > DBL_MAX / 2)
- return (cpack(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
+ return (CMPLX(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
/*
* Avoid overflow when x or y is large. Avoid underflow when x or
* y is small.
*/
if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
- return (cpack(log(hypot(x, y)), atan2(y, x)));
+ return (CMPLX(log(hypot(x, y)), atan2(y, x)));
- return (cpack(log(ax * ax + ay * ay) / 2, atan2(y, x)));
+ return (CMPLX(log(ax * ax + ay * ay) / 2, atan2(y, x)));
}
/*
@@ -575,30 +575,30 @@ catanh(double complex z)
/* This helps handle many cases. */
if (y == 0 && ax <= 1)
- return (cpack(atanh(x), y));
+ return (CMPLX(atanh(x), y));
/* To ensure the same accuracy as atan(), and to filter out z = 0. */
if (x == 0)
- return (cpack(x, atan(y)));
+ return (CMPLX(x, atan(y)));
if (isnan(x) || isnan(y)) {
/* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
if (isinf(x))
- return (cpack(copysign(0, x), y + y));
+ return (CMPLX(copysign(0, x), y + y));
/* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
if (isinf(y))
- return (cpack(copysign(0, x),
+ return (CMPLX(copysign(0, x),
copysign(pio2_hi + pio2_lo, y)));
/*
* All other cases involving NaN return NaN + I*NaN.
* C99 leaves it optional whether to raise invalid if one of
* the arguments is not NaN, so we opt not to raise it.
*/
- return (cpack(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
+ return (CMPLX(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
}
if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
- return (cpack(real_part_reciprocal(x, y),
+ return (CMPLX(real_part_reciprocal(x, y),
copysign(pio2_hi + pio2_lo, y)));
if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
@@ -623,7 +623,7 @@ catanh(double complex z)
else
ry = atan2(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
- return (cpack(copysign(rx, x), copysign(ry, y)));
+ return (CMPLX(copysign(rx, x), copysign(ry, y)));
}
/*
@@ -633,7 +633,7 @@ catanh(double complex z)
double complex
catan(double complex z)
{
- double complex w = catanh(cpack(cimag(z), creal(z)));
+ double complex w = catanh(CMPLX(cimag(z), creal(z)));
- return (cpack(cimag(w), creal(w)));
+ return (CMPLX(cimag(w), creal(w)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/catrigf.c b/libm/upstream-freebsd/lib/msun/src/catrigf.c
index 08ebef7..e057d31 100644
--- a/libm/upstream-freebsd/lib/msun/src/catrigf.c
+++ b/libm/upstream-freebsd/lib/msun/src/catrigf.c
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/catrigf.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <float.h>
@@ -156,12 +156,12 @@ casinhf(float complex z)
if (isnan(x) || isnan(y)) {
if (isinf(x))
- return (cpackf(x, y + y));
+ return (CMPLXF(x, y + y));
if (isinf(y))
- return (cpackf(y, x + x));
+ return (CMPLXF(y, x + x));
if (y == 0)
- return (cpackf(x + x, y));
- return (cpackf(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
+ return (CMPLXF(x + x, y));
+ return (CMPLXF(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
}
if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
@@ -169,7 +169,7 @@ casinhf(float complex z)
w = clog_for_large_values(z) + m_ln2;
else
w = clog_for_large_values(-z) + m_ln2;
- return (cpackf(copysignf(crealf(w), x),
+ return (CMPLXF(copysignf(crealf(w), x),
copysignf(cimagf(w), y)));
}
@@ -186,15 +186,15 @@ casinhf(float complex z)
ry = asinf(B);
else
ry = atan2f(new_y, sqrt_A2my2);
- return (cpackf(copysignf(rx, x), copysignf(ry, y)));
+ return (CMPLXF(copysignf(rx, x), copysignf(ry, y)));
}
float complex
casinf(float complex z)
{
- float complex w = casinhf(cpackf(cimagf(z), crealf(z)));
+ float complex w = casinhf(CMPLXF(cimagf(z), crealf(z)));
- return (cpackf(cimagf(w), crealf(w)));
+ return (CMPLXF(cimagf(w), crealf(w)));
}
float complex
@@ -214,12 +214,12 @@ cacosf(float complex z)
if (isnan(x) || isnan(y)) {
if (isinf(x))
- return (cpackf(y + y, -INFINITY));
+ return (CMPLXF(y + y, -INFINITY));
if (isinf(y))
- return (cpackf(x + x, -y));
+ return (CMPLXF(x + x, -y));
if (x == 0)
- return (cpackf(pio2_hi + pio2_lo, y + y));
- return (cpackf(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
+ return (CMPLXF(pio2_hi + pio2_lo, y + y));
+ return (CMPLXF(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
}
if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
@@ -228,16 +228,16 @@ cacosf(float complex z)
ry = crealf(w) + m_ln2;
if (sy == 0)
ry = -ry;
- return (cpackf(rx, ry));
+ return (CMPLXF(rx, ry));
}
if (x == 1 && y == 0)
- return (cpackf(0, -y));
+ return (CMPLXF(0, -y));
raise_inexact();
if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4)
- return (cpackf(pio2_hi - (x - pio2_lo), -y));
+ return (CMPLXF(pio2_hi - (x - pio2_lo), -y));
do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
if (B_is_usable) {
@@ -253,7 +253,7 @@ cacosf(float complex z)
}
if (sy == 0)
ry = -ry;
- return (cpackf(rx, ry));
+ return (CMPLXF(rx, ry));
}
float complex
@@ -266,12 +266,12 @@ cacoshf(float complex z)
rx = crealf(w);
ry = cimagf(w);
if (isnan(rx) && isnan(ry))
- return (cpackf(ry, rx));
+ return (CMPLXF(ry, rx));
if (isnan(rx))
- return (cpackf(fabsf(ry), rx));
+ return (CMPLXF(fabsf(ry), rx));
if (isnan(ry))
- return (cpackf(ry, ry));
- return (cpackf(fabsf(ry), copysignf(rx, cimagf(z))));
+ return (CMPLXF(ry, ry));
+ return (CMPLXF(fabsf(ry), copysignf(rx, cimagf(z))));
}
static float complex
@@ -291,13 +291,13 @@ clog_for_large_values(float complex z)
}
if (ax > FLT_MAX / 2)
- return (cpackf(logf(hypotf(x / m_e, y / m_e)) + 1,
+ return (CMPLXF(logf(hypotf(x / m_e, y / m_e)) + 1,
atan2f(y, x)));
if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
- return (cpackf(logf(hypotf(x, y)), atan2f(y, x)));
+ return (CMPLXF(logf(hypotf(x, y)), atan2f(y, x)));
- return (cpackf(logf(ax * ax + ay * ay) / 2, atan2f(y, x)));
+ return (CMPLXF(logf(ax * ax + ay * ay) / 2, atan2f(y, x)));
}
static inline float
@@ -346,22 +346,22 @@ catanhf(float complex z)
ay = fabsf(y);
if (y == 0 && ax <= 1)
- return (cpackf(atanhf(x), y));
+ return (CMPLXF(atanhf(x), y));
if (x == 0)
- return (cpackf(x, atanf(y)));
+ return (CMPLXF(x, atanf(y)));
if (isnan(x) || isnan(y)) {
if (isinf(x))
- return (cpackf(copysignf(0, x), y + y));
+ return (CMPLXF(copysignf(0, x), y + y));
if (isinf(y))
- return (cpackf(copysignf(0, x),
+ return (CMPLXF(copysignf(0, x),
copysignf(pio2_hi + pio2_lo, y)));
- return (cpackf(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
+ return (CMPLXF(x + 0.0L + (y + 0), x + 0.0L + (y + 0)));
}
if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
- return (cpackf(real_part_reciprocal(x, y),
+ return (CMPLXF(real_part_reciprocal(x, y),
copysignf(pio2_hi + pio2_lo, y)));
if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
@@ -381,13 +381,13 @@ catanhf(float complex z)
else
ry = atan2f(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
- return (cpackf(copysignf(rx, x), copysignf(ry, y)));
+ return (CMPLXF(copysignf(rx, x), copysignf(ry, y)));
}
float complex
catanf(float complex z)
{
- float complex w = catanhf(cpackf(cimagf(z), crealf(z)));
+ float complex w = catanhf(CMPLXF(cimagf(z), crealf(z)));
- return (cpackf(cimagf(w), crealf(w)));
+ return (CMPLXF(cimagf(w), crealf(w)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/e_j0.c b/libm/upstream-freebsd/lib/msun/src/e_j0.c
index 8320f25..36e72c2 100644
--- a/libm/upstream-freebsd/lib/msun/src/e_j0.c
+++ b/libm/upstream-freebsd/lib/msun/src/e_j0.c
@@ -12,7 +12,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/e_j0.c 283032 2015-05-17 16:27:06Z kargl $");
/* __ieee754_j0(x), __ieee754_y0(x)
* Bessel function of the first and second kinds of order zero.
@@ -62,7 +62,9 @@ __FBSDID("$FreeBSD$");
#include "math.h"
#include "math_private.h"
-static double pzero(double), qzero(double);
+static __inline double pzero(double), qzero(double);
+
+static const volatile double vone = 1, vzero = 0;
static const double
huge = 1e300,
@@ -115,7 +117,7 @@ __ieee754_j0(double x)
if(ix<0x3f200000) { /* |x| < 2**-13 */
if(huge+x>one) { /* raise inexact if x != 0 */
if(ix<0x3e400000) return one; /* |x|<2**-27 */
- else return one - 0.25*x*x;
+ else return one - x*x/4;
}
}
z = x*x;
@@ -150,10 +152,16 @@ __ieee754_y0(double x)
EXTRACT_WORDS(hx,lx,x);
ix = 0x7fffffff&hx;
- /* Y0(NaN) is NaN, y0(-inf) is Nan, y0(inf) is 0 */
- if(ix>=0x7ff00000) return one/(x+x*x);
- if((ix|lx)==0) return -one/zero;
- if(hx<0) return zero/zero;
+ /*
+ * y0(NaN) = NaN.
+ * y0(Inf) = 0.
+ * y0(-Inf) = NaN and raise invalid exception.
+ */
+ if(ix>=0x7ff00000) return vone/(x+x*x);
+ /* y0(+-0) = -inf and raise divide-by-zero exception. */
+ if((ix|lx)==0) return -one/vzero;
+ /* y0(x<0) = NaN and raise invalid exception. */
+ if(hx<0) return vzero/vzero;
if(ix >= 0x40000000) { /* |x| >= 2.0 */
/* y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x0)+q0(x)*cos(x0))
* where x0 = x-pi/4
@@ -268,7 +276,8 @@ static const double pS2[5] = {
1.46576176948256193810e+01, /* 0x402D50B3, 0x44391809 */
};
- static double pzero(double x)
+static __inline double
+pzero(double x)
{
const double *p,*q;
double z,r,s;
@@ -278,7 +287,7 @@ static const double pS2[5] = {
if(ix>=0x40200000) {p = pR8; q= pS8;}
else if(ix>=0x40122E8B){p = pR5; q= pS5;}
else if(ix>=0x4006DB6D){p = pR3; q= pS3;}
- else if(ix>=0x40000000){p = pR2; q= pS2;}
+ else {p = pR2; q= pS2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*q[4]))));
@@ -363,7 +372,8 @@ static const double qS2[6] = {
-5.31095493882666946917e+00, /* 0xC0153E6A, 0xF8B32931 */
};
- static double qzero(double x)
+static __inline double
+qzero(double x)
{
const double *p,*q;
double s,r,z;
@@ -373,7 +383,7 @@ static const double qS2[6] = {
if(ix>=0x40200000) {p = qR8; q= qS8;}
else if(ix>=0x40122E8B){p = qR5; q= qS5;}
else if(ix>=0x4006DB6D){p = qR3; q= qS3;}
- else if(ix>=0x40000000){p = qR2; q= qS2;}
+ else {p = qR2; q= qS2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*(q[4]+z*q[5])))));
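
The volatile vone/vzero pair exists so that expressions such as -one/vzero and
vzero/vzero are evaluated at run time, raising the divide-by-zero and invalid
exceptions C99 requires, instead of being folded away at compile time. A
minimal demonstration (illustrative; link with -lm):

    #include <fenv.h>
    #include <stdio.h>

    static const volatile double vzero = 0;

    int main(void) {
        feclearexcept(FE_ALL_EXCEPT);
        double r = -1.0 / vzero;        /* the division survives to run time */
        printf("r = %g, FE_DIVBYZERO = %d\n", r, !!fetestexcept(FE_DIVBYZERO));
        return 0;
    }
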
diff --git a/libm/upstream-freebsd/lib/msun/src/e_j0f.c b/libm/upstream-freebsd/lib/msun/src/e_j0f.c
index c45faf3..e53b218 100644
--- a/libm/upstream-freebsd/lib/msun/src/e_j0f.c
+++ b/libm/upstream-freebsd/lib/msun/src/e_j0f.c
@@ -14,12 +14,18 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/e_j0f.c 283032 2015-05-17 16:27:06Z kargl $");
+
+/*
+ * See e_j0.c for complete comments.
+ */
#include "math.h"
#include "math_private.h"
-static float pzerof(float), qzerof(float);
+static __inline float pzerof(float), qzerof(float);
+
+static const volatile float vone = 1, vzero = 0;
static const float
huge = 1e30,
@@ -62,17 +68,17 @@ __ieee754_j0f(float x)
* j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
* y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
*/
- if(ix>0x80000000) z = (invsqrtpi*cc)/sqrtf(x);
+ if(ix>0x58000000) z = (invsqrtpi*cc)/sqrtf(x); /* |x|>2**49 */
else {
u = pzerof(x); v = qzerof(x);
z = invsqrtpi*(u*cc-v*ss)/sqrtf(x);
}
return z;
}
- if(ix<0x39000000) { /* |x| < 2**-13 */
+ if(ix<0x3b000000) { /* |x| < 2**-9 */
if(huge+x>one) { /* raise inexact if x != 0 */
- if(ix<0x32000000) return one; /* |x|<2**-27 */
- else return one - (float)0.25*x*x;
+ if(ix<0x39800000) return one; /* |x|<2**-12 */
+ else return one - x*x/4;
}
}
z = x*x;
@@ -107,10 +113,9 @@ __ieee754_y0f(float x)
GET_FLOAT_WORD(hx,x);
ix = 0x7fffffff&hx;
- /* Y0(NaN) is NaN, y0(-inf) is Nan, y0(inf) is 0 */
- if(ix>=0x7f800000) return one/(x+x*x);
- if(ix==0) return -one/zero;
- if(hx<0) return zero/zero;
+ if(ix>=0x7f800000) return vone/(x+x*x);
+ if(ix==0) return -one/vzero;
+ if(hx<0) return vzero/vzero;
if(ix >= 0x40000000) { /* |x| >= 2.0 */
/* y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x0)+q0(x)*cos(x0))
* where x0 = x-pi/4
@@ -136,14 +141,14 @@ __ieee754_y0f(float x)
if ((s*c)<zero) cc = z/ss;
else ss = z/cc;
}
- if(ix>0x80000000) z = (invsqrtpi*ss)/sqrtf(x);
+ if(ix>0x58000000) z = (invsqrtpi*ss)/sqrtf(x); /* |x|>2**49 */
else {
u = pzerof(x); v = qzerof(x);
z = invsqrtpi*(u*ss+v*cc)/sqrtf(x);
}
return z;
}
- if(ix<=0x32000000) { /* x < 2**-27 */
+ if(ix<=0x39000000) { /* x < 2**-13 */
return(u00 + tpi*__ieee754_logf(x));
}
z = x*x;
@@ -224,7 +229,8 @@ static const float pS2[5] = {
1.4657617569e+01, /* 0x416a859a */
};
- static float pzerof(float x)
+static __inline float
+pzerof(float x)
{
const float *p,*q;
float z,r,s;
@@ -232,9 +238,9 @@ static const float pS2[5] = {
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
if(ix>=0x41000000) {p = pR8; q= pS8;}
- else if(ix>=0x40f71c58){p = pR5; q= pS5;}
- else if(ix>=0x4036db68){p = pR3; q= pS3;}
- else if(ix>=0x40000000){p = pR2; q= pS2;}
+ else if(ix>=0x409173eb){p = pR5; q= pS5;}
+ else if(ix>=0x4036d917){p = pR3; q= pS3;}
+ else {p = pR2; q= pS2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*q[4]))));
@@ -319,7 +325,8 @@ static const float qS2[6] = {
-5.3109550476e+00, /* 0xc0a9f358 */
};
- static float qzerof(float x)
+static __inline float
+qzerof(float x)
{
const float *p,*q;
float s,r,z;
@@ -327,9 +334,9 @@ static const float qS2[6] = {
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
if(ix>=0x41000000) {p = qR8; q= qS8;}
- else if(ix>=0x40f71c58){p = qR5; q= qS5;}
- else if(ix>=0x4036db68){p = qR3; q= qS3;}
- else if(ix>=0x40000000){p = qR2; q= qS2;}
+ else if(ix>=0x409173eb){p = qR5; q= qS5;}
+ else if(ix>=0x4036d917){p = qR3; q= qS3;}
+ else {p = qR2; q= qS2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*(q[4]+z*q[5])))));
diff --git a/libm/upstream-freebsd/lib/msun/src/e_j1.c b/libm/upstream-freebsd/lib/msun/src/e_j1.c
index 63800ad..b11ac2d 100644
--- a/libm/upstream-freebsd/lib/msun/src/e_j1.c
+++ b/libm/upstream-freebsd/lib/msun/src/e_j1.c
@@ -12,7 +12,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/e_j1.c 283032 2015-05-17 16:27:06Z kargl $");
/* __ieee754_j1(x), __ieee754_y1(x)
 * Bessel function of the first and second kinds of order one.
@@ -62,7 +62,9 @@ __FBSDID("$FreeBSD$");
#include "math.h"
#include "math_private.h"
-static double pone(double), qone(double);
+static __inline double pone(double), qone(double);
+
+static const volatile double vone = 1, vzero = 0;
static const double
huge = 1e300,
@@ -147,10 +149,16 @@ __ieee754_y1(double x)
EXTRACT_WORDS(hx,lx,x);
ix = 0x7fffffff&hx;
- /* if Y1(NaN) is NaN, Y1(-inf) is NaN, Y1(inf) is 0 */
- if(ix>=0x7ff00000) return one/(x+x*x);
- if((ix|lx)==0) return -one/zero;
- if(hx<0) return zero/zero;
+ /*
+ * y1(NaN) = NaN.
+ * y1(Inf) = 0.
+ * y1(-Inf) = NaN and raise invalid exception.
+ */
+ if(ix>=0x7ff00000) return vone/(x+x*x);
+ /* y1(+-0) = -inf and raise divide-by-zero exception. */
+ if((ix|lx)==0) return -one/vzero;
+ /* y1(x<0) = NaN and raise invalid exception. */
+ if(hx<0) return vzero/vzero;
if(ix >= 0x40000000) { /* |x| >= 2.0 */
s = sin(x);
c = cos(x);
@@ -262,7 +270,8 @@ static const double ps2[5] = {
8.36463893371618283368e+00, /* 0x4020BAB1, 0xF44E5192 */
};
- static double pone(double x)
+static __inline double
+pone(double x)
{
const double *p,*q;
double z,r,s;
@@ -272,7 +281,7 @@ static const double ps2[5] = {
if(ix>=0x40200000) {p = pr8; q= ps8;}
else if(ix>=0x40122E8B){p = pr5; q= ps5;}
else if(ix>=0x4006DB6D){p = pr3; q= ps3;}
- else if(ix>=0x40000000){p = pr2; q= ps2;}
+ else {p = pr2; q= ps2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*q[4]))));
@@ -358,7 +367,8 @@ static const double qs2[6] = {
-4.95949898822628210127e+00, /* 0xC013D686, 0xE71BE86B */
};
- static double qone(double x)
+static __inline double
+qone(double x)
{
const double *p,*q;
double s,r,z;
@@ -368,7 +378,7 @@ static const double qs2[6] = {
if(ix>=0x40200000) {p = qr8; q= qs8;}
else if(ix>=0x40122E8B){p = qr5; q= qs5;}
else if(ix>=0x4006DB6D){p = qr3; q= qs3;}
- else if(ix>=0x40000000){p = qr2; q= qs2;}
+ else {p = qr2; q= qs2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*(q[4]+z*q[5])))));
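
The vone/vzero pair introduced above is what makes the documented exceptions actually fire: with plain constants, a compiler is free to fold -one/zero and zero/zero to -Inf and NaN at build time, so the divide-by-zero and invalid flags are never raised. The volatile qualifier forces the divisions to happen at run time. A rough illustration (the fetestexcept harness is assumed, not part of the patch; strictly portable code would also want FENV_ACCESS on):

    #include <fenv.h>
    #include <stdio.h>

    static const double one = 1;
    static const volatile double vzero = 0;

    int main(void) {
        feclearexcept(FE_ALL_EXCEPT);
        volatile double a = -one / vzero;  /* run-time divide: sets FE_DIVBYZERO */
        printf("divbyzero=%d a=%f\n", !!fetestexcept(FE_DIVBYZERO), a);

        feclearexcept(FE_ALL_EXCEPT);
        volatile double b = vzero / vzero; /* run-time 0/0: sets FE_INVALID */
        printf("invalid=%d b=%f\n", !!fetestexcept(FE_INVALID), b);
        return 0;
    }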
diff --git a/libm/upstream-freebsd/lib/msun/src/e_j1f.c b/libm/upstream-freebsd/lib/msun/src/e_j1f.c
index 88e2d83..0cca823 100644
--- a/libm/upstream-freebsd/lib/msun/src/e_j1f.c
+++ b/libm/upstream-freebsd/lib/msun/src/e_j1f.c
@@ -14,12 +14,18 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/e_j1f.c 283032 2015-05-17 16:27:06Z kargl $");
+
+/*
+ * See e_j1.c for complete comments.
+ */
#include "math.h"
#include "math_private.h"
-static float ponef(float), qonef(float);
+static __inline float ponef(float), qonef(float);
+
+static const volatile float vone = 1, vzero = 0;
static const float
huge = 1e30,
@@ -63,7 +69,7 @@ __ieee754_j1f(float x)
* j1(x) = 1/sqrt(pi) * (P(1,x)*cc - Q(1,x)*ss) / sqrt(x)
* y1(x) = 1/sqrt(pi) * (P(1,x)*ss + Q(1,x)*cc) / sqrt(x)
*/
- if(ix>0x80000000) z = (invsqrtpi*cc)/sqrtf(y);
+ if(ix>0x58000000) z = (invsqrtpi*cc)/sqrtf(y); /* |x|>2**49 */
else {
u = ponef(y); v = qonef(y);
z = invsqrtpi*(u*cc-v*ss)/sqrtf(y);
@@ -71,7 +77,7 @@ __ieee754_j1f(float x)
if(hx<0) return -z;
else return z;
}
- if(ix<0x32000000) { /* |x|<2**-27 */
+ if(ix<0x39000000) { /* |x|<2**-13 */
if(huge+x>one) return (float)0.5*x;/* inexact if x!=0 necessary */
}
z = x*x;
@@ -104,10 +110,9 @@ __ieee754_y1f(float x)
GET_FLOAT_WORD(hx,x);
ix = 0x7fffffff&hx;
- /* if Y1(NaN) is NaN, Y1(-inf) is NaN, Y1(inf) is 0 */
- if(ix>=0x7f800000) return one/(x+x*x);
- if(ix==0) return -one/zero;
- if(hx<0) return zero/zero;
+ if(ix>=0x7f800000) return vone/(x+x*x);
+ if(ix==0) return -one/vzero;
+ if(hx<0) return vzero/vzero;
if(ix >= 0x40000000) { /* |x| >= 2.0 */
s = sinf(x);
c = cosf(x);
@@ -129,14 +134,14 @@ __ieee754_y1f(float x)
* sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
* to compute the worse one.
*/
- if(ix>0x48000000) z = (invsqrtpi*ss)/sqrtf(x);
+ if(ix>0x58000000) z = (invsqrtpi*ss)/sqrtf(x); /* |x|>2**49 */
else {
u = ponef(x); v = qonef(x);
z = invsqrtpi*(u*ss+v*cc)/sqrtf(x);
}
return z;
}
- if(ix<=0x24800000) { /* x < 2**-54 */
+ if(ix<=0x33000000) { /* x < 2**-25 */
return(-tpi/x);
}
z = x*x;
@@ -219,7 +224,8 @@ static const float ps2[5] = {
8.3646392822e+00, /* 0x4105d590 */
};
- static float ponef(float x)
+static __inline float
+ponef(float x)
{
const float *p,*q;
float z,r,s;
@@ -227,9 +233,9 @@ static const float ps2[5] = {
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
if(ix>=0x41000000) {p = pr8; q= ps8;}
- else if(ix>=0x40f71c58){p = pr5; q= ps5;}
- else if(ix>=0x4036db68){p = pr3; q= ps3;}
- else if(ix>=0x40000000){p = pr2; q= ps2;}
+ else if(ix>=0x409173eb){p = pr5; q= ps5;}
+ else if(ix>=0x4036d917){p = pr3; q= ps3;}
+ else {p = pr2; q= ps2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*q[4]))));
@@ -315,17 +321,18 @@ static const float qs2[6] = {
-4.9594988823e+00, /* 0xc09eb437 */
};
- static float qonef(float x)
+static __inline float
+qonef(float x)
{
const float *p,*q;
float s,r,z;
int32_t ix;
GET_FLOAT_WORD(ix,x);
ix &= 0x7fffffff;
- if(ix>=0x40200000) {p = qr8; q= qs8;}
- else if(ix>=0x40f71c58){p = qr5; q= qs5;}
- else if(ix>=0x4036db68){p = qr3; q= qs3;}
- else if(ix>=0x40000000){p = qr2; q= qs2;}
+ if(ix>=0x41000000) {p = qr8; q= qs8;}
+ else if(ix>=0x409173eb){p = qr5; q= qs5;}
+ else if(ix>=0x4036d917){p = qr3; q= qs3;}
+ else {p = qr2; q= qs2;} /* ix>=0x40000000 */
z = one/(x*x);
r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))));
s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*(q[4]+z*q[5])))));
diff --git a/libm/upstream-freebsd/lib/msun/src/e_jn.c b/libm/upstream-freebsd/lib/msun/src/e_jn.c
index 8b0bc62..a1130c5 100644
--- a/libm/upstream-freebsd/lib/msun/src/e_jn.c
+++ b/libm/upstream-freebsd/lib/msun/src/e_jn.c
@@ -12,7 +12,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/e_jn.c 279856 2015-03-10 17:10:54Z kargl $");
/*
* __ieee754_jn(n, x), __ieee754_yn(n, x)
@@ -43,6 +43,8 @@ __FBSDID("$FreeBSD$");
#include "math.h"
#include "math_private.h"
+static const volatile double vone = 1, vzero = 0;
+
static const double
invsqrtpi= 5.64189583547756279280e-01, /* 0x3FE20DD7, 0x50429B6D */
two = 2.00000000000000000000e+00, /* 0x40000000, 0x00000000 */
@@ -220,10 +222,12 @@ __ieee754_yn(int n, double x)
EXTRACT_WORDS(hx,lx,x);
ix = 0x7fffffff&hx;
- /* if Y(n,NaN) is NaN */
+ /* yn(n,NaN) = NaN */
if((ix|((u_int32_t)(lx|-lx))>>31)>0x7ff00000) return x+x;
- if((ix|lx)==0) return -one/zero;
- if(hx<0) return zero/zero;
+ /* yn(n,+-0) = -inf and raise divide-by-zero exception. */
+ if((ix|lx)==0) return -one/vzero;
+ /* yn(n,x<0) = NaN and raise invalid exception. */
+ if(hx<0) return vzero/vzero;
sign = 1;
if(n<0){
n = -n;
diff --git a/libm/upstream-freebsd/lib/msun/src/e_jnf.c b/libm/upstream-freebsd/lib/msun/src/e_jnf.c
index f564aec..c82d5cf 100644
--- a/libm/upstream-freebsd/lib/msun/src/e_jnf.c
+++ b/libm/upstream-freebsd/lib/msun/src/e_jnf.c
@@ -14,11 +14,17 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/e_jnf.c 279856 2015-03-10 17:10:54Z kargl $");
+
+/*
+ * See e_jn.c for complete comments.
+ */
#include "math.h"
#include "math_private.h"
+static const volatile float vone = 1, vzero = 0;
+
static const float
two = 2.0000000000e+00, /* 0x40000000 */
one = 1.0000000000e+00; /* 0x3F800000 */
@@ -172,10 +178,9 @@ __ieee754_ynf(int n, float x)
GET_FLOAT_WORD(hx,x);
ix = 0x7fffffff&hx;
- /* if Y(n,NaN) is NaN */
if(ix>0x7f800000) return x+x;
- if(ix==0) return -one/zero;
- if(hx<0) return zero/zero;
+ if(ix==0) return -one/vzero;
+ if(hx<0) return vzero/vzero;
sign = 1;
if(n<0){
n = -n;
diff --git a/libm/upstream-freebsd/lib/msun/src/k_exp.c b/libm/upstream-freebsd/lib/msun/src/k_exp.c
index f592f69..5aa3ef3 100644
--- a/libm/upstream-freebsd/lib/msun/src/k_exp.c
+++ b/libm/upstream-freebsd/lib/msun/src/k_exp.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/k_exp.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
@@ -103,6 +103,6 @@ __ldexp_cexp(double complex z, int expt)
half_expt = expt - half_expt;
INSERT_WORDS(scale2, (0x3ff + half_expt) << 20, 0);
- return (cpack(cos(y) * exp_x * scale1 * scale2,
+ return (CMPLX(cos(y) * exp_x * scale1 * scale2,
sin(y) * exp_x * scale1 * scale2));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/k_expf.c b/libm/upstream-freebsd/lib/msun/src/k_expf.c
index 548a008..8fe8c46 100644
--- a/libm/upstream-freebsd/lib/msun/src/k_expf.c
+++ b/libm/upstream-freebsd/lib/msun/src/k_expf.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/k_expf.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
@@ -82,6 +82,6 @@ __ldexp_cexpf(float complex z, int expt)
half_expt = expt - half_expt;
SET_FLOAT_WORD(scale2, (0x7f + half_expt) << 23);
- return (cpackf(cosf(y) * exp_x * scale1 * scale2,
+ return (CMPLXF(cosf(y) * exp_x * scale1 * scale2,
sinf(y) * exp_x * scale1 * scale2));
}
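
Both __ldexp_cexp variants patched above scale by 2**expt in two steps (scale1 * scale2) because expt can exceed the largest finite exponent even when the final product is representable; splitting the power keeps every intermediate in range. A hedged sketch of the same idea using ldexp instead of the direct exponent-field stores (names are mine):

    #include <math.h>
    #include <stdio.h>

    /* Multiply x by 2**expt via two representable half-powers, mirroring
     * the INSERT_WORDS/SET_FLOAT_WORD construction of scale1 and scale2. */
    static double scale_split(double x, int expt) {
        int half = expt / 2;
        return x * ldexp(1.0, half) * ldexp(1.0, expt - half);
    }

    int main(void) {
        /* 2**1300 is not a finite double, but 0x1p-100 * 2**1300 == 2**1200 is. */
        printf("%a\n", scale_split(0x1p-100, 1300)); /* 0x1p+1200 */
        printf("%a\n", ldexp(0x1p-100, 1300));       /* same, for comparison */
        return 0;
    }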
diff --git a/libm/upstream-freebsd/lib/msun/src/math_private.h b/libm/upstream-freebsd/lib/msun/src/math_private.h
index 8af2c65..1f10e8b 100644
--- a/libm/upstream-freebsd/lib/msun/src/math_private.h
+++ b/libm/upstream-freebsd/lib/msun/src/math_private.h
@@ -11,7 +11,7 @@
/*
* from: @(#)fdlibm.h 5.1 93/09/24
- * $FreeBSD$
+ * $FreeBSD: head/lib/msun/src/math_private.h 276176 2014-12-24 10:13:53Z ed $
*/
#ifndef _MATH_PRIVATE_H_
@@ -454,9 +454,15 @@ typedef union {
* (0.0+I)*(y+0.0*I) and laboriously computing the full complex product.
* In particular, I*Inf is corrupted to NaN+I*Inf, and I*-0 is corrupted
* to -0.0+I*0.0.
+ *
+ * The C11 standard introduced the macros CMPLX(), CMPLXF() and CMPLXL()
+ * to construct complex values. Compilers that conform to the C99
+ * standard require the following functions to avoid the above issues.
*/
+
+#ifndef CMPLXF
static __inline float complex
-cpackf(float x, float y)
+CMPLXF(float x, float y)
{
float_complex z;
@@ -464,9 +470,11 @@ cpackf(float x, float y)
IMAGPART(z) = y;
return (z.f);
}
+#endif
+#ifndef CMPLX
static __inline double complex
-cpack(double x, double y)
+CMPLX(double x, double y)
{
double_complex z;
@@ -474,9 +482,11 @@ cpack(double x, double y)
IMAGPART(z) = y;
return (z.f);
}
+#endif
+#ifndef CMPLXL
static __inline long double complex
-cpackl(long double x, long double y)
+CMPLXL(long double x, long double y)
{
long_double_complex z;
@@ -484,6 +494,8 @@ cpackl(long double x, long double y)
IMAGPART(z) = y;
return (z.f);
}
+#endif
+
#endif /* _COMPLEX_H */
#ifdef __GNUCLIKE_ASM
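
The comment block added above is the crux of the cpack -> CMPLX rename: x + I*y is not a safe constructor, because I*y is evaluated with ordinary complex arithmetic, so I*INFINITY multiplies out to NaN + I*Inf and a negative-zero imaginary part loses its sign. The C11 macros (and the guarded inline fallbacks above, for C99-only compilers) place each component verbatim. A small demonstration, assuming a toolchain whose <complex.h> provides CMPLX:

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        /* Arithmetic construction: the real part becomes 0*Inf = NaN. */
        double complex bad  = 0.0 + I * INFINITY;
        /* Component-wise construction keeps both parts exactly. */
        double complex good = CMPLX(0.0, INFINITY);

        printf("bad:  %g %+g*I\n", creal(bad),  cimag(bad));  /* nan +inf*I */
        printf("good: %g %+g*I\n", creal(good), cimag(good)); /* 0   +inf*I */
        return 0;
    }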
diff --git a/libm/upstream-freebsd/lib/msun/src/s_ccosh.c b/libm/upstream-freebsd/lib/msun/src/s_ccosh.c
index 9ea962b..e544e91 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_ccosh.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_ccosh.c
@@ -32,10 +32,12 @@
*
* Exceptional values are noted in the comments within the source code.
* These values and the return value were taken from n1124.pdf.
+ * The sign of the result for some exceptional values is unspecified but
+ * must satisfy both cosh(conj(z)) == conj(cosh(z)) and cosh(-z) == cosh(z).
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_ccosh.c 284423 2015-06-15 20:11:06Z tijl $");
#include <complex.h>
#include <math.h>
@@ -62,49 +64,48 @@ ccosh(double complex z)
/* Handle the nearly-non-exceptional cases where x and y are finite. */
if (ix < 0x7ff00000 && iy < 0x7ff00000) {
if ((iy | ly) == 0)
- return (cpack(cosh(x), x * y));
- if (ix < 0x40360000) /* small x: normal case */
- return (cpack(cosh(x) * cos(y), sinh(x) * sin(y)));
+ return (CMPLX(cosh(x), x * y));
+ if (ix < 0x40360000) /* |x| < 22: normal case */
+ return (CMPLX(cosh(x) * cos(y), sinh(x) * sin(y)));
/* |x| >= 22, so cosh(x) ~= exp(|x|) */
if (ix < 0x40862e42) {
/* x < 710: exp(|x|) won't overflow */
h = exp(fabs(x)) * 0.5;
- return (cpack(h * cos(y), copysign(h, x) * sin(y)));
+ return (CMPLX(h * cos(y), copysign(h, x) * sin(y)));
} else if (ix < 0x4096bbaa) {
/* x < 1455: scale to avoid overflow */
- z = __ldexp_cexp(cpack(fabs(x), y), -1);
- return (cpack(creal(z), cimag(z) * copysign(1, x)));
+ z = __ldexp_cexp(CMPLX(fabs(x), y), -1);
+ return (CMPLX(creal(z), cimag(z) * copysign(1, x)));
} else {
/* x >= 1455: the result always overflows */
h = huge * x;
- return (cpack(h * h * cos(y), h * sin(y)));
+ return (CMPLX(h * h * cos(y), h * sin(y)));
}
}
/*
- * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
- * The sign of 0 in the result is unspecified. Choice = normally
- * the same as dNaN. Raise the invalid floating-point exception.
+ * cosh(+-0 +- I Inf) = dNaN + I (+-)(+-)0.
+ * The sign of 0 in the result is unspecified. Choice = product
+ * of the signs of the argument. Raise the invalid floating-point
+ * exception.
*
- * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
- * The sign of 0 in the result is unspecified. Choice = normally
- * the same as d(NaN).
+ * cosh(+-0 +- I NaN) = d(NaN) + I (+-)(+-)0.
+ * The sign of 0 in the result is unspecified. Choice = product
+ * of the signs of the argument.
*/
- if ((ix | lx) == 0 && iy >= 0x7ff00000)
- return (cpack(y - y, copysign(0, x * (y - y))));
+ if ((ix | lx) == 0) /* && iy >= 0x7ff00000 */
+ return (CMPLX(y - y, x * copysign(0, y)));
/*
* cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
*
- * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0.
- * The sign of 0 in the result is unspecified.
+ * cosh(NaN +- I 0) = d(NaN) + I (+-)(+-)0.
+ * The sign of 0 in the result is unspecified. Choice = product
+ * of the signs of the argument.
*/
- if ((iy | ly) == 0 && ix >= 0x7ff00000) {
- if (((hx & 0xfffff) | lx) == 0)
- return (cpack(x * x, copysign(0, x) * y));
- return (cpack(x * x, copysign(0, (x + x) * y)));
- }
+ if ((iy | ly) == 0) /* && ix >= 0x7ff00000 */
+ return (CMPLX(x * x, copysign(0, x) * y));
/*
* cosh(x +- I Inf) = dNaN + I dNaN.
@@ -114,8 +115,8 @@ ccosh(double complex z)
* Optionally raises the invalid floating-point exception for finite
* nonzero x. Choice = don't raise (except for signaling NaNs).
*/
- if (ix < 0x7ff00000 && iy >= 0x7ff00000)
- return (cpack(y - y, x * (y - y)));
+ if (ix < 0x7ff00000) /* && iy >= 0x7ff00000 */
+ return (CMPLX(y - y, x * (y - y)));
/*
* cosh(+-Inf + I NaN) = +Inf + I d(NaN).
@@ -126,10 +127,10 @@ ccosh(double complex z)
*
* cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y)
*/
- if (ix >= 0x7ff00000 && ((hx & 0xfffff) | lx) == 0) {
+ if (ix == 0x7ff00000 && lx == 0) {
if (iy >= 0x7ff00000)
- return (cpack(x * x, x * (y - y)));
- return (cpack((x * x) * cos(y), x * sin(y)));
+ return (CMPLX(INFINITY, x * (y - y)));
+ return (CMPLX(INFINITY * cos(y), x * sin(y)));
}
/*
@@ -143,7 +144,7 @@ ccosh(double complex z)
* Optionally raises the invalid floating-point exception for finite
* nonzero y. Choice = don't raise (except for signaling NaNs).
*/
- return (cpack((x * x) * (y - y), (x + x) * (y - y)));
+ return (CMPLX((x * x) * (y - y), (x + x) * (y - y)));
}
double complex
@@ -151,5 +152,5 @@ ccos(double complex z)
{
/* ccos(z) = ccosh(I * z) */
- return (ccosh(cpack(-cimag(z), creal(z))));
+ return (ccosh(CMPLX(-cimag(z), creal(z))));
}
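
A detail worth noting in the rewritten zero cases above: the new imaginary part x * copysign(0, y) is a signed zero whose sign bit is the XOR of the signs of x and y, which is exactly the "product of the signs of the argument" the updated comments promise. A quick check (assumed driver, not part of the patch):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const double xs[] = { 0.0, -0.0 };
        const double ys[] = { INFINITY, -INFINITY };
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 2; j++) {
                /* Imaginary part of cosh(+-0 + I*(+-Inf)) per the hunk above. */
                double im = xs[i] * copysign(0, ys[j]);
                printf("x=%+.1f y=%+f -> im=%+.1f\n", xs[i], ys[j], im);
            }
        return 0;
    }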
diff --git a/libm/upstream-freebsd/lib/msun/src/s_ccoshf.c b/libm/upstream-freebsd/lib/msun/src/s_ccoshf.c
index 1de9ad4..e33840a 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_ccoshf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_ccoshf.c
@@ -25,11 +25,11 @@
*/
/*
- * Hyperbolic cosine of a complex argument. See s_ccosh.c for details.
+ * Float version of ccosh(). See s_ccosh.c for details.
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_ccoshf.c 284423 2015-06-15 20:11:06Z tijl $");
#include <complex.h>
#include <math.h>
@@ -55,50 +55,47 @@ ccoshf(float complex z)
if (ix < 0x7f800000 && iy < 0x7f800000) {
if (iy == 0)
- return (cpackf(coshf(x), x * y));
- if (ix < 0x41100000) /* small x: normal case */
- return (cpackf(coshf(x) * cosf(y), sinhf(x) * sinf(y)));
+ return (CMPLXF(coshf(x), x * y));
+ if (ix < 0x41100000) /* |x| < 9: normal case */
+ return (CMPLXF(coshf(x) * cosf(y), sinhf(x) * sinf(y)));
/* |x| >= 9, so cosh(x) ~= exp(|x|) */
if (ix < 0x42b17218) {
/* x < 88.7: expf(|x|) won't overflow */
- h = expf(fabsf(x)) * 0.5f;
- return (cpackf(h * cosf(y), copysignf(h, x) * sinf(y)));
+ h = expf(fabsf(x)) * 0.5F;
+ return (CMPLXF(h * cosf(y), copysignf(h, x) * sinf(y)));
} else if (ix < 0x4340b1e7) {
/* x < 192.7: scale to avoid overflow */
- z = __ldexp_cexpf(cpackf(fabsf(x), y), -1);
- return (cpackf(crealf(z), cimagf(z) * copysignf(1, x)));
+ z = __ldexp_cexpf(CMPLXF(fabsf(x), y), -1);
+ return (CMPLXF(crealf(z), cimagf(z) * copysignf(1, x)));
} else {
/* x >= 192.7: the result always overflows */
h = huge * x;
- return (cpackf(h * h * cosf(y), h * sinf(y)));
+ return (CMPLXF(h * h * cosf(y), h * sinf(y)));
}
}
- if (ix == 0 && iy >= 0x7f800000)
- return (cpackf(y - y, copysignf(0, x * (y - y))));
+ if (ix == 0) /* && iy >= 0x7f800000 */
+ return (CMPLXF(y - y, x * copysignf(0, y)));
- if (iy == 0 && ix >= 0x7f800000) {
- if ((hx & 0x7fffff) == 0)
- return (cpackf(x * x, copysignf(0, x) * y));
- return (cpackf(x * x, copysignf(0, (x + x) * y)));
- }
+ if (iy == 0) /* && ix >= 0x7f800000 */
+ return (CMPLXF(x * x, copysignf(0, x) * y));
- if (ix < 0x7f800000 && iy >= 0x7f800000)
- return (cpackf(y - y, x * (y - y)));
+ if (ix < 0x7f800000) /* && iy >= 0x7f800000 */
+ return (CMPLXF(y - y, x * (y - y)));
- if (ix >= 0x7f800000 && (hx & 0x7fffff) == 0) {
+ if (ix == 0x7f800000) {
if (iy >= 0x7f800000)
- return (cpackf(x * x, x * (y - y)));
- return (cpackf((x * x) * cosf(y), x * sinf(y)));
+ return (CMPLXF(INFINITY, x * (y - y)));
+ return (CMPLXF(INFINITY * cosf(y), x * sinf(y)));
}
- return (cpackf((x * x) * (y - y), (x + x) * (y - y)));
+ return (CMPLXF((x * x) * (y - y), (x + x) * (y - y)));
}
float complex
ccosf(float complex z)
{
- return (ccoshf(cpackf(-cimagf(z), crealf(z))));
+ return (ccoshf(CMPLXF(-cimagf(z), crealf(z))));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_cexp.c b/libm/upstream-freebsd/lib/msun/src/s_cexp.c
index abe178f..660a68d 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_cexp.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_cexp.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_cexp.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <math.h>
@@ -50,22 +50,22 @@ cexp(double complex z)
/* cexp(x + I 0) = exp(x) + I 0 */
if ((hy | ly) == 0)
- return (cpack(exp(x), y));
+ return (CMPLX(exp(x), y));
EXTRACT_WORDS(hx, lx, x);
/* cexp(0 + I y) = cos(y) + I sin(y) */
if (((hx & 0x7fffffff) | lx) == 0)
- return (cpack(cos(y), sin(y)));
+ return (CMPLX(cos(y), sin(y)));
if (hy >= 0x7ff00000) {
if (lx != 0 || (hx & 0x7fffffff) != 0x7ff00000) {
/* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
- return (cpack(y - y, y - y));
+ return (CMPLX(y - y, y - y));
} else if (hx & 0x80000000) {
/* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
- return (cpack(0.0, 0.0));
+ return (CMPLX(0.0, 0.0));
} else {
/* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
- return (cpack(x, y - y));
+ return (CMPLX(x, y - y));
}
}
@@ -84,6 +84,6 @@ cexp(double complex z)
* - x = NaN (spurious inexact exception from y)
*/
exp_x = exp(x);
- return (cpack(exp_x * cos(y), exp_x * sin(y)));
+ return (CMPLX(exp_x * cos(y), exp_x * sin(y)));
}
}
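
Once y is non-finite, the cexp cases above collapse into three buckets: NaN + I*NaN for finite or NaN x, 0 + I*0 for x = -Inf (the exponential underflows everything), and Inf + I*NaN for x = +Inf. A short demonstration (assumed harness):

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    static void show(const char *label, double complex z) {
        double complex r = cexp(z);
        printf("%-14s -> %g %+g*I\n", label, creal(r), cimag(r));
    }

    int main(void) {
        show("1 + I*Inf",    CMPLX(1.0, INFINITY));       /* nan +nan*I */
        show("-Inf + I*Inf", CMPLX(-INFINITY, INFINITY)); /* 0 +0*I     */
        show("+Inf + I*NaN", CMPLX(INFINITY, NAN));       /* inf +nan*I */
        return 0;
    }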
diff --git a/libm/upstream-freebsd/lib/msun/src/s_cexpf.c b/libm/upstream-freebsd/lib/msun/src/s_cexpf.c
index 0e30d08..709ad47 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_cexpf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_cexpf.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_cexpf.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <math.h>
@@ -50,22 +50,22 @@ cexpf(float complex z)
/* cexp(x + I 0) = exp(x) + I 0 */
if (hy == 0)
- return (cpackf(expf(x), y));
+ return (CMPLXF(expf(x), y));
GET_FLOAT_WORD(hx, x);
/* cexp(0 + I y) = cos(y) + I sin(y) */
if ((hx & 0x7fffffff) == 0)
- return (cpackf(cosf(y), sinf(y)));
+ return (CMPLXF(cosf(y), sinf(y)));
if (hy >= 0x7f800000) {
if ((hx & 0x7fffffff) != 0x7f800000) {
/* cexp(finite|NaN +- I Inf|NaN) = NaN + I NaN */
- return (cpackf(y - y, y - y));
+ return (CMPLXF(y - y, y - y));
} else if (hx & 0x80000000) {
/* cexp(-Inf +- I Inf|NaN) = 0 + I 0 */
- return (cpackf(0.0, 0.0));
+ return (CMPLXF(0.0, 0.0));
} else {
/* cexp(+Inf +- I Inf|NaN) = Inf + I NaN */
- return (cpackf(x, y - y));
+ return (CMPLXF(x, y - y));
}
}
@@ -84,6 +84,6 @@ cexpf(float complex z)
* - x = NaN (spurious inexact exception from y)
*/
exp_x = expf(x);
- return (cpackf(exp_x * cosf(y), exp_x * sinf(y)));
+ return (CMPLXF(exp_x * cosf(y), exp_x * sinf(y)));
}
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_conj.c b/libm/upstream-freebsd/lib/msun/src/s_conj.c
index 5770c29..61fac63 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_conj.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_conj.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD$
+ * $FreeBSD: head/lib/msun/src/s_conj.c 275819 2014-12-16 09:21:56Z ed $
*/
#include <complex.h>
@@ -34,5 +34,5 @@ double complex
conj(double complex z)
{
- return (cpack(creal(z), -cimag(z)));
+ return (CMPLX(creal(z), -cimag(z)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_conjf.c b/libm/upstream-freebsd/lib/msun/src/s_conjf.c
index b090760..83c9ef0 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_conjf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_conjf.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD$
+ * $FreeBSD: head/lib/msun/src/s_conjf.c 275819 2014-12-16 09:21:56Z ed $
*/
#include <complex.h>
@@ -34,5 +34,5 @@ float complex
conjf(float complex z)
{
- return (cpackf(crealf(z), -cimagf(z)));
+ return (CMPLXF(crealf(z), -cimagf(z)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_conjl.c b/libm/upstream-freebsd/lib/msun/src/s_conjl.c
index 0e431ef..d9e6a16 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_conjl.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_conjl.c
@@ -23,7 +23,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD$
+ * $FreeBSD: head/lib/msun/src/s_conjl.c 275819 2014-12-16 09:21:56Z ed $
*/
#include <complex.h>
@@ -34,5 +34,5 @@ long double complex
conjl(long double complex z)
{
- return (cpackl(creall(z), -cimagl(z)));
+ return (CMPLXL(creall(z), -cimagl(z)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_cproj.c b/libm/upstream-freebsd/lib/msun/src/s_cproj.c
index 8e9404c..ec2266e 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_cproj.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_cproj.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_cproj.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <math.h>
@@ -39,7 +39,7 @@ cproj(double complex z)
if (!isinf(creal(z)) && !isinf(cimag(z)))
return (z);
else
- return (cpack(INFINITY, copysign(0.0, cimag(z))));
+ return (CMPLX(INFINITY, copysign(0.0, cimag(z))));
}
#if LDBL_MANT_DIG == 53
diff --git a/libm/upstream-freebsd/lib/msun/src/s_cprojf.c b/libm/upstream-freebsd/lib/msun/src/s_cprojf.c
index 68ea77b..63af75f 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_cprojf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_cprojf.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_cprojf.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <math.h>
@@ -39,5 +39,5 @@ cprojf(float complex z)
if (!isinf(crealf(z)) && !isinf(cimagf(z)))
return (z);
else
- return (cpackf(INFINITY, copysignf(0.0, cimagf(z))));
+ return (CMPLXF(INFINITY, copysignf(0.0, cimagf(z))));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_cprojl.c b/libm/upstream-freebsd/lib/msun/src/s_cprojl.c
index 07385bc..8386f81 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_cprojl.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_cprojl.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_cprojl.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <math.h>
@@ -39,5 +39,5 @@ cprojl(long double complex z)
if (!isinf(creall(z)) && !isinf(cimagl(z)))
return (z);
else
- return (cpackl(INFINITY, copysignl(0.0, cimagl(z))));
+ return (CMPLXL(INFINITY, copysignl(0.0, cimagl(z))));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_csinh.c b/libm/upstream-freebsd/lib/msun/src/s_csinh.c
index c192f30..cff1402 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_csinh.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_csinh.c
@@ -32,10 +32,12 @@
*
* Exceptional values are noted in the comments within the source code.
* These values and the return value were taken from n1124.pdf.
+ * The sign of the result for some exceptional values is unspecified but
+ * must satisfy both sinh(conj(z)) == conj(sinh(z)) and sinh(-z) == -sinh(z).
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_csinh.c 284426 2015-06-15 20:16:53Z tijl $");
#include <complex.h>
#include <math.h>
@@ -62,48 +64,45 @@ csinh(double complex z)
/* Handle the nearly-non-exceptional cases where x and y are finite. */
if (ix < 0x7ff00000 && iy < 0x7ff00000) {
if ((iy | ly) == 0)
- return (cpack(sinh(x), y));
- if (ix < 0x40360000) /* small x: normal case */
- return (cpack(sinh(x) * cos(y), cosh(x) * sin(y)));
+ return (CMPLX(sinh(x), y));
+ if (ix < 0x40360000) /* |x| < 22: normal case */
+ return (CMPLX(sinh(x) * cos(y), cosh(x) * sin(y)));
/* |x| >= 22, so cosh(x) ~= exp(|x|) */
if (ix < 0x40862e42) {
/* x < 710: exp(|x|) won't overflow */
h = exp(fabs(x)) * 0.5;
- return (cpack(copysign(h, x) * cos(y), h * sin(y)));
+ return (CMPLX(copysign(h, x) * cos(y), h * sin(y)));
} else if (ix < 0x4096bbaa) {
/* x < 1455: scale to avoid overflow */
- z = __ldexp_cexp(cpack(fabs(x), y), -1);
- return (cpack(creal(z) * copysign(1, x), cimag(z)));
+ z = __ldexp_cexp(CMPLX(fabs(x), y), -1);
+ return (CMPLX(creal(z) * copysign(1, x), cimag(z)));
} else {
/* x >= 1455: the result always overflows */
h = huge * x;
- return (cpack(h * cos(y), h * h * sin(y)));
+ return (CMPLX(h * cos(y), h * h * sin(y)));
}
}
/*
- * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
- * The sign of 0 in the result is unspecified. Choice = normally
- * the same as dNaN. Raise the invalid floating-point exception.
+ * sinh(+-0 +- I Inf) = +-0 + I dNaN.
+ * The sign of 0 in the result is unspecified. Choice = same sign
+ * as the argument. Raise the invalid floating-point exception.
*
- * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
- * The sign of 0 in the result is unspecified. Choice = normally
- * the same as d(NaN).
+ * sinh(+-0 +- I NaN) = +-0 + I d(NaN).
+ * The sign of 0 in the result is unspecified. Choice = same sign
+ * as the argument.
*/
- if ((ix | lx) == 0 && iy >= 0x7ff00000)
- return (cpack(copysign(0, x * (y - y)), y - y));
+ if ((ix | lx) == 0) /* && iy >= 0x7ff00000 */
+ return (CMPLX(x, y - y));
/*
* sinh(+-Inf +- I 0) = +-Inf + I +-0.
*
* sinh(NaN +- I 0) = d(NaN) + I +-0.
*/
- if ((iy | ly) == 0 && ix >= 0x7ff00000) {
- if (((hx & 0xfffff) | lx) == 0)
- return (cpack(x, y));
- return (cpack(x, copysign(0, y)));
- }
+ if ((iy | ly) == 0) /* && ix >= 0x7ff00000 */
+ return (CMPLX(x + x, y));
/*
* sinh(x +- I Inf) = dNaN + I dNaN.
@@ -113,45 +112,45 @@ csinh(double complex z)
* Optionally raises the invalid floating-point exception for finite
* nonzero x. Choice = don't raise (except for signaling NaNs).
*/
- if (ix < 0x7ff00000 && iy >= 0x7ff00000)
- return (cpack(y - y, x * (y - y)));
+ if (ix < 0x7ff00000) /* && iy >= 0x7ff00000 */
+ return (CMPLX(y - y, y - y));
/*
* sinh(+-Inf + I NaN) = +-Inf + I d(NaN).
- * The sign of Inf in the result is unspecified. Choice = normally
- * the same as d(NaN).
+ * The sign of Inf in the result is unspecified. Choice = same sign
+ * as the argument.
*
- * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
- * The sign of Inf in the result is unspecified. Choice = always +.
- * Raise the invalid floating-point exception.
+ * sinh(+-Inf +- I Inf) = +-Inf + I dNaN.
+ * The sign of Inf in the result is unspecified. Choice = same sign
+ * as the argument. Raise the invalid floating-point exception.
*
* sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y)
*/
- if (ix >= 0x7ff00000 && ((hx & 0xfffff) | lx) == 0) {
+ if (ix == 0x7ff00000 && lx == 0) {
if (iy >= 0x7ff00000)
- return (cpack(x * x, x * (y - y)));
- return (cpack(x * cos(y), INFINITY * sin(y)));
+ return (CMPLX(x, y - y));
+ return (CMPLX(x * cos(y), INFINITY * sin(y)));
}
/*
- * sinh(NaN + I NaN) = d(NaN) + I d(NaN).
+ * sinh(NaN1 + I NaN2) = d(NaN1, NaN2) + I d(NaN1, NaN2).
*
- * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+ * sinh(NaN +- I Inf) = d(NaN, dNaN) + I d(NaN, dNaN).
* Optionally raises the invalid floating-point exception.
* Choice = raise.
*
- * sinh(NaN + I y) = d(NaN) + I d(NaN).
+ * sinh(NaN + I y) = d(NaN) + I d(NaN).
* Optionally raises the invalid floating-point exception for finite
* nonzero y. Choice = don't raise (except for signaling NaNs).
*/
- return (cpack((x * x) * (y - y), (x + x) * (y - y)));
+ return (CMPLX((x + x) * (y - y), (x * x) * (y - y)));
}
double complex
csin(double complex z)
{
- /* csin(z) = -I * csinh(I * z) */
- z = csinh(cpack(-cimag(z), creal(z)));
- return (cpack(cimag(z), -creal(z)));
+ /* csin(z) = -I * csinh(I * z) = I * conj(csinh(I * conj(z))). */
+ z = csinh(CMPLX(cimag(z), creal(z)));
+ return (CMPLX(cimag(z), creal(z)));
}
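
The rewritten csin above relies on the identity csin(z) = -I*csinh(I*z) = I*conj(csinh(I*conj(z))); the conjugate form needs no negations, so it cannot flip the sign of a zero coming out of the special cases. A numeric sanity check that the two formulations agree on ordinary input (assumed harness):

    #include <complex.h>
    #include <stdio.h>

    int main(void) {
        double complex z = CMPLX(1.25, -0.5);

        /* Old formulation: csin(z) = -I * csinh(I * z). */
        double complex a = csinh(CMPLX(-cimag(z), creal(z)));
        a = CMPLX(cimag(a), -creal(a));

        /* New formulation: csin(z) = I * conj(csinh(I * conj(z))). */
        double complex b = csinh(CMPLX(cimag(z), creal(z)));
        b = CMPLX(cimag(b), creal(b));

        printf("old: %.17g %+.17g*I\n", creal(a), cimag(a));
        printf("new: %.17g %+.17g*I\n", creal(b), cimag(b));
        return 0;
    }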
diff --git a/libm/upstream-freebsd/lib/msun/src/s_csinhf.c b/libm/upstream-freebsd/lib/msun/src/s_csinhf.c
index c523125..f050890 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_csinhf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_csinhf.c
@@ -25,11 +25,11 @@
*/
/*
- * Hyperbolic sine of a complex argument z. See s_csinh.c for details.
+ * Float version of csinh(). See s_csinh.c for details.
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_csinhf.c 284426 2015-06-15 20:16:53Z tijl $");
#include <complex.h>
#include <math.h>
@@ -55,51 +55,48 @@ csinhf(float complex z)
if (ix < 0x7f800000 && iy < 0x7f800000) {
if (iy == 0)
- return (cpackf(sinhf(x), y));
- if (ix < 0x41100000) /* small x: normal case */
- return (cpackf(sinhf(x) * cosf(y), coshf(x) * sinf(y)));
+ return (CMPLXF(sinhf(x), y));
+ if (ix < 0x41100000) /* |x| < 9: normal case */
+ return (CMPLXF(sinhf(x) * cosf(y), coshf(x) * sinf(y)));
/* |x| >= 9, so cosh(x) ~= exp(|x|) */
if (ix < 0x42b17218) {
/* x < 88.7: expf(|x|) won't overflow */
- h = expf(fabsf(x)) * 0.5f;
- return (cpackf(copysignf(h, x) * cosf(y), h * sinf(y)));
+ h = expf(fabsf(x)) * 0.5F;
+ return (CMPLXF(copysignf(h, x) * cosf(y), h * sinf(y)));
} else if (ix < 0x4340b1e7) {
/* x < 192.7: scale to avoid overflow */
- z = __ldexp_cexpf(cpackf(fabsf(x), y), -1);
- return (cpackf(crealf(z) * copysignf(1, x), cimagf(z)));
+ z = __ldexp_cexpf(CMPLXF(fabsf(x), y), -1);
+ return (CMPLXF(crealf(z) * copysignf(1, x), cimagf(z)));
} else {
/* x >= 192.7: the result always overflows */
h = huge * x;
- return (cpackf(h * cosf(y), h * h * sinf(y)));
+ return (CMPLXF(h * cosf(y), h * h * sinf(y)));
}
}
- if (ix == 0 && iy >= 0x7f800000)
- return (cpackf(copysignf(0, x * (y - y)), y - y));
+ if (ix == 0) /* && iy >= 0x7f800000 */
+ return (CMPLXF(x, y - y));
- if (iy == 0 && ix >= 0x7f800000) {
- if ((hx & 0x7fffff) == 0)
- return (cpackf(x, y));
- return (cpackf(x, copysignf(0, y)));
- }
+ if (iy == 0) /* && ix >= 0x7f800000 */
+ return (CMPLXF(x + x, y));
- if (ix < 0x7f800000 && iy >= 0x7f800000)
- return (cpackf(y - y, x * (y - y)));
+ if (ix < 0x7f800000) /* && iy >= 0x7f800000 */
+ return (CMPLXF(y - y, y - y));
- if (ix >= 0x7f800000 && (hx & 0x7fffff) == 0) {
+ if (ix == 0x7f800000) {
if (iy >= 0x7f800000)
- return (cpackf(x * x, x * (y - y)));
- return (cpackf(x * cosf(y), INFINITY * sinf(y)));
+ return (CMPLXF(x, y - y));
+ return (CMPLXF(x * cosf(y), INFINITY * sinf(y)));
}
- return (cpackf((x * x) * (y - y), (x + x) * (y - y)));
+ return (CMPLXF((x + x) * (y - y), (x * x) * (y - y)));
}
float complex
csinf(float complex z)
{
- z = csinhf(cpackf(-cimagf(z), crealf(z)));
- return (cpackf(cimagf(z), -crealf(z)));
+ z = csinhf(CMPLXF(cimagf(z), crealf(z)));
+ return (CMPLXF(cimagf(z), crealf(z)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_csqrt.c b/libm/upstream-freebsd/lib/msun/src/s_csqrt.c
index 18a7ae3..c908a2d 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_csqrt.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_csqrt.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_csqrt.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <float.h>
@@ -58,12 +58,12 @@ csqrt(double complex z)
/* Handle special cases. */
if (z == 0)
- return (cpack(0, b));
+ return (CMPLX(0, b));
if (isinf(b))
- return (cpack(INFINITY, b));
+ return (CMPLX(INFINITY, b));
if (isnan(a)) {
t = (b - b) / (b - b); /* raise invalid if b is not a NaN */
- return (cpack(a, t)); /* return NaN + NaN i */
+ return (CMPLX(a, t)); /* return NaN + NaN i */
}
if (isinf(a)) {
/*
@@ -73,9 +73,9 @@ csqrt(double complex z)
* csqrt(-inf + y i) = 0 + inf i
*/
if (signbit(a))
- return (cpack(fabs(b - b), copysign(a, b)));
+ return (CMPLX(fabs(b - b), copysign(a, b)));
else
- return (cpack(a, copysign(b - b, b)));
+ return (CMPLX(a, copysign(b - b, b)));
}
/*
* The remaining special case (b is NaN) is handled just fine by
@@ -94,10 +94,10 @@ csqrt(double complex z)
/* Algorithm 312, CACM vol 10, Oct 1967. */
if (a >= 0) {
t = sqrt((a + hypot(a, b)) * 0.5);
- result = cpack(t, b / (2 * t));
+ result = CMPLX(t, b / (2 * t));
} else {
t = sqrt((-a + hypot(a, b)) * 0.5);
- result = cpack(fabs(b) / (2 * t), copysign(t, b));
+ result = CMPLX(fabs(b) / (2 * t), copysign(t, b));
}
/* Rescale. */
diff --git a/libm/upstream-freebsd/lib/msun/src/s_csqrtf.c b/libm/upstream-freebsd/lib/msun/src/s_csqrtf.c
index da7fe18..12a894f 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_csqrtf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_csqrtf.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_csqrtf.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <math.h>
@@ -49,12 +49,12 @@ csqrtf(float complex z)
/* Handle special cases. */
if (z == 0)
- return (cpackf(0, b));
+ return (CMPLXF(0, b));
if (isinf(b))
- return (cpackf(INFINITY, b));
+ return (CMPLXF(INFINITY, b));
if (isnan(a)) {
t = (b - b) / (b - b); /* raise invalid if b is not a NaN */
- return (cpackf(a, t)); /* return NaN + NaN i */
+ return (CMPLXF(a, t)); /* return NaN + NaN i */
}
if (isinf(a)) {
/*
@@ -64,9 +64,9 @@ csqrtf(float complex z)
* csqrtf(-inf + y i) = 0 + inf i
*/
if (signbit(a))
- return (cpackf(fabsf(b - b), copysignf(a, b)));
+ return (CMPLXF(fabsf(b - b), copysignf(a, b)));
else
- return (cpackf(a, copysignf(b - b, b)));
+ return (CMPLXF(a, copysignf(b - b, b)));
}
/*
* The remaining special case (b is NaN) is handled just fine by
@@ -80,9 +80,9 @@ csqrtf(float complex z)
*/
if (a >= 0) {
t = sqrt((a + hypot(a, b)) * 0.5);
- return (cpackf(t, b / (2.0 * t)));
+ return (CMPLXF(t, b / (2.0 * t)));
} else {
t = sqrt((-a + hypot(a, b)) * 0.5);
- return (cpackf(fabsf(b) / (2.0 * t), copysignf(t, b)));
+ return (CMPLXF(fabsf(b) / (2.0 * t), copysignf(t, b)));
}
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_csqrtl.c b/libm/upstream-freebsd/lib/msun/src/s_csqrtl.c
index dd18e1e..7bcff59 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_csqrtl.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_csqrtl.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_csqrtl.c 275819 2014-12-16 09:21:56Z ed $");
#include <complex.h>
#include <float.h>
@@ -58,12 +58,12 @@ csqrtl(long double complex z)
/* Handle special cases. */
if (z == 0)
- return (cpackl(0, b));
+ return (CMPLXL(0, b));
if (isinf(b))
- return (cpackl(INFINITY, b));
+ return (CMPLXL(INFINITY, b));
if (isnan(a)) {
t = (b - b) / (b - b); /* raise invalid if b is not a NaN */
- return (cpackl(a, t)); /* return NaN + NaN i */
+ return (CMPLXL(a, t)); /* return NaN + NaN i */
}
if (isinf(a)) {
/*
@@ -73,9 +73,9 @@ csqrtl(long double complex z)
* csqrt(-inf + y i) = 0 + inf i
*/
if (signbit(a))
- return (cpackl(fabsl(b - b), copysignl(a, b)));
+ return (CMPLXL(fabsl(b - b), copysignl(a, b)));
else
- return (cpackl(a, copysignl(b - b, b)));
+ return (CMPLXL(a, copysignl(b - b, b)));
}
/*
* The remaining special case (b is NaN) is handled just fine by
@@ -94,10 +94,10 @@ csqrtl(long double complex z)
/* Algorithm 312, CACM vol 10, Oct 1967. */
if (a >= 0) {
t = sqrtl((a + hypotl(a, b)) * 0.5);
- result = cpackl(t, b / (2 * t));
+ result = CMPLXL(t, b / (2 * t));
} else {
t = sqrtl((-a + hypotl(a, b)) * 0.5);
- result = cpackl(fabsl(b) / (2 * t), copysignl(t, b));
+ result = CMPLXL(fabsl(b) / (2 * t), copysignl(t, b));
}
/* Rescale. */
diff --git a/libm/upstream-freebsd/lib/msun/src/s_ctanh.c b/libm/upstream-freebsd/lib/msun/src/s_ctanh.c
index d427e28..e5973c3 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_ctanh.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_ctanh.c
@@ -25,7 +25,7 @@
*/
/*
- * Hyperbolic tangent of a complex argument z = x + i y.
+ * Hyperbolic tangent of a complex argument z = x + I y.
*
* The algorithm is from:
*
@@ -44,15 +44,15 @@
*
* tanh(z) = sinh(z) / cosh(z)
*
- * sinh(x) cos(y) + i cosh(x) sin(y)
+ * sinh(x) cos(y) + I cosh(x) sin(y)
* = ---------------------------------
- * cosh(x) cos(y) + i sinh(x) sin(y)
+ * cosh(x) cos(y) + I sinh(x) sin(y)
*
- * cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ * cosh(x) sinh(x) / cos^2(y) + I tan(y)
* = -------------------------------------
* 1 + sinh^2(x) / cos^2(y)
*
- * beta rho s + i t
+ * beta rho s + I t
* = ----------------
* 1 + beta s^2
*
@@ -64,7 +64,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_ctanh.c 284427 2015-06-15 20:40:44Z tijl $");
#include <complex.h>
#include <math.h>
@@ -85,16 +85,16 @@ ctanh(double complex z)
ix = hx & 0x7fffffff;
/*
- * ctanh(NaN + i 0) = NaN + i 0
+ * ctanh(NaN +- I 0) = d(NaN) +- I 0
*
- * ctanh(NaN + i y) = NaN + i NaN for y != 0
+ * ctanh(NaN + I y) = d(NaN,y) + I d(NaN,y) for y != 0
*
* The imaginary part has the sign of x*sin(2*y), but there's no
* special effort to get this right.
*
- * ctanh(+-Inf +- i Inf) = +-1 +- 0
+ * ctanh(+-Inf +- I Inf) = +-1 +- I 0
*
- * ctanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite
+ * ctanh(+-Inf + I y) = +-1 + I 0 sin(2y) for y finite
*
* The imaginary part of the sign is unspecified. This special
* case is only needed to avoid a spurious invalid exception when
@@ -102,26 +102,27 @@ ctanh(double complex z)
*/
if (ix >= 0x7ff00000) {
if ((ix & 0xfffff) | lx) /* x is NaN */
- return (cpack(x, (y == 0 ? y : x * y)));
+ return (CMPLX((x + 0) * (y + 0),
+ y == 0 ? y : (x + 0) * (y + 0)));
SET_HIGH_WORD(x, hx - 0x40000000); /* x = copysign(1, x) */
- return (cpack(x, copysign(0, isinf(y) ? y : sin(y) * cos(y))));
+ return (CMPLX(x, copysign(0, isinf(y) ? y : sin(y) * cos(y))));
}
/*
- * ctanh(x + i NAN) = NaN + i NaN
- * ctanh(x +- i Inf) = NaN + i NaN
+ * ctanh(x + I NaN) = d(NaN) + I d(NaN)
+ * ctanh(x +- I Inf) = dNaN + I dNaN
*/
if (!isfinite(y))
- return (cpack(y - y, y - y));
+ return (CMPLX(y - y, y - y));
/*
- * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+ * ctanh(+-huge +- I y) ~= +-1 +- I 2sin(2y)/exp(2x), using the
* approximation sinh^2(huge) ~= exp(2*huge) / 4.
* We use a modified formula to avoid spurious overflow.
*/
- if (ix >= 0x40360000) { /* x >= 22 */
+ if (ix >= 0x40360000) { /* |x| >= 22 */
double exp_mx = exp(-fabs(x));
- return (cpack(copysign(1, x),
+ return (CMPLX(copysign(1, x),
4 * sin(y) * cos(y) * exp_mx * exp_mx));
}
@@ -131,14 +132,14 @@ ctanh(double complex z)
s = sinh(x);
rho = sqrt(1 + s * s); /* = cosh(x) */
denom = 1 + beta * s * s;
- return (cpack((beta * rho * s) / denom, t / denom));
+ return (CMPLX((beta * rho * s) / denom, t / denom));
}
double complex
ctan(double complex z)
{
- /* ctan(z) = -I * ctanh(I * z) */
- z = ctanh(cpack(-cimag(z), creal(z)));
- return (cpack(cimag(z), -creal(z)));
+ /* ctan(z) = -I * ctanh(I * z) = I * conj(ctanh(I * conj(z))) */
+ z = ctanh(CMPLX(cimag(z), creal(z)));
+ return (CMPLX(cimag(z), creal(z)));
}
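
The t/beta/s/rho/denom names in the final hunk come straight from the algorithm comment at the top of the file: tanh(x + I*y) = (beta*rho*s + I*t) / (1 + beta*s^2), with t = tan(y), beta = 1 + t^2 = 1/cos^2(y), s = sinh(x) and rho = cosh(x). A direct transcription for finite arguments, with every special-case branch omitted, so only a sketch:

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    /* Finite-argument core of ctanh per the comment block above. */
    static double complex ctanh_core(double x, double y) {
        double t     = tan(y);
        double beta  = 1.0 + t * t;       /* = 1/cos^2(y) */
        double s     = sinh(x);
        double rho   = sqrt(1.0 + s * s); /* = cosh(x)    */
        double denom = 1.0 + beta * s * s;
        return CMPLX((beta * rho * s) / denom, t / denom);
    }

    int main(void) {
        double complex r = ctanh_core(0.5, 0.25);
        double complex c = ctanh(CMPLX(0.5, 0.25));
        printf("sketch: %.17g %+.17g*I\n", creal(r), cimag(r));
        printf("libm:   %.17g %+.17g*I\n", creal(c), cimag(c));
        return 0;
    }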
diff --git a/libm/upstream-freebsd/lib/msun/src/s_ctanhf.c b/libm/upstream-freebsd/lib/msun/src/s_ctanhf.c
index 4be28d8..e9826c0 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_ctanhf.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_ctanhf.c
@@ -29,7 +29,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_ctanhf.c 284428 2015-06-15 20:47:26Z tijl $");
#include <complex.h>
#include <math.h>
@@ -51,18 +51,19 @@ ctanhf(float complex z)
if (ix >= 0x7f800000) {
if (ix & 0x7fffff)
- return (cpackf(x, (y == 0 ? y : x * y)));
+ return (CMPLXF((x + 0) * (y + 0),
+ y == 0 ? y : (x + 0) * (y + 0)));
SET_FLOAT_WORD(x, hx - 0x40000000);
- return (cpackf(x,
+ return (CMPLXF(x,
copysignf(0, isinf(y) ? y : sinf(y) * cosf(y))));
}
if (!isfinite(y))
- return (cpackf(y - y, y - y));
+ return (CMPLXF(y - y, y - y));
- if (ix >= 0x41300000) { /* x >= 11 */
+ if (ix >= 0x41300000) { /* |x| >= 11 */
float exp_mx = expf(-fabsf(x));
- return (cpackf(copysignf(1, x),
+ return (CMPLXF(copysignf(1, x),
4 * sinf(y) * cosf(y) * exp_mx * exp_mx));
}
@@ -71,14 +72,14 @@ ctanhf(float complex z)
s = sinhf(x);
rho = sqrtf(1 + s * s);
denom = 1 + beta * s * s;
- return (cpackf((beta * rho * s) / denom, t / denom));
+ return (CMPLXF((beta * rho * s) / denom, t / denom));
}
float complex
ctanf(float complex z)
{
- z = ctanhf(cpackf(-cimagf(z), crealf(z)));
- return (cpackf(cimagf(z), -crealf(z)));
+ z = ctanhf(CMPLXF(cimagf(z), crealf(z)));
+ return (CMPLXF(cimagf(z), crealf(z)));
}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_exp2.c b/libm/upstream-freebsd/lib/msun/src/s_exp2.c
index fde11c2..dbef729 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_exp2.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_exp2.c
@@ -25,7 +25,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_exp2.c 286515 2015-08-09 10:00:13Z dim $");
#include <float.h>
@@ -376,14 +376,14 @@ exp2(double x)
/* Compute r = exp2(y) = exp2t[i0] * p(z - eps[i]). */
t = tbl[i0]; /* exp2t[i0] */
z -= tbl[i0 + 1]; /* eps[i0] */
- if (k >= -1021 << 20)
+ if (k >= -(1021 << 20))
INSERT_WORDS(twopk, 0x3ff00000 + k, 0);
else
INSERT_WORDS(twopkp1000, 0x3ff00000 + k + (1000 << 20), 0);
r = t + t * z * (P1 + z * (P2 + z * (P3 + z * (P4 + z * P5))));
/* Scale by 2**(k>>20). */
- if(k >= -1021 << 20) {
+ if(k >= -(1021 << 20)) {
if (k == 1024 << 20)
return (r * 2.0 * 0x1p1023);
return (r * twopk);
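
The added parentheses fix real undefined behavior, not just a warning nit: unary minus binds tighter than <<, so -1021 << 20 is (-1021) << 20, a left shift of a negative value, which C leaves undefined (clang diagnoses it) even though the intended bit pattern usually comes out on two's-complement targets. -(1021 << 20) shifts the positive constant first and then negates, which is fully defined and numerically identical:

    #include <stdio.h>

    int main(void) {
        /* Defined: shift the positive constant first, then negate. */
        int k_min = -(1021 << 20);
        /* The same value computed without any shift of a negative operand. */
        printf("%d == %d\n", k_min, -1021 * (1 << 20));
        return 0;
    }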
diff --git a/libm/upstream-freebsd/lib/msun/src/s_fabs.c b/libm/upstream-freebsd/lib/msun/src/s_fabs.c
deleted file mode 100644
index 15529e5..0000000
--- a/libm/upstream-freebsd/lib/msun/src/s_fabs.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* @(#)s_fabs.c 5.1 93/09/24 */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-#ifndef lint
-static char rcsid[] = "$FreeBSD$";
-#endif
-
-/*
- * fabs(x) returns the absolute value of x.
- */
-
-#include "math.h"
-#include "math_private.h"
-
-double
-fabs(double x)
-{
- u_int32_t high;
- GET_HIGH_WORD(high,x);
- SET_HIGH_WORD(x,high&0x7fffffff);
- return x;
-}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_fabsf.c b/libm/upstream-freebsd/lib/msun/src/s_fabsf.c
deleted file mode 100644
index e9383d0..0000000
--- a/libm/upstream-freebsd/lib/msun/src/s_fabsf.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* s_fabsf.c -- float version of s_fabs.c.
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-/*
- * fabsf(x) returns the absolute value of x.
- */
-
-#include "math.h"
-#include "math_private.h"
-
-float
-fabsf(float x)
-{
- u_int32_t ix;
- GET_FLOAT_WORD(ix,x);
- SET_FLOAT_WORD(x,ix&0x7fffffff);
- return x;
-}
diff --git a/libm/upstream-freebsd/lib/msun/src/s_scalbln.c b/libm/upstream-freebsd/lib/msun/src/s_scalbln.c
index d609d4e..8e61377 100644
--- a/libm/upstream-freebsd/lib/msun/src/s_scalbln.c
+++ b/libm/upstream-freebsd/lib/msun/src/s_scalbln.c
@@ -25,52 +25,30 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/lib/msun/src/s_scalbln.c 278339 2015-02-07 00:38:18Z kargl $");
-#include <limits.h>
#include <math.h>
+#define NMAX 65536
+#define NMIN -65536
+
double
-scalbln (double x, long n)
+scalbln(double x, long n)
{
- int in;
- in = (int)n;
- if (in != n) {
- if (n > 0)
- in = INT_MAX;
- else
- in = INT_MIN;
- }
- return (scalbn(x, in));
+ return (scalbn(x, (n > NMAX) ? NMAX : (n < NMIN) ? NMIN : (int)n));
}
float
-scalblnf (float x, long n)
+scalblnf(float x, long n)
{
- int in;
- in = (int)n;
- if (in != n) {
- if (n > 0)
- in = INT_MAX;
- else
- in = INT_MIN;
- }
- return (scalbnf(x, in));
+ return (scalbnf(x, (n > NMAX) ? NMAX : (n < NMIN) ? NMIN : (int)n));
}
long double
-scalblnl (long double x, long n)
+scalblnl(long double x, long n)
{
- int in;
- in = (int)n;
- if (in != n) {
- if (n > 0)
- in = INT_MAX;
- else
- in = INT_MIN;
- }
- return (scalbnl(x, (int)n));
+ return (scalbnl(x, (n > NMAX) ? NMAX : (n < NMIN) ? NMIN : (int)n));
}
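
The rewrite above leans on the fact that scalbn already saturates: once |n| exceeds the full exponent travel of the widest supported format (roughly 16,500 steps for ld128, far below 65536), the result is pinned at 0 or +-Inf regardless, so clamping the long argument into [-65536, 65536] before the int conversion is lossless and replaces the old INT_MAX/INT_MIN dance. The same clamp spelled out (the helper name is mine):

    #include <math.h>
    #include <stdio.h>

    #define NMAX  65536
    #define NMIN -65536

    /* Clamp n into a range where scalbn's own saturation already
     * determines the answer, then convert to int without overflow. */
    static double scalbln_sketch(double x, long n) {
        return scalbn(x, (int)((n > NMAX) ? NMAX : (n < NMIN) ? NMIN : n));
    }

    int main(void) {
        printf("%g\n", scalbln_sketch(1.0,  5000000L)); /* inf: saturated high */
        printf("%g\n", scalbln_sketch(1.0, -5000000L)); /* 0: saturated low    */
        printf("%g\n", scalbln_sketch(1.5,  4L));       /* 24                  */
        return 0;
    }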
diff --git a/linker/linker.cpp b/linker/linker.cpp
index d3ac1d0..9885b69 100644
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -890,6 +890,65 @@ typedef linked_list_t<soinfo> SoinfoLinkedList;
typedef linked_list_t<const char> StringLinkedList;
typedef linked_list_t<LoadTask> LoadTaskList;
+static soinfo* find_library(const char* name, int rtld_flags, const android_dlextinfo* extinfo);
+
+// g_ld_all_shim_libs maintains the references to memory as it is used
+// in the soinfo structures and in the g_active_shim_libs list.
+
+static std::vector<std::string> g_ld_all_shim_libs;
+
+// g_active_shim_libs are all shim libs that are still eligible
+// to be loaded. We must remove a shim lib from the list before
+// we load the library to avoid recursive loops (load shim libA
+// for libB where libA also links against libB).
+
+static linked_list_t<const std::string> g_active_shim_libs;
+
+static void reset_g_active_shim_libs(void) {
+ g_active_shim_libs.clear();
+ for (const auto& pair : g_ld_all_shim_libs) {
+ g_active_shim_libs.push_back(&pair);
+ }
+}
+
+static void parse_LD_SHIM_LIBS(const char* path) {
+ parse_path(path, " :", &g_ld_all_shim_libs);
+ reset_g_active_shim_libs();
+}
+
+static bool shim_lib_matches(const char *shim_lib, const char *realpath) {
+ const char *sep = strchr(shim_lib, '|');
+ return sep != nullptr && strncmp(realpath, shim_lib, sep - shim_lib) == 0;
+}
+
+template<typename F>
+static bool shim_libs_for_each(const char *const path, F action) {
+ if (path == nullptr) return true;
+ INFO("finding shim libs for \"%s\"\n", path);
+ std::vector<const std::string *> matched;
+
+ g_active_shim_libs.for_each([&](const std::string *a_pair) {
+ const char *pair = a_pair->c_str();
+ if (shim_lib_matches(pair, path)) {
+ matched.push_back(a_pair);
+ }
+ });
+
+ g_active_shim_libs.remove_if([&](const std::string *a_pair) {
+ const char *pair = a_pair->c_str();
+ return shim_lib_matches(pair, path);
+ });
+
+ for (const auto& one_pair : matched) {
+ const char* const pair = one_pair->c_str();
+ const char* sep = strchr(pair, '|');
+ INFO("found shim lib \"%s\"\n", sep+1);
+ soinfo *child = find_library(sep+1, RTLD_GLOBAL, nullptr);
+ if (! child) return false;
+ action(child);
+ }
+ return true;
+}
// This function walks down the tree of soinfo dependencies
// in breadth-first order and
@@ -899,7 +958,7 @@ typedef linked_list_t<LoadTask> LoadTaskList;
// walk_dependencies_tree returns false if walk was terminated
// by the action and true otherwise.
template<typename F>
-static bool walk_dependencies_tree(soinfo* root_soinfos[], size_t root_soinfos_size, F action) {
+static bool walk_dependencies_tree(soinfo* root_soinfos[], size_t root_soinfos_size, bool do_shims, F action) {
SoinfoLinkedList visit_list;
SoinfoLinkedList visited;
@@ -922,6 +981,13 @@ static bool walk_dependencies_tree(soinfo* root_soinfos[], size_t root_soinfos_s
si->get_children().for_each([&](soinfo* child) {
visit_list.push_back(child);
});
+
+ if (do_shims && !shim_libs_for_each(si->get_realpath(), [&](soinfo* child) {
+ si->add_child(child);
+ visit_list.push_back(child);
+ })) {
+ return false;
+ }
}
return true;
@@ -933,7 +999,7 @@ static const ElfW(Sym)* dlsym_handle_lookup(soinfo* root, soinfo* skip_until,
const ElfW(Sym)* result = nullptr;
bool skip_lookup = skip_until != nullptr;
- walk_dependencies_tree(&root, 1, [&](soinfo* current_soinfo) {
+ walk_dependencies_tree(&root, 1, false, [&](soinfo* current_soinfo) {
if (skip_lookup) {
skip_lookup = current_soinfo != skip_until;
return true;
@@ -1513,6 +1579,7 @@ static bool find_libraries(soinfo* start_with, const char* const library_names[]
walk_dependencies_tree(
start_with == nullptr ? soinfos : &start_with,
start_with == nullptr ? soinfos_count : 1,
+ true,
[&] (soinfo* si) {
local_group.push_back(si);
return true;
@@ -1692,6 +1759,7 @@ soinfo* do_dlopen(const char* name, int flags, const android_dlextinfo* extinfo)
}
ProtectedDataGuard guard;
+ reset_g_active_shim_libs();
soinfo* si = find_library(name, flags, extinfo);
if (si != nullptr) {
si->call_constructors();
@@ -3164,9 +3232,11 @@ static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(
// doesn't cost us anything.
const char* ldpath_env = nullptr;
const char* ldpreload_env = nullptr;
+ const char* ldshim_libs_env = nullptr;
if (!getauxval(AT_SECURE)) {
ldpath_env = getenv("LD_LIBRARY_PATH");
ldpreload_env = getenv("LD_PRELOAD");
+ ldshim_libs_env = getenv("LD_SHIM_LIBS");
}
INFO("[ android linker & debugger ]");
@@ -3220,6 +3290,7 @@ static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(
// Use LD_LIBRARY_PATH and LD_PRELOAD (but only if we aren't setuid/setgid).
parse_LD_LIBRARY_PATH(ldpath_env);
parse_LD_PRELOAD(ldpreload_env);
+ parse_LD_SHIM_LIBS(ldshim_libs_env);
somain = si;
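
Summing up the linker change: LD_SHIM_LIBS is parsed like LD_PRELOAD (entries split on spaces or colons), but each entry is a "target_realpath|shim_name" pair, and shim_lib_matches() only compares the text before the '|' against the realpath of the library being loaded; the shim named after the '|' is then loaded and attached as a child. A standalone sketch of that matching rule, with hypothetical library names (the harness is assumed, not in the patch):

    #include <stdio.h>
    #include <string.h>

    /* Same test as shim_lib_matches() above: the text before '|' must
     * be a prefix of the loaded library's realpath. */
    static int shim_lib_matches(const char *shim_lib, const char *realpath) {
        const char *sep = strchr(shim_lib, '|');
        return sep != NULL &&
               strncmp(realpath, shim_lib, (size_t)(sep - shim_lib)) == 0;
    }

    int main(void) {
        const char *pair = "/system/lib/libfoo.so|libfoo_shim.so";
        printf("%d\n", shim_lib_matches(pair, "/system/lib/libfoo.so")); /* 1 */
        printf("%d\n", shim_lib_matches(pair, "/system/lib/libbar.so")); /* 0 */
        return 0;
    }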
diff --git a/tests/buffer_tests.cpp b/tests/buffer_tests.cpp
index 4967382..a2b330e 100644
--- a/tests/buffer_tests.cpp
+++ b/tests/buffer_tests.cpp
@@ -381,15 +381,19 @@ void RunSrcDstBufferOverreadTest(void (*test_func)(uint8_t*, uint8_t*, size_t))
// Make the second page unreadable and unwritable.
ASSERT_TRUE(mprotect(&memory[pagesize], pagesize, PROT_NONE) == 0);
- uint8_t* dst = new uint8_t[pagesize];
- for (size_t i = 0; i < pagesize; i++) {
- uint8_t* src = &memory[pagesize-i];
-
- test_func(src, dst, i);
+ uint8_t* dst_buffer = new uint8_t[2*pagesize];
+ // Change the dst alignment as we change the source.
+ for (size_t i = 0; i < 16; i++) {
+ uint8_t* dst = &dst_buffer[i];
+ for (size_t j = 0; j < pagesize; j++) {
+ uint8_t* src = &memory[pagesize-j];
+
+ test_func(src, dst, j);
+ }
}
ASSERT_TRUE(mprotect(&memory[pagesize], pagesize, PROT_READ | PROT_WRITE) == 0);
free(memory);
- delete dst;
+ delete[] dst_buffer;
}
void RunCmpBufferOverreadTest(
diff --git a/tests/regex_test.cpp b/tests/regex_test.cpp
index d026221..4a4409e 100644
--- a/tests/regex_test.cpp
+++ b/tests/regex_test.cpp
@@ -36,3 +36,13 @@ TEST(regex, smoke) {
regfree(&re);
}
+
+TEST(regex, match_offsets) {
+ regex_t re;
+ regmatch_t matches[1];
+ ASSERT_EQ(0, regcomp(&re, "b", 0));
+ ASSERT_EQ(0, regexec(&re, "abc", 1, matches, 0));
+ ASSERT_EQ(1, matches[0].rm_so);
+ ASSERT_EQ(2, matches[0].rm_eo);
+ regfree(&re);
+}
diff --git a/tests/string_test.cpp b/tests/string_test.cpp
index 1d63c76..3d97d81 100644
--- a/tests/string_test.cpp
+++ b/tests/string_test.cpp
@@ -1166,7 +1166,7 @@ static size_t LargeSetIncrement(size_t len) {
return 1;
}
-#define STRCAT_DST_LEN 128
+#define STRCAT_DST_LEN 64
static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
if (len >= 1) {
@@ -1181,7 +1181,7 @@ static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
int value2 = 32 + (value + 2) % 96;
memset(cmp_buf, value2, sizeof(cmp_buf));
- for (size_t i = 1; i <= STRCAT_DST_LEN; i++) {
+ for (size_t i = 1; i <= STRCAT_DST_LEN;) {
memset(dst, value2, i-1);
memset(dst+i-1, 0, len-i);
src[len-i] = '\0';
@@ -1189,6 +1189,13 @@ static void DoStrcatTest(uint8_t* src, uint8_t* dst, size_t len) {
reinterpret_cast<char*>(src))));
ASSERT_TRUE(memcmp(dst, cmp_buf, i-1) == 0);
ASSERT_TRUE(memcmp(src, dst+i-1, len-i+1) == 0);
+ // This is an expensive loop, so don't step through every value;
+ // once i reaches a certain size, start doubling it.
+ if (i < 16) {
+ i++;
+ } else {
+ i <<= 1;
+ }
}
} else {
dst[0] = '\0';
@@ -1221,7 +1228,7 @@ static void DoStrlcatTest(uint8_t* src, uint8_t* dst, size_t len) {
int value2 = 32 + (value + 2) % 96;
memset(cmp_buf, value2, sizeof(cmp_buf));
- for (size_t i = 1; i <= STRCAT_DST_LEN; i++) {
+ for (size_t i = 1; i <= STRCAT_DST_LEN;) {
memset(dst, value2, i-1);
memset(dst+i-1, 0, len-i);
src[len-i] = '\0';
@@ -1229,6 +1236,13 @@ static void DoStrlcatTest(uint8_t* src, uint8_t* dst, size_t len) {
reinterpret_cast<char*>(src), len));
ASSERT_TRUE(memcmp(dst, cmp_buf, i-1) == 0);
ASSERT_TRUE(memcmp(src, dst+i-1, len-i+1) == 0);
+ // This is an expensive loop, so don't step through every value;
+ // once i reaches a certain size, start doubling it.
+ if (i < 16) {
+ i++;
+ } else {
+ i <<= 1;
+ }
}
} else {
dst[0] = '\0';