summaryrefslogtreecommitdiffstats
path: root/libc
diff options
context:
space:
mode:
Diffstat (limited to 'libc')
-rw-r--r--libc/Android.mk7
-rw-r--r--libc/arch-arm/arm.mk3
-rw-r--r--libc/arch-arm/cortex-a15/bionic/__strcat_chk.S192
-rw-r--r--libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S212
-rw-r--r--libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S156
-rw-r--r--libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S173
-rw-r--r--libc/arch-arm/cortex-a15/bionic/memcpy.S79
-rw-r--r--libc/arch-arm/cortex-a15/bionic/memcpy_base.S42
-rw-r--r--libc/arch-arm/cortex-a15/bionic/memcpy_common.S103
-rw-r--r--libc/arch-arm/cortex-a15/bionic/strcat.S311
-rw-r--r--libc/arch-arm/cortex-a15/bionic/string_copy.S21
-rw-r--r--libc/arch-arm/cortex-a15/bionic/strlen.S60
-rw-r--r--libc/arch-arm/cortex-a15/cortex-a15.mk1
-rw-r--r--libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk22
-rw-r--r--libc/arch-arm/cortex-a53/bionic/__strcat_chk.S32
-rw-r--r--libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S32
-rw-r--r--libc/arch-arm/cortex-a53/bionic/memcpy.S32
-rw-r--r--libc/arch-arm/cortex-a53/bionic/memcpy_base.S143
-rw-r--r--libc/arch-arm/cortex-a53/cortex-a53.mk22
-rw-r--r--libc/arch-arm/cortex-a7/bionic/memset.S180
-rw-r--r--libc/arch-arm/cortex-a7/cortex-a7.mk20
-rw-r--r--libc/arch-arm/cortex-a9/bionic/memcpy_base.S3
-rw-r--r--libc/arch-arm/cortex-a9/bionic/memset.S33
-rw-r--r--libc/arch-arm/cortex-a9/bionic/strcat.S261
-rw-r--r--libc/arch-arm/cortex-a9/bionic/string_copy.S21
-rw-r--r--libc/arch-arm/cortex-a9/cortex-a9.mk1
-rw-r--r--libc/arch-arm/denver/denver.mk1
-rw-r--r--libc/arch-arm/generic/bionic/memchr.S155
-rw-r--r--libc/arch-arm/generic/bionic/memcmp.S3
-rw-r--r--libc/arch-arm/generic/bionic/memcpy.S6
-rw-r--r--libc/arch-arm/generic/bionic/memset.S6
-rw-r--r--libc/arch-arm/generic/generic.mk1
-rw-r--r--libc/arch-arm/krait/bionic/__strcat_chk.S19
-rw-r--r--libc/arch-arm/krait/bionic/__strcpy_chk.S15
-rw-r--r--libc/arch-arm/krait/bionic/memcpy.S17
-rw-r--r--libc/arch-arm/krait/bionic/memcpy_base.S314
-rw-r--r--libc/arch-arm/krait/bionic/memmove.S219
-rw-r--r--libc/arch-arm/krait/bionic/memset.S20
-rw-r--r--libc/arch-arm/krait/krait.mk16
-rw-r--r--libc/arch-arm/scorpion/scorpion.mk18
-rw-r--r--libc/arch-arm64/arm64.mk1
-rw-r--r--libc/arch-arm64/denver64/denver64.mk1
-rw-r--r--libc/arch-arm64/generic/bionic/memcpy_base.S312
-rw-r--r--libc/arch-arm64/generic/bionic/memmove.S404
-rw-r--r--libc/arch-arm64/generic/bionic/memset.S395
-rw-r--r--libc/arch-arm64/generic/bionic/strlen.S243
-rw-r--r--libc/arch-arm64/generic/bionic/strrchr.S171
-rw-r--r--libc/arch-arm64/generic/generic.mk1
-rw-r--r--libc/arch-arm64/kryo/bionic/memcpy.S65
-rw-r--r--libc/arch-arm64/kryo/bionic/memcpy_base.S244
-rw-r--r--libc/arch-arm64/kryo/kryo.mk15
-rw-r--r--libc/arch-x86_64/string/sse2-memmove-slm.S3
-rw-r--r--libc/bionic/malloc_debug_check.cpp147
-rw-r--r--libc/bionic/malloc_debug_common.cpp6
-rw-r--r--libc/bionic/mmap.cpp7
-rw-r--r--libc/include/paths.h1
-rw-r--r--libc/include/regex.h5
-rw-r--r--libc/kernel/uapi/linux/android_alarm.h13
-rw-r--r--libc/kernel/uapi/linux/time.h3
-rw-r--r--libc/upstream-netbsd/lib/libc/gen/popen.c2
60 files changed, 3330 insertions, 1681 deletions
diff --git a/libc/Android.mk b/libc/Android.mk
index f0c5e9f..f7f2adc 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -611,6 +611,10 @@ ifneq ($(BOARD_MALLOC_ALIGNMENT),)
libc_common_cflags += -DMALLOC_ALIGNMENT=$(BOARD_MALLOC_ALIGNMENT)
endif
+ifeq ($(BOARD_USES_LEGACY_MMAP),true)
+ libc_common_cflags += -DLEGACY_MMAP
+endif
+
# Define some common conlyflags
libc_common_conlyflags := \
-std=gnu99
@@ -1394,6 +1398,9 @@ LOCAL_SRC_FILES_arm += \
LOCAL_ADDRESS_SANITIZER := false
LOCAL_NATIVE_COVERAGE := $(bionic_coverage)
+# Allow devices to provide additional symbols
+LOCAL_WHOLE_STATIC_LIBRARIES += $(BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS)
+
include $(BUILD_SHARED_LIBRARY)
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index d72a160..c2b80c5 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -20,7 +20,6 @@ libc_freebsd_src_files_arm += \
upstream-freebsd/lib/libc/string/wmemmove.c \
libc_openbsd_src_files_arm += \
- upstream-openbsd/lib/libc/string/memchr.c \
upstream-openbsd/lib/libc/string/memrchr.c \
upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strlcat.c \
@@ -52,7 +51,7 @@ ifeq ($(strip $(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)),)
endif
cpu_variant_mk := $(LOCAL_PATH)/arch-arm/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT).mk
ifeq ($(wildcard $(cpu_variant_mk)),)
-$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
+$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, scorpion, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
endif
include $(cpu_variant_mk)
libc_common_additional_dependencies += $(cpu_variant_mk)
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
index a2e9c22..3692f04 100644
--- a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,191 +26,7 @@
* SUCH DAMAGE.
*/
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
- .syntax unified
-
- .thumb
- .thumb_func
-
-// Get the length of src string, then get the source of the dst string.
-// Check that the two lengths together don't exceed the threshold, then
-// do a memcpy of the data.
-ENTRY(__strcat_chk)
- pld [r0, #0]
- push {r0, lr}
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
- push {r4, r5}
- .cfi_adjust_cfa_offset 8
- .cfi_rel_offset r4, 0
- .cfi_rel_offset r5, 4
-
- mov lr, r2
-
- // Save the dst register to r5
- mov r5, r0
-
- // Zero out r4
- eor r4, r4, r4
-
- // r1 contains the address of the string to count.
-.L_strlen_start:
- mov r0, r1
- ands r3, r1, #7
- beq .L_mainloop
-
- // Align to a double word (64 bits).
- rsb r3, r3, #8
- lsls ip, r3, #31
- beq .L_align_to_32
-
- ldrb r2, [r1], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_32:
- bcc .L_align_to_64
- ands ip, r3, #2
- beq .L_align_to_64
-
- ldrb r2, [r1], #1
- cbz r2, .L_update_count_and_finish
- ldrb r2, [r1], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_64:
- tst r3, #4
- beq .L_mainloop
- ldr r3, [r1], #4
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
-
- .p2align 2
-.L_mainloop:
- ldrd r2, r3, [r1], #8
-
- pld [r1, #64]
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .L_zero_in_first_register
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
- b .L_mainloop
-
-.L_update_count_and_finish:
- sub r3, r1, r0
- sub r3, r3, #1
- b .L_finish
-
-.L_zero_in_first_register:
- sub r3, r1, r0
- lsls r2, ip, #17
- bne .L_sub8_and_finish
- bcs .L_sub7_and_finish
- lsls ip, ip, #1
- bne .L_sub6_and_finish
-
- sub r3, r3, #5
- b .L_finish
-
-.L_sub8_and_finish:
- sub r3, r3, #8
- b .L_finish
-
-.L_sub7_and_finish:
- sub r3, r3, #7
- b .L_finish
-
-.L_sub6_and_finish:
- sub r3, r3, #6
- b .L_finish
-
-.L_zero_in_second_register:
- sub r3, r1, r0
- lsls r2, ip, #17
- bne .L_sub4_and_finish
- bcs .L_sub3_and_finish
- lsls ip, ip, #1
- bne .L_sub2_and_finish
-
- sub r3, r3, #1
- b .L_finish
-
-.L_sub4_and_finish:
- sub r3, r3, #4
- b .L_finish
-
-.L_sub3_and_finish:
- sub r3, r3, #3
- b .L_finish
-
-.L_sub2_and_finish:
- sub r3, r3, #2
-
-.L_finish:
- cmp r4, #0
- bne .L_strlen_done
-
- // Time to get the dst string length.
- mov r1, r5
-
- // Save the original source address to r5.
- mov r5, r0
-
- // Save the current length (adding 1 for the terminator).
- add r4, r3, #1
- b .L_strlen_start
-
- // r0 holds the pointer to the dst string.
- // r3 holds the dst string length.
- // r4 holds the src string length + 1.
-.L_strlen_done:
- add r2, r3, r4
- cmp r2, lr
- bhi __strcat_chk_failed
-
- // Set up the registers for the memcpy code.
- mov r1, r5
- pld [r1, #64]
- mov r2, r4
- add r0, r0, r3
- pop {r4, r5}
-END(__strcat_chk)
-
-#define MEMCPY_BASE __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
-
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__strcat_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
- .cfi_adjust_cfa_offset 8
- .cfi_rel_offset r4, 0
- .cfi_rel_offset r5, 4
-
- ldr r0, error_message
- ldr r1, error_code
-1:
- add r0, pc
- bl __fortify_chk_fail
-error_code:
- .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
-error_message:
- .word error_string-(1b+4)
-END(__strcat_chk_failed)
-
- .data
-error_string:
- .string "strcat: prevented write past end of buffer"
+#include "__strcat_chk_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S
new file mode 100644
index 0000000..de66967
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ .syntax unified
+
+ .thumb
+ .thumb_func
+
+// Get the length of src string, then get the source of the dst string.
+// Check that the two lengths together don't exceed the threshold, then
+// do a memcpy of the data.
+ENTRY(__strcat_chk)
+ pld [r0, #0]
+ push {r0, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+ push {r4, r5}
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r5, 4
+
+ mov lr, r2
+
+ // Save the dst register to r5
+ mov r5, r0
+
+ // Zero out r4
+ eor r4, r4, r4
+
+ // r1 contains the address of the string to count.
+.L_strlen_start:
+ mov r0, r1
+ ands r3, r1, #7
+ beq .L_mainloop
+
+ // Align to a double word (64 bits).
+ rsb r3, r3, #8
+ lsls ip, r3, #31
+ beq .L_align_to_32
+
+ ldrb r2, [r1], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_32:
+ bcc .L_align_to_64
+ ands ip, r3, #2
+ beq .L_align_to_64
+
+ ldrb r2, [r1], #1
+ cbz r2, .L_update_count_and_finish
+ ldrb r2, [r1], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_64:
+ tst r3, #4
+ beq .L_mainloop
+ ldr r3, [r1], #4
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+
+ .p2align 2
+.L_mainloop:
+ ldrd r2, r3, [r1], #8
+
+ pld [r1, #64]
+
+ sub ip, r2, #0x01010101
+ bic ip, ip, r2
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_first_register
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+ b .L_mainloop
+
+.L_update_count_and_finish:
+ sub r3, r1, r0
+ sub r3, r3, #1
+ b .L_finish
+
+.L_zero_in_first_register:
+ sub r3, r1, r0
+ lsls r2, ip, #17
+ bne .L_sub8_and_finish
+ bcs .L_sub7_and_finish
+ lsls ip, ip, #1
+ bne .L_sub6_and_finish
+
+ sub r3, r3, #5
+ b .L_finish
+
+.L_sub8_and_finish:
+ sub r3, r3, #8
+ b .L_finish
+
+.L_sub7_and_finish:
+ sub r3, r3, #7
+ b .L_finish
+
+.L_sub6_and_finish:
+ sub r3, r3, #6
+ b .L_finish
+
+.L_zero_in_second_register:
+ sub r3, r1, r0
+ lsls r2, ip, #17
+ bne .L_sub4_and_finish
+ bcs .L_sub3_and_finish
+ lsls ip, ip, #1
+ bne .L_sub2_and_finish
+
+ sub r3, r3, #1
+ b .L_finish
+
+.L_sub4_and_finish:
+ sub r3, r3, #4
+ b .L_finish
+
+.L_sub3_and_finish:
+ sub r3, r3, #3
+ b .L_finish
+
+.L_sub2_and_finish:
+ sub r3, r3, #2
+
+.L_finish:
+ cmp r4, #0
+ bne .L_strlen_done
+
+ // Time to get the dst string length.
+ mov r1, r5
+
+ // Save the original source address to r5.
+ mov r5, r0
+
+ // Save the current length (adding 1 for the terminator).
+ add r4, r3, #1
+ b .L_strlen_start
+
+ // r0 holds the pointer to the dst string.
+ // r3 holds the dst string length.
+ // r4 holds the src string length + 1.
+.L_strlen_done:
+ add r2, r3, r4
+ cmp r2, lr
+ bhi .L_strcat_chk_failed
+
+ // Set up the registers for the memcpy code.
+ mov r1, r5
+ pld [r1, #64]
+ mov r2, r4
+ add r0, r0, r3
+ pop {r4, r5}
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
+
+#include MEMCPY_BASE
+
+ // Undo the above cfi directives
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r5, 4
+.L_strcat_chk_failed:
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+4)
+END(__strcat_chk)
+
+ .data
+error_string:
+ .string "strcat: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
index db76686..d8cb3d9 100644
--- a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2013 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -26,155 +26,7 @@
* SUCH DAMAGE.
*/
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
- .syntax unified
-
- .thumb
- .thumb_func
-
-// Get the length of the source string first, then do a memcpy of the data
-// instead of a strcpy.
-ENTRY(__strcpy_chk)
- pld [r0, #0]
- push {r0, lr}
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
- mov lr, r2
- mov r0, r1
-
- ands r3, r1, #7
- beq .L_mainloop
-
- // Align to a double word (64 bits).
- rsb r3, r3, #8
- lsls ip, r3, #31
- beq .L_align_to_32
-
- ldrb r2, [r0], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_32:
- bcc .L_align_to_64
- ands ip, r3, #2
- beq .L_align_to_64
-
- ldrb r2, [r0], #1
- cbz r2, .L_update_count_and_finish
- ldrb r2, [r0], #1
- cbz r2, .L_update_count_and_finish
-
-.L_align_to_64:
- tst r3, #4
- beq .L_mainloop
- ldr r3, [r0], #4
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
-
- .p2align 2
-.L_mainloop:
- ldrd r2, r3, [r0], #8
-
- pld [r0, #64]
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .L_zero_in_first_register
-
- sub ip, r3, #0x01010101
- bic ip, ip, r3
- ands ip, ip, #0x80808080
- bne .L_zero_in_second_register
- b .L_mainloop
-
-.L_update_count_and_finish:
- sub r3, r0, r1
- sub r3, r3, #1
- b .L_check_size
-
-.L_zero_in_first_register:
- sub r3, r0, r1
- lsls r2, ip, #17
- bne .L_sub8_and_finish
- bcs .L_sub7_and_finish
- lsls ip, ip, #1
- bne .L_sub6_and_finish
-
- sub r3, r3, #5
- b .L_check_size
-
-.L_sub8_and_finish:
- sub r3, r3, #8
- b .L_check_size
-
-.L_sub7_and_finish:
- sub r3, r3, #7
- b .L_check_size
-
-.L_sub6_and_finish:
- sub r3, r3, #6
- b .L_check_size
-
-.L_zero_in_second_register:
- sub r3, r0, r1
- lsls r2, ip, #17
- bne .L_sub4_and_finish
- bcs .L_sub3_and_finish
- lsls ip, ip, #1
- bne .L_sub2_and_finish
-
- sub r3, r3, #1
- b .L_check_size
-
-.L_sub4_and_finish:
- sub r3, r3, #4
- b .L_check_size
-
-.L_sub3_and_finish:
- sub r3, r3, #3
- b .L_check_size
-
-.L_sub2_and_finish:
- sub r3, r3, #2
-
-.L_check_size:
- pld [r1, #0]
- pld [r1, #64]
- ldr r0, [sp]
- cmp r3, lr
- bhs __strcpy_chk_failed
-
- // Add 1 for copy length to get the string terminator.
- add r2, r3, #1
-END(__strcpy_chk)
-
-#define MEMCPY_BASE __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__strcpy_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
- ldr r0, error_message
- ldr r1, error_code
-1:
- add r0, pc
- bl __fortify_chk_fail
-error_code:
- .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
-error_message:
- .word error_string-(1b+4)
-END(__strcpy_chk_failed)
-
- .data
-error_string:
- .string "strcpy: prevented write past end of buffer"
+#include "__strcpy_chk_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S
new file mode 100644
index 0000000..69ebcb4
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ .syntax unified
+
+ .thumb
+ .thumb_func
+
+// Get the length of the source string first, then do a memcpy of the data
+// instead of a strcpy.
+ENTRY(__strcpy_chk)
+ pld [r0, #0]
+ push {r0, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+
+ mov lr, r2
+ mov r0, r1
+
+ ands r3, r1, #7
+ beq .L_mainloop
+
+ // Align to a double word (64 bits).
+ rsb r3, r3, #8
+ lsls ip, r3, #31
+ beq .L_align_to_32
+
+ ldrb r2, [r0], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_32:
+ bcc .L_align_to_64
+ ands ip, r3, #2
+ beq .L_align_to_64
+
+ ldrb r2, [r0], #1
+ cbz r2, .L_update_count_and_finish
+ ldrb r2, [r0], #1
+ cbz r2, .L_update_count_and_finish
+
+.L_align_to_64:
+ tst r3, #4
+ beq .L_mainloop
+ ldr r3, [r0], #4
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+
+ .p2align 2
+.L_mainloop:
+ ldrd r2, r3, [r0], #8
+
+ pld [r0, #64]
+
+ sub ip, r2, #0x01010101
+ bic ip, ip, r2
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_first_register
+
+ sub ip, r3, #0x01010101
+ bic ip, ip, r3
+ ands ip, ip, #0x80808080
+ bne .L_zero_in_second_register
+ b .L_mainloop
+
+.L_update_count_and_finish:
+ sub r3, r0, r1
+ sub r3, r3, #1
+ b .L_check_size
+
+.L_zero_in_first_register:
+ sub r3, r0, r1
+ lsls r2, ip, #17
+ bne .L_sub8_and_finish
+ bcs .L_sub7_and_finish
+ lsls ip, ip, #1
+ bne .L_sub6_and_finish
+
+ sub r3, r3, #5
+ b .L_check_size
+
+.L_sub8_and_finish:
+ sub r3, r3, #8
+ b .L_check_size
+
+.L_sub7_and_finish:
+ sub r3, r3, #7
+ b .L_check_size
+
+.L_sub6_and_finish:
+ sub r3, r3, #6
+ b .L_check_size
+
+.L_zero_in_second_register:
+ sub r3, r0, r1
+ lsls r2, ip, #17
+ bne .L_sub4_and_finish
+ bcs .L_sub3_and_finish
+ lsls ip, ip, #1
+ bne .L_sub2_and_finish
+
+ sub r3, r3, #1
+ b .L_check_size
+
+.L_sub4_and_finish:
+ sub r3, r3, #4
+ b .L_check_size
+
+.L_sub3_and_finish:
+ sub r3, r3, #3
+ b .L_check_size
+
+.L_sub2_and_finish:
+ sub r3, r3, #2
+
+.L_check_size:
+ pld [r1, #0]
+ pld [r1, #64]
+ ldr r0, [sp]
+ cmp r3, lr
+ bhs .L_strcpy_chk_failed
+
+ // Add 1 for copy length to get the string terminator.
+ add r2, r3, #1
+
+#include MEMCPY_BASE
+
+.L_strcpy_chk_failed:
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+4)
+END(__strcpy_chk)
+
+ .data
+error_string:
+ .string "strcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S
index 410b663..537f3de 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2015 The Android Open Source Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -25,79 +25,8 @@
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
-/*
- * Copyright (c) 2013 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-// Prototype: void *memcpy (void *dst, const void *src, size_t count).
-
-#include <private/bionic_asm.h>
-#include <private/libc_events.h>
-
- .text
- .syntax unified
- .fpu neon
-
-ENTRY(__memcpy_chk)
- cmp r2, r3
- bhi __memcpy_chk_fail
-
- // Fall through to memcpy...
-END(__memcpy_chk)
-
-ENTRY(memcpy)
- pld [r1, #64]
- push {r0, lr}
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-END(memcpy)
-
-#define MEMCPY_BASE __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
-#include "memcpy_base.S"
-
-ENTRY_PRIVATE(__memcpy_chk_fail)
- // Preserve lr for backtrace.
- push {lr}
- .cfi_def_cfa_offset 4
- .cfi_rel_offset lr, 0
- ldr r0, error_message
- ldr r1, error_code
-1:
- add r0, pc
- bl __fortify_chk_fail
-error_code:
- .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
-error_message:
- .word error_string-(1b+8)
-END(__memcpy_chk_fail)
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "memcpy_base.S"
- .data
-error_string:
- .string "memcpy: prevented write past end of buffer"
+#include "memcpy_common.S"
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
index 2a73852..aac737d 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S
@@ -53,11 +53,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-ENTRY_PRIVATE(MEMCPY_BASE)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
+.L_memcpy_base:
// Assumes that n >= 0, and dst, src are valid pointers.
// For any sizes less than 832 use the neon code that doesn't
// care about the src alignment. This avoids any checks
@@ -168,12 +164,6 @@ ENTRY_PRIVATE(MEMCPY_BASE)
eor r3, r0, r1
ands r3, r3, #0x3
bne .L_copy_unknown_alignment
-END(MEMCPY_BASE)
-
-ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
// To try and improve performance, stack layout changed,
// i.e., not keeping the stack looking like users expect
@@ -185,7 +175,7 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
strd r6, r7, [sp, #-8]!
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r6, 0
- .cfi_rel_offset r7, 0
+ .cfi_rel_offset r7, 4
strd r8, r9, [sp, #-8]!
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r8, 0
@@ -291,10 +281,28 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
// Restore registers: optimized pop {r0, pc}
ldrd r8, r9, [sp], #8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r8
+ .cfi_restore r9
ldrd r6, r7, [sp], #8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r6
+ .cfi_restore r7
ldrd r4, r5, [sp], #8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
pop {r0, pc}
+ // Put the cfi directives back for the below instructions.
+ .cfi_adjust_cfa_offset 24
+ .cfi_rel_offset r4, 0
+ .cfi_rel_offset r5, 4
+ .cfi_rel_offset r6, 8
+ .cfi_rel_offset r7, 12
+ .cfi_rel_offset r8, 16
+ .cfi_rel_offset r9, 20
+
.L_dst_not_word_aligned:
// Align dst to word.
rsb ip, ip, #4
@@ -315,4 +323,12 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
// Src is guaranteed to be at least word aligned by this point.
b .L_word_aligned
-END(MEMCPY_BASE_ALIGNED)
+
+ // Undo any cfi directives from above.
+ .cfi_adjust_cfa_offset -24
+ .cfi_restore r4
+ .cfi_restore r5
+ .cfi_restore r6
+ .cfi_restore r7
+ .cfi_restore r8
+ .cfi_restore r9
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_common.S b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S
new file mode 100644
index 0000000..464fb46
--- /dev/null
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ .text
+ .syntax unified
+ .fpu neon
+
+ENTRY(__memcpy_chk)
+ cmp r2, r3
+ bhi .L_memcpy_chk_fail
+
+ // Fall through to memcpy...
+END(__memcpy_chk)
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+ENTRY(memcpy)
+ pld [r1, #64]
+ push {r0, lr}
+ .cfi_def_cfa_offset 8
+ .cfi_rel_offset r0, 0
+ .cfi_rel_offset lr, 4
+
+#include MEMCPY_BASE
+
+ // Undo the cfi instructions from above.
+ .cfi_def_cfa_offset 0
+ .cfi_restore r0
+ .cfi_restore lr
+.L_memcpy_chk_fail:
+ // Preserve lr for backtrace.
+ push {lr}
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset lr, 0
+
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+8)
+END(memcpy)
+
+ .data
+error_string:
+ .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a15/bionic/strcat.S b/libc/arch-arm/cortex-a15/bionic/strcat.S
index b95be94..157cc9f 100644
--- a/libc/arch-arm/cortex-a15/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a15/bionic/strcat.S
@@ -70,7 +70,7 @@
.macro m_scan_byte
ldrb r3, [r0]
- cbz r3, strcat_r0_scan_done
+ cbz r3, .L_strcat_r0_scan_done
add r0, #1
.endm // m_scan_byte
@@ -84,10 +84,10 @@ ENTRY(strcat)
// Quick check to see if src is empty.
ldrb r2, [r1]
pld [r1, #0]
- cbnz r2, strcat_continue
+ cbnz r2, .L_strcat_continue
bx lr
-strcat_continue:
+.L_strcat_continue:
// To speed up really small dst strings, unroll checking the first 4 bytes.
m_push
m_scan_byte
@@ -96,95 +96,102 @@ strcat_continue:
m_scan_byte
ands r3, r0, #7
- beq strcat_mainloop
+ beq .L_strcat_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcat_align_to_32
+ beq .L_strcat_align_to_32
ldrb r5, [r0]
- cbz r5, strcat_r0_scan_done
+ cbz r5, .L_strcat_r0_scan_done
add r0, r0, #1
-strcat_align_to_32:
- bcc strcat_align_to_64
+.L_strcat_align_to_32:
+ bcc .L_strcat_align_to_64
ldrb r2, [r0]
- cbz r2, strcat_r0_scan_done
+ cbz r2, .L_strcat_r0_scan_done
add r0, r0, #1
ldrb r4, [r0]
- cbz r4, strcat_r0_scan_done
+ cbz r4, .L_strcat_r0_scan_done
add r0, r0, #1
-strcat_align_to_64:
+.L_strcat_align_to_64:
tst r3, #4
- beq strcat_mainloop
+ beq .L_strcat_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .L_strcat_zero_in_second_register
+ b .L_strcat_mainloop
-strcat_r0_scan_done:
+.L_strcat_r0_scan_done:
// For short copies, hard-code checking the first 8 bytes since this
// new code doesn't win until after about 8 bytes.
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
-
-strcpy_finish:
+ m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r5, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish
+ m_copy_byte reg=r5, cmd=cbnz, label=.L_strcpy_continue
+
+.L_strcpy_finish:
m_pop
-strcpy_continue:
+.L_strcpy_continue:
ands r3, r0, #7
- beq strcpy_check_src_align
+ beq .L_strcpy_check_src_align
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcpy_align_to_32
+ beq .L_strcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .L_strcpy_complete
-strcpy_align_to_32:
- bcc strcpy_align_to_64
+.L_strcpy_align_to_32:
+ bcc .L_strcpy_align_to_64
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .L_strcpy_complete
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .L_strcpy_complete
-strcpy_align_to_64:
+.L_strcpy_align_to_64:
tst r3, #4
- beq strcpy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
- str r2, [r0], #4
+ beq .L_strcpy_check_src_align
+ // Read one byte at a time since we don't know the src alignment
+ // and we don't want to read into a different page.
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .L_strcpy_complete
-strcpy_check_src_align:
+.L_strcpy_check_src_align:
// At this point dst is aligned to a double word, check if src
// is also aligned to a double word.
ands r3, r1, #7
- bne strcpy_unaligned_copy
+ bne .L_strcpy_unaligned_copy
.p2align 2
-strcpy_mainloop:
+.L_strcpy_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
@@ -192,128 +199,128 @@ strcpy_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_mainloop
+ b .L_strcpy_mainloop
-strcpy_complete:
+.L_strcpy_complete:
m_pop
-strcpy_zero_in_first_register:
+.L_strcpy_zero_in_first_register:
lsls lr, ip, #17
- bne strcpy_copy1byte
- bcs strcpy_copy2bytes
+ bne .L_strcpy_copy1byte
+ bcs .L_strcpy_copy2bytes
lsls ip, ip, #1
- bne strcpy_copy3bytes
+ bne .L_strcpy_copy3bytes
-strcpy_copy4bytes:
+.L_strcpy_copy4bytes:
// Copy 4 bytes to the destiniation.
str r2, [r0]
m_pop
-strcpy_copy1byte:
+.L_strcpy_copy1byte:
strb r2, [r0]
m_pop
-strcpy_copy2bytes:
+.L_strcpy_copy2bytes:
strh r2, [r0]
m_pop
-strcpy_copy3bytes:
+.L_strcpy_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_pop
-strcpy_zero_in_second_register:
+.L_strcpy_zero_in_second_register:
lsls lr, ip, #17
- bne strcpy_copy5bytes
- bcs strcpy_copy6bytes
+ bne .L_strcpy_copy5bytes
+ bcs .L_strcpy_copy6bytes
lsls ip, ip, #1
- bne strcpy_copy7bytes
+ bne .L_strcpy_copy7bytes
// Copy 8 bytes to the destination.
strd r2, r3, [r0]
m_pop
-strcpy_copy5bytes:
+.L_strcpy_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
m_pop
-strcpy_copy6bytes:
+.L_strcpy_copy6bytes:
str r2, [r0], #4
strh r3, [r0]
m_pop
-strcpy_copy7bytes:
+.L_strcpy_copy7bytes:
str r2, [r0], #4
strh r3, [r0], #2
lsr r3, #16
strb r3, [r0]
m_pop
-strcpy_unaligned_copy:
+.L_strcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
tbb [pc, r3]
-strcpy_unaligned_branchtable:
+.L_strcpy_unaligned_branchtable:
.byte 0
- .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign7 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign6 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign5 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign4 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign3 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign2 - .L_strcpy_unaligned_branchtable)/2)
+ .byte ((.L_strcpy_unalign1 - .L_strcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.L_strcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldrb r3, [r1]
- cbz r3, strcpy_unalign7_copy5bytes
+ cbz r3, .L_strcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
- cbz r4, strcpy_unalign7_copy6bytes
+ cbz r4, .L_strcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
- cbz r5, strcpy_unalign7_copy7bytes
+ cbz r5, .L_strcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
strd r2, r3, [r0], #8
- beq strcpy_unalign_return
- b strcpy_unalign7
+ beq .L_strcpy_unalign_return
+ b .L_strcpy_unalign7
-strcpy_unalign7_copy5bytes:
+.L_strcpy_unalign7_copy5bytes:
str r2, [r0], #4
strb r3, [r0]
-strcpy_unalign_return:
+.L_strcpy_unalign_return:
m_pop
-strcpy_unalign7_copy6bytes:
+.L_strcpy_unalign7_copy6bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
m_pop
-strcpy_unalign7_copy7bytes:
+.L_strcpy_unalign7_copy7bytes:
str r2, [r0], #4
strb r3, [r0], #1
strb r4, [r0], #1
@@ -322,41 +329,41 @@ strcpy_unalign7_copy7bytes:
.p2align 2
// Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.L_strcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .L_strcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
- cbz r5, strcpy_unalign_copy6bytes
+ cbz r5, .L_strcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
- beq strcpy_copy7bytes
+ beq .L_strcpy_copy7bytes
lsrs ip, r3, #24
strd r2, r3, [r0], #8
- beq strcpy_unalign_return
- b strcpy_unalign6
+ beq .L_strcpy_unalign_return
+ b .L_strcpy_unalign6
.p2align 2
// Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.L_strcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .L_strcpy_unalign_copy5bytes
ldr r3, [r1], #4
@@ -365,17 +372,17 @@ strcpy_unalign5:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign5
+ b .L_strcpy_unalign5
-strcpy_unalign_copy5bytes:
+.L_strcpy_unalign_copy5bytes:
str r2, [r0], #4
strb r4, [r0]
m_pop
-strcpy_unalign_copy6bytes:
+.L_strcpy_unalign_copy6bytes:
str r2, [r0], #4
strb r4, [r0], #1
strb r5, [r0]
@@ -383,13 +390,13 @@ strcpy_unalign_copy6bytes:
.p2align 2
// Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.L_strcpy_unalign4:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
ldr r3, [r1], #4
pld [r1, #64]
@@ -397,20 +404,20 @@ strcpy_unalign4:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign4
+ b .L_strcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.L_strcpy_unalign3:
ldrb r2, [r1]
- cbz r2, strcpy_unalign3_copy1byte
+ cbz r2, .L_strcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
- cbz r3, strcpy_unalign3_copy2bytes
+ cbz r3, .L_strcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
- cbz r4, strcpy_unalign3_copy3bytes
+ cbz r4, .L_strcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -418,26 +425,26 @@ strcpy_unalign3:
pld [r1, #64]
lsrs lr, r2, #24
- beq strcpy_copy4bytes
+ beq .L_strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign3
+ b .L_strcpy_unalign3
-strcpy_unalign3_copy1byte:
+.L_strcpy_unalign3_copy1byte:
strb r2, [r0]
m_pop
-strcpy_unalign3_copy2bytes:
+.L_strcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_pop
-strcpy_unalign3_copy3bytes:
+.L_strcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
@@ -445,34 +452,34 @@ strcpy_unalign3_copy3bytes:
.p2align 2
// Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.L_strcpy_unalign2:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .L_strcpy_unalign_copy1byte
ldrb r4, [r1, #1]
- cbz r4, strcpy_unalign_copy2bytes
+ cbz r4, .L_strcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
- beq strcpy_copy3bytes
+ beq .L_strcpy_copy3bytes
lsrs ip, r2, #24
- beq strcpy_copy4bytes
+ beq .L_strcpy_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign2
+ b .L_strcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.L_strcpy_unalign1:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .L_strcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -482,27 +489,27 @@ strcpy_unalign1:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .L_strcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .L_strcpy_zero_in_second_register
strd r2, r3, [r0], #8
- b strcpy_unalign1
+ b .L_strcpy_unalign1
-strcpy_unalign_copy1byte:
+.L_strcpy_unalign_copy1byte:
strb r2, [r0]
m_pop
-strcpy_unalign_copy2bytes:
+.L_strcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r4, [r0]
m_pop
.p2align 2
-strcat_mainloop:
+.L_strcat_mainloop:
ldrd r2, r3, [r0], #8
pld [r0, #64]
@@ -510,59 +517,59 @@ strcat_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcat_zero_in_first_register
+ bne .L_strcat_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .L_strcat_zero_in_second_register
+ b .L_strcat_mainloop
-strcat_zero_in_first_register:
+.L_strcat_zero_in_first_register:
// Prefetch the src now, it's going to be used soon.
pld [r1, #0]
lsls lr, ip, #17
- bne strcat_sub8
- bcs strcat_sub7
+ bne .L_strcat_sub8
+ bcs .L_strcat_sub7
lsls ip, ip, #1
- bne strcat_sub6
+ bne .L_strcat_sub6
sub r0, r0, #5
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub8:
+.L_strcat_sub8:
sub r0, r0, #8
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub7:
+.L_strcat_sub7:
sub r0, r0, #7
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub6:
+.L_strcat_sub6:
sub r0, r0, #6
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_zero_in_second_register:
+.L_strcat_zero_in_second_register:
// Prefetch the src now, it's going to be used soon.
pld [r1, #0]
lsls lr, ip, #17
- bne strcat_sub4
- bcs strcat_sub3
+ bne .L_strcat_sub4
+ bcs .L_strcat_sub3
lsls ip, ip, #1
- bne strcat_sub2
+ bne .L_strcat_sub2
sub r0, r0, #1
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub4:
+.L_strcat_sub4:
sub r0, r0, #4
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub3:
+.L_strcat_sub3:
sub r0, r0, #3
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
-strcat_sub2:
+.L_strcat_sub2:
sub r0, r0, #2
- b strcat_r0_scan_done
+ b .L_strcat_r0_scan_done
END(strcat)
diff --git a/libc/arch-arm/cortex-a15/bionic/string_copy.S b/libc/arch-arm/cortex-a15/bionic/string_copy.S
index 20f0e91..92d1c98 100644
--- a/libc/arch-arm/cortex-a15/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a15/bionic/string_copy.S
@@ -149,13 +149,20 @@ ENTRY(strcpy)
.Lstringcopy_align_to_64:
tst r3, #4
beq .Lstringcopy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .Lstringcopy_zero_in_first_register
- str r2, [r0], #4
+ // Read one byte at a time since we don't have any idea about the alignment
+ // of the source and we don't want to read into a different page.
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
.Lstringcopy_check_src_align:
// At this point dst is aligned to a double word, check if src
diff --git a/libc/arch-arm/cortex-a15/bionic/strlen.S b/libc/arch-arm/cortex-a15/bionic/strlen.S
index 9a0ce62..4fd6284 100644
--- a/libc/arch-arm/cortex-a15/bionic/strlen.S
+++ b/libc/arch-arm/cortex-a15/bionic/strlen.S
@@ -65,38 +65,38 @@ ENTRY(strlen)
mov r1, r0
ands r3, r0, #7
- beq mainloop
+ beq .L_mainloop
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq align_to_32
+ beq .L_align_to_32
ldrb r2, [r1], #1
- cbz r2, update_count_and_return
+ cbz r2, .L_update_count_and_return
-align_to_32:
- bcc align_to_64
+.L_align_to_32:
+ bcc .L_align_to_64
ands ip, r3, #2
- beq align_to_64
+ beq .L_align_to_64
ldrb r2, [r1], #1
- cbz r2, update_count_and_return
+ cbz r2, .L_update_count_and_return
ldrb r2, [r1], #1
- cbz r2, update_count_and_return
+ cbz r2, .L_update_count_and_return
-align_to_64:
+.L_align_to_64:
tst r3, #4
- beq mainloop
+ beq .L_mainloop
ldr r3, [r1], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne zero_in_second_register
+ bne .L_zero_in_second_register
.p2align 2
-mainloop:
+.L_mainloop:
ldrd r2, r3, [r1], #8
pld [r1, #64]
@@ -104,62 +104,62 @@ mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne zero_in_first_register
+ bne .L_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne zero_in_second_register
- b mainloop
+ bne .L_zero_in_second_register
+ b .L_mainloop
-update_count_and_return:
+.L_update_count_and_return:
sub r0, r1, r0
sub r0, r0, #1
bx lr
-zero_in_first_register:
+.L_zero_in_first_register:
sub r0, r1, r0
lsls r3, ip, #17
- bne sub8_and_return
- bcs sub7_and_return
+ bne .L_sub8_and_return
+ bcs .L_sub7_and_return
lsls ip, ip, #1
- bne sub6_and_return
+ bne .L_sub6_and_return
sub r0, r0, #5
bx lr
-sub8_and_return:
+.L_sub8_and_return:
sub r0, r0, #8
bx lr
-sub7_and_return:
+.L_sub7_and_return:
sub r0, r0, #7
bx lr
-sub6_and_return:
+.L_sub6_and_return:
sub r0, r0, #6
bx lr
-zero_in_second_register:
+.L_zero_in_second_register:
sub r0, r1, r0
lsls r3, ip, #17
- bne sub4_and_return
- bcs sub3_and_return
+ bne .L_sub4_and_return
+ bcs .L_sub3_and_return
lsls ip, ip, #1
- bne sub2_and_return
+ bne .L_sub2_and_return
sub r0, r0, #1
bx lr
-sub4_and_return:
+.L_sub4_and_return:
sub r0, r0, #4
bx lr
-sub3_and_return:
+.L_sub3_and_return:
sub r0, r0, #3
bx lr
-sub2_and_return:
+.L_sub2_and_return:
sub r0, r0, #2
bx lr
END(strlen)
diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk
index 6fa3270..202a3bf 100644
--- a/libc/arch-arm/cortex-a15/cortex-a15.mk
+++ b/libc/arch-arm/cortex-a15/cortex-a15.mk
@@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \
arch-arm/cortex-a15/bionic/strlen.S \
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
libc_bionic_src_files_arm += \
diff --git a/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk
new file mode 100644
index 0000000..5d7efc6
--- /dev/null
+++ b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk
@@ -0,0 +1,22 @@
+# This file represents the best optimized routines that are the middle
+# ground when running on a big/little system that is cortex-a57/cortex-a53.
+# The cortex-a7 optimized routines, and the cortex-a53 optimized routines
+# decrease performance on cortex-a57 processors by as much as 20%.
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S \
+ arch-arm/cortex-a15/bionic/memset.S \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/__strcat_chk.S \
+ arch-arm/cortex-a15/bionic/strcmp.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+ arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memcmp.S \
+ arch-arm/generic/bionic/memchr.S
+
+libc_bionic_src_files_arm += \
+ arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S
new file mode 100644
index 0000000..c5bc98a
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/__strcat_chk_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S
new file mode 100644
index 0000000..1f8945d
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/__strcpy_chk_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy.S b/libc/arch-arm/cortex-a53/bionic/memcpy.S
new file mode 100644
index 0000000..664f574
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/memcpy.S
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Indicate which memcpy base file to include.
+#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S"
+
+#include "arch-arm/cortex-a15/bionic/memcpy_common.S"
diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy_base.S b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S
new file mode 100644
index 0000000..2749fc8
--- /dev/null
+++ b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+.L_memcpy_base:
+ // Assumes that n >= 0, and dst, src are valid pointers.
+ cmp r2, #16
+ blo .L_copy_less_than_16_unknown_align
+
+.L_copy_unknown_alignment:
+ // Unknown alignment of src and dst.
+ // Assumes that the first few bytes have already been prefetched.
+
+ // Align destination to 128 bits. The mainloop store instructions
+ // require this alignment or they will throw an exception.
+ rsb r3, r0, #0
+ ands r3, r3, #0xF
+ beq 2f
+
+ // Copy up to 15 bytes (count in r3).
+ sub r2, r2, r3
+ movs ip, r3, lsl #31
+
+ itt mi
+ ldrbmi lr, [r1], #1
+ strbmi lr, [r0], #1
+ itttt cs
+ ldrbcs ip, [r1], #1
+ ldrbcs lr, [r1], #1
+ strbcs ip, [r0], #1
+ strbcs lr, [r0], #1
+
+ movs ip, r3, lsl #29
+ bge 1f
+ // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+1: bcc 2f
+ // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0, :64]!
+
+2: // Make sure we have at least 64 bytes to copy.
+ subs r2, r2, #64
+ blo 2f
+
+1: // The main loop copies 64 bytes at a time.
+ vld1.8 {d0 - d3}, [r1]!
+ vld1.8 {d4 - d7}, [r1]!
+ subs r2, r2, #64
+ vstmia r0!, {d0 - d7}
+ pld [r1, #(64*10)]
+ bhs 1b
+
+2: // Fix-up the remaining count and make sure we have >= 32 bytes left.
+ adds r2, r2, #32
+ blo 3f
+
+ // 32 bytes. These cache lines were already preloaded.
+ vld1.8 {d0 - d3}, [r1]!
+ sub r2, r2, #32
+ vst1.8 {d0 - d3}, [r0, :128]!
+3: // Less than 32 left.
+ add r2, r2, #32
+ tst r2, #0x10
+ beq .L_copy_less_than_16_unknown_align
+ // Copies 16 bytes, destination 128 bits aligned.
+ vld1.8 {d0, d1}, [r1]!
+ vst1.8 {d0, d1}, [r0, :128]!
+
+.L_copy_less_than_16_unknown_align:
+ // Copy up to 15 bytes (count in r2).
+ movs ip, r2, lsl #29
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1: bge 2f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+2: // Copy 0 to 4 bytes.
+ lsls r2, r2, #31
+ itt ne
+ ldrbne lr, [r1], #1
+ strbne lr, [r0], #1
+ itttt cs
+ ldrbcs ip, [r1], #1
+ ldrbcs lr, [r1]
+ strbcs ip, [r0], #1
+ strbcs lr, [r0]
+
+ pop {r0, pc}
diff --git a/libc/arch-arm/cortex-a53/cortex-a53.mk b/libc/arch-arm/cortex-a53/cortex-a53.mk
index b5c337c..14aaa71 100644
--- a/libc/arch-arm/cortex-a53/cortex-a53.mk
+++ b/libc/arch-arm/cortex-a53/cortex-a53.mk
@@ -1 +1,21 @@
-include bionic/libc/arch-arm/cortex-a7/cortex-a7.mk
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a53/bionic/memcpy.S \
+ arch-arm/cortex-a53/bionic/__strcat_chk.S \
+ arch-arm/cortex-a53/bionic/__strcpy_chk.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a7/bionic/memset.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/strcmp.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
+ arch-arm/generic/bionic/memcmp.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a7/bionic/memset.S b/libc/arch-arm/cortex-a7/bionic/memset.S
new file mode 100644
index 0000000..6365b06
--- /dev/null
+++ b/libc/arch-arm/cortex-a7/bionic/memset.S
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/cpu-features.h>
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ /*
+ * Optimized memset() for ARM.
+ *
+ * memset() returns its first argument.
+ */
+
+ .fpu neon
+ .syntax unified
+
+ENTRY(__memset_chk)
+ cmp r2, r3
+ bls .L_done
+
+ // Preserve lr for backtrace.
+ push {lr}
+ .cfi_def_cfa_offset 4
+ .cfi_rel_offset lr, 0
+
+ ldr r0, error_message
+ ldr r1, error_code
+1:
+ add r0, pc
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+ .word error_string-(1b+8)
+END(__memset_chk)
+
+ENTRY(bzero)
+ mov r2, r1
+ mov r1, #0
+.L_done:
+ // Fall through to memset...
+END(bzero)
+
+ENTRY(memset)
+ mov r3, r0
+ // At this point only d0, d1 are going to be used below.
+ vdup.8 q0, r1
+ cmp r2, #16
+ blo .L_set_less_than_16_unknown_align
+
+.L_check_alignment:
+ // Align destination to a double word to avoid the store crossing
+ // a cache line boundary.
+ ands ip, r3, #7
+ bne .L_do_double_word_align
+
+.L_double_word_aligned:
+ // Duplicate since the less than 64 can use d2, d3.
+ vmov q1, q0
+ subs r2, #64
+ blo .L_set_less_than_64
+
+ // Duplicate the copy value so that we can store 64 bytes at a time.
+ vmov q2, q0
+ vmov q3, q0
+
+1: // Main loop stores 64 bytes at a time.
+ subs r2, #64
+ vstmia r3!, {d0 - d7}
+ bge 1b
+
+.L_set_less_than_64:
+ // Restore r2 to the count of bytes left to set.
+ add r2, #64
+ lsls ip, r2, #27
+ bcc .L_set_less_than_32
+ // Set 32 bytes.
+ vstmia r3!, {d0 - d3}
+
+.L_set_less_than_32:
+ bpl .L_set_less_than_16
+ // Set 16 bytes.
+ vstmia r3!, {d0, d1}
+
+.L_set_less_than_16:
+ // Less than 16 bytes to set.
+ lsls ip, r2, #29
+ bcc .L_set_less_than_8
+
+ // Set 8 bytes.
+ vstmia r3!, {d0}
+
+.L_set_less_than_8:
+ bpl .L_set_less_than_4
+ // Set 4 bytes
+ vst1.32 {d0[0]}, [r3]!
+
+.L_set_less_than_4:
+ lsls ip, r2, #31
+ it ne
+ strbne r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3]
+ bx lr
+
+.L_do_double_word_align:
+ rsb ip, ip, #8
+ sub r2, r2, ip
+
+ // Do this comparison now, otherwise we'll need to save a
+ // register to the stack since we've used all available
+ // registers.
+ cmp ip, #4
+ blo 1f
+
+ // Need to do a four byte copy.
+ movs ip, ip, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+ vst1.32 {d0[0]}, [r3]!
+ b .L_double_word_aligned
+
+1:
+ // No four byte copy.
+ movs ip, ip, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+ b .L_double_word_aligned
+
+.L_set_less_than_16_unknown_align:
+ // Set up to 15 bytes.
+ movs ip, r2, lsl #29
+ bcc 1f
+ vst1.8 {d0}, [r3]!
+1: bge 2f
+ vst1.32 {d0[0]}, [r3]!
+2: movs ip, r2, lsl #31
+ it mi
+ strbmi r1, [r3], #1
+ itt cs
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
+ bx lr
+END(memset)
+
+ .data
+error_string:
+ .string "memset: prevented write past end of buffer"
diff --git a/libc/arch-arm/cortex-a7/cortex-a7.mk b/libc/arch-arm/cortex-a7/cortex-a7.mk
index 9af03d9..3629a57 100644
--- a/libc/arch-arm/cortex-a7/cortex-a7.mk
+++ b/libc/arch-arm/cortex-a7/cortex-a7.mk
@@ -1 +1,19 @@
-include bionic/libc/arch-arm/cortex-a15/cortex-a15.mk
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a7/bionic/memset.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/__strcat_chk.S \
+ arch-arm/cortex-a15/bionic/strcmp.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+ arch-arm/cortex-a15/bionic/strlen.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
+ arch-arm/generic/bionic/memcmp.S \
+
+libc_bionic_src_files_arm += \
+ arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
index 5e81305..6ab5a69 100644
--- a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S
@@ -133,8 +133,7 @@ ENTRY_PRIVATE(MEMCPY_BASE)
strbcs ip, [r0], #1
strbcs lr, [r0], #1
- ldmfd sp!, {r0, lr}
- bx lr
+ ldmfd sp!, {r0, pc}
END(MEMCPY_BASE)
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S
index 8ee6ac2..b39fcc4 100644
--- a/libc/arch-arm/cortex-a9/bionic/memset.S
+++ b/libc/arch-arm/cortex-a9/bionic/memset.S
@@ -69,12 +69,9 @@ END(bzero)
ENTRY(memset)
// The neon memset only wins for less than 132.
cmp r2, #132
- bhi __memset_large_copy
-
- stmfd sp!, {r0}
- .cfi_def_cfa_offset 4
- .cfi_rel_offset r0, 0
+ bhi .L_memset_large_copy
+ mov r3, r0
vdup.8 q0, r1
/* make sure we have at least 32 bytes to write */
@@ -84,7 +81,7 @@ ENTRY(memset)
1: /* The main loop writes 32 bytes at a time */
subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0]!
+ vst1.8 {d0 - d3}, [r3]!
bhs 1b
2: /* less than 32 left */
@@ -93,22 +90,20 @@ ENTRY(memset)
beq 3f
// writes 16 bytes, 128-bits aligned
- vst1.8 {d0, d1}, [r0]!
+ vst1.8 {d0, d1}, [r3]!
3: /* write up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
- vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r3]!
1: bge 2f
- vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d0[0]}, [r3]!
2: movs ip, r2, lsl #31
- strbmi r1, [r0], #1
- strbcs r1, [r0], #1
- strbcs r1, [r0], #1
- ldmfd sp!, {r0}
+ strbmi r1, [r3], #1
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
bx lr
-END(memset)
-ENTRY_PRIVATE(__memset_large_copy)
+.L_memset_large_copy:
/* compute the offset to align the destination
* offset = (4-(src&3))&3 = -src & 3
*/
@@ -136,8 +131,7 @@ ENTRY_PRIVATE(__memset_large_copy)
strbcs r1, [r0], #1
strbmi r1, [r0], #1
subs r2, r2, r3
- popls {r0, r4-r7, lr} /* return */
- bxls lr
+ popls {r0, r4-r7, pc} /* return */
/* align the destination to a cache-line */
mov r12, r1
@@ -180,9 +174,8 @@ ENTRY_PRIVATE(__memset_large_copy)
strhmi r1, [r0], #2
movs r2, r2, lsl #2
strbcs r1, [r0]
- ldmfd sp!, {r0, r4-r7, lr}
- bx lr
-END(__memset_large_copy)
+ ldmfd sp!, {r0, r4-r7, pc}
+END(memset)
.data
error_string:
diff --git a/libc/arch-arm/cortex-a9/bionic/strcat.S b/libc/arch-arm/cortex-a9/bionic/strcat.S
index f5a855e..9077a74 100644
--- a/libc/arch-arm/cortex-a9/bionic/strcat.S
+++ b/libc/arch-arm/cortex-a9/bionic/strcat.S
@@ -70,7 +70,7 @@
.macro m_scan_byte
ldrb r3, [r0]
- cbz r3, strcat_r0_scan_done
+ cbz r3, .Lstrcat_r0_scan_done
add r0, #1
.endm // m_scan_byte
@@ -84,10 +84,10 @@ ENTRY(strcat)
// Quick check to see if src is empty.
ldrb r2, [r1]
pld [r1, #0]
- cbnz r2, strcat_continue
+ cbnz r2, .Lstrcat_continue
bx lr
-strcat_continue:
+.Lstrcat_continue:
// To speed up really small dst strings, unroll checking the first 4 bytes.
m_push
m_scan_byte
@@ -96,10 +96,10 @@ strcat_continue:
m_scan_byte
ands r3, r0, #7
- bne strcat_align_src
+ bne .Lstrcat_align_src
.p2align 2
-strcat_mainloop:
+.Lstrcat_mainloop:
ldmia r0!, {r2, r3}
pld [r0, #64]
@@ -107,28 +107,28 @@ strcat_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcat_zero_in_first_register
+ bne .Lstrcat_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .Lstrcat_zero_in_second_register
+ b .Lstrcat_mainloop
-strcat_zero_in_first_register:
+.Lstrcat_zero_in_first_register:
sub r0, r0, #4
-strcat_zero_in_second_register:
+.Lstrcat_zero_in_second_register:
// Check for zero in byte 0.
tst ip, #0x80
it ne
subne r0, r0, #4
- bne strcat_r0_scan_done
+ bne .Lstrcat_r0_scan_done
// Check for zero in byte 1.
tst ip, #0x8000
it ne
subne r0, r0, #3
- bne strcat_r0_scan_done
+ bne .Lstrcat_r0_scan_done
// Check for zero in byte 2.
tst ip, #0x800000
it ne
@@ -137,33 +137,33 @@ strcat_zero_in_second_register:
// Zero is in byte 3.
subeq r0, r0, #1
-strcat_r0_scan_done:
+.Lstrcat_r0_scan_done:
// Unroll the first 8 bytes that will be copied.
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish
- m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue
-
-strcpy_finish:
+ m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r5, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish
+ m_copy_byte reg=r5, cmd=cbnz, label=.Lstrcpy_continue
+
+.Lstrcpy_finish:
m_ret inst=pop
-strcpy_continue:
+.Lstrcpy_continue:
pld [r1, #0]
ands r3, r0, #7
- bne strcpy_align_dst
+ bne .Lstrcpy_align_dst
-strcpy_check_src_align:
+.Lstrcpy_check_src_align:
// At this point dst is aligned to a double word, check if src
// is also aligned to a double word.
ands r3, r1, #7
- bne strcpy_unaligned_copy
+ bne .Lstrcpy_unaligned_copy
.p2align 2
-strcpy_mainloop:
+.Lstrcpy_mainloop:
ldmia r1!, {r2, r3}
pld [r1, #64]
@@ -171,17 +171,17 @@ strcpy_mainloop:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_mainloop
+ b .Lstrcpy_mainloop
-strcpy_zero_in_first_register:
+.Lstrcpy_zero_in_first_register:
lsls lr, ip, #17
itt ne
strbne r2, [r0]
@@ -198,7 +198,7 @@ strcpy_zero_in_first_register:
strb r3, [r0]
m_ret inst=pop
-strcpy_zero_in_second_register:
+.Lstrcpy_zero_in_second_register:
lsls lr, ip, #17
ittt ne
stmiane r0!, {r2}
@@ -218,18 +218,18 @@ strcpy_zero_in_second_register:
strb r4, [r0]
m_ret inst=pop
-strcpy_align_dst:
+.Lstrcpy_align_dst:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcpy_align_to_32
+ beq .Lstrcpy_align_to_32
ldrb r2, [r1], #1
strb r2, [r0], #1
- cbz r2, strcpy_complete
+ cbz r2, .Lstrcpy_complete
-strcpy_align_to_32:
- bcc strcpy_align_to_64
+.Lstrcpy_align_to_32:
+ bcc .Lstrcpy_align_to_64
ldrb r4, [r1], #1
strb r4, [r0], #1
@@ -242,76 +242,83 @@ strcpy_align_to_32:
it eq
m_ret inst=popeq
-strcpy_align_to_64:
+.Lstrcpy_align_to_64:
tst r3, #4
- beq strcpy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
- stmia r0!, {r2}
- b strcpy_check_src_align
+ beq .Lstrcpy_check_src_align
+ // Read one byte at a time since we don't know the src alignment
+ // and we don't want to read into a different page.
+ ldrb r4, [r1], #1
+ strb r4, [r0], #1
+ cbz r4, .Lstrcpy_complete
+ ldrb r5, [r1], #1
+ strb r5, [r0], #1
+ cbz r5, .Lstrcpy_complete
+ ldrb r4, [r1], #1
+ strb r4, [r0], #1
+ cbz r4, .Lstrcpy_complete
+ ldrb r5, [r1], #1
+ strb r5, [r0], #1
+ cbz r5, .Lstrcpy_complete
+ b .Lstrcpy_check_src_align
-strcpy_complete:
+.Lstrcpy_complete:
m_ret inst=pop
-strcpy_unaligned_copy:
+.Lstrcpy_unaligned_copy:
// Dst is aligned to a double word, while src is at an unknown alignment.
// There are 7 different versions of the unaligned copy code
// to prevent overreading the src. The mainloop of every single version
// will store 64 bits per loop. The difference is how much of src can
// be read without potentially crossing a page boundary.
tbb [pc, r3]
-strcpy_unaligned_branchtable:
+.Lstrcpy_unaligned_branchtable:
.byte 0
- .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2)
- .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign7 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign6 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign5 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign4 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign3 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign2 - .Lstrcpy_unaligned_branchtable)/2)
+ .byte ((.Lstrcpy_unalign1 - .Lstrcpy_unaligned_branchtable)/2)
.p2align 2
// Can read 7 bytes before possibly crossing a page.
-strcpy_unalign7:
+.Lstrcpy_unalign7:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldrb r3, [r1]
- cbz r3, strcpy_unalign7_copy5bytes
+ cbz r3, .Lstrcpy_unalign7_copy5bytes
ldrb r4, [r1, #1]
- cbz r4, strcpy_unalign7_copy6bytes
+ cbz r4, .Lstrcpy_unalign7_copy6bytes
ldrb r5, [r1, #2]
- cbz r5, strcpy_unalign7_copy7bytes
+ cbz r5, .Lstrcpy_unalign7_copy7bytes
ldr r3, [r1], #4
pld [r1, #64]
lsrs ip, r3, #24
stmia r0!, {r2, r3}
- beq strcpy_unalign_return
- b strcpy_unalign7
+ beq .Lstrcpy_unalign_return
+ b .Lstrcpy_unalign7
-strcpy_unalign7_copy5bytes:
+.Lstrcpy_unalign7_copy5bytes:
stmia r0!, {r2}
strb r3, [r0]
-strcpy_unalign_return:
+.Lstrcpy_unalign_return:
m_ret inst=pop
-strcpy_unalign7_copy6bytes:
+.Lstrcpy_unalign7_copy6bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
m_ret inst=pop
-strcpy_unalign7_copy7bytes:
+.Lstrcpy_unalign7_copy7bytes:
stmia r0!, {r2}
strb r3, [r0], #1
strb r4, [r0], #1
@@ -320,30 +327,30 @@ strcpy_unalign7_copy7bytes:
.p2align 2
// Can read 6 bytes before possibly crossing a page.
-strcpy_unalign6:
+.Lstrcpy_unalign6:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .Lstrcpy_unalign_copy5bytes
ldrb r5, [r1, #1]
- cbz r5, strcpy_unalign_copy6bytes
+ cbz r5, .Lstrcpy_unalign_copy6bytes
ldr r3, [r1], #4
pld [r1, #64]
tst r3, #0xff0000
- beq strcpy_unalign6_copy7bytes
+ beq .Lstrcpy_unalign6_copy7bytes
lsrs ip, r3, #24
stmia r0!, {r2, r3}
- beq strcpy_unalign_return
- b strcpy_unalign6
+ beq .Lstrcpy_unalign_return
+ b .Lstrcpy_unalign6
-strcpy_unalign6_copy7bytes:
+.Lstrcpy_unalign6_copy7bytes:
stmia r0!, {r2}
strh r3, [r0], #2
lsr r3, #16
@@ -352,16 +359,16 @@ strcpy_unalign6_copy7bytes:
.p2align 2
// Can read 5 bytes before possibly crossing a page.
-strcpy_unalign5:
+.Lstrcpy_unalign5:
ldr r2, [r1], #4
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldrb r4, [r1]
- cbz r4, strcpy_unalign_copy5bytes
+ cbz r4, .Lstrcpy_unalign_copy5bytes
ldr r3, [r1], #4
@@ -370,17 +377,17 @@ strcpy_unalign5:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign5
+ b .Lstrcpy_unalign5
-strcpy_unalign_copy5bytes:
+.Lstrcpy_unalign_copy5bytes:
stmia r0!, {r2}
strb r4, [r0]
m_ret inst=pop
-strcpy_unalign_copy6bytes:
+.Lstrcpy_unalign_copy6bytes:
stmia r0!, {r2}
strb r4, [r0], #1
strb r5, [r0]
@@ -388,13 +395,13 @@ strcpy_unalign_copy6bytes:
.p2align 2
// Can read 4 bytes before possibly crossing a page.
-strcpy_unalign4:
+.Lstrcpy_unalign4:
ldmia r1!, {r2}
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
ldmia r1!, {r3}
pld [r1, #64]
@@ -402,20 +409,20 @@ strcpy_unalign4:
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign4
+ b .Lstrcpy_unalign4
.p2align 2
// Can read 3 bytes before possibly crossing a page.
-strcpy_unalign3:
+.Lstrcpy_unalign3:
ldrb r2, [r1]
- cbz r2, strcpy_unalign3_copy1byte
+ cbz r2, .Lstrcpy_unalign3_copy1byte
ldrb r3, [r1, #1]
- cbz r3, strcpy_unalign3_copy2bytes
+ cbz r3, .Lstrcpy_unalign3_copy2bytes
ldrb r4, [r1, #2]
- cbz r4, strcpy_unalign3_copy3bytes
+ cbz r4, .Lstrcpy_unalign3_copy3bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -423,26 +430,26 @@ strcpy_unalign3:
pld [r1, #64]
lsrs lr, r2, #24
- beq strcpy_unalign_copy4bytes
+ beq .Lstrcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign3
+ b .Lstrcpy_unalign3
-strcpy_unalign3_copy1byte:
+.Lstrcpy_unalign3_copy1byte:
strb r2, [r0]
m_ret inst=pop
-strcpy_unalign3_copy2bytes:
+.Lstrcpy_unalign3_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
-strcpy_unalign3_copy3bytes:
+.Lstrcpy_unalign3_copy3bytes:
strb r2, [r0], #1
strb r3, [r0], #1
strb r4, [r0]
@@ -450,34 +457,34 @@ strcpy_unalign3_copy3bytes:
.p2align 2
// Can read 2 bytes before possibly crossing a page.
-strcpy_unalign2:
+.Lstrcpy_unalign2:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .Lstrcpy_unalign_copy1byte
ldrb r3, [r1, #1]
- cbz r3, strcpy_unalign_copy2bytes
+ cbz r3, .Lstrcpy_unalign_copy2bytes
ldr r2, [r1], #4
ldr r3, [r1], #4
pld [r1, #64]
tst r2, #0xff0000
- beq strcpy_unalign_copy3bytes
+ beq .Lstrcpy_unalign_copy3bytes
lsrs ip, r2, #24
- beq strcpy_unalign_copy4bytes
+ beq .Lstrcpy_unalign_copy4bytes
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign2
+ b .Lstrcpy_unalign2
.p2align 2
// Can read 1 byte before possibly crossing a page.
-strcpy_unalign1:
+.Lstrcpy_unalign1:
ldrb r2, [r1]
- cbz r2, strcpy_unalign_copy1byte
+ cbz r2, .Lstrcpy_unalign_copy1byte
ldr r2, [r1], #4
ldr r3, [r1], #4
@@ -487,62 +494,62 @@ strcpy_unalign1:
sub ip, r2, #0x01010101
bic ip, ip, r2
ands ip, ip, #0x80808080
- bne strcpy_zero_in_first_register
+ bne .Lstrcpy_zero_in_first_register
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcpy_zero_in_second_register
+ bne .Lstrcpy_zero_in_second_register
stmia r0!, {r2, r3}
- b strcpy_unalign1
+ b .Lstrcpy_unalign1
-strcpy_unalign_copy1byte:
+.Lstrcpy_unalign_copy1byte:
strb r2, [r0]
m_ret inst=pop
-strcpy_unalign_copy2bytes:
+.Lstrcpy_unalign_copy2bytes:
strb r2, [r0], #1
strb r3, [r0]
m_ret inst=pop
-strcpy_unalign_copy3bytes:
+.Lstrcpy_unalign_copy3bytes:
strh r2, [r0], #2
lsr r2, #16
strb r2, [r0]
m_ret inst=pop
-strcpy_unalign_copy4bytes:
+.Lstrcpy_unalign_copy4bytes:
stmia r0, {r2}
m_ret inst=pop
-strcat_align_src:
+.Lstrcat_align_src:
// Align to a double word (64 bits).
rsb r3, r3, #8
lsls ip, r3, #31
- beq strcat_align_to_32
+ beq .Lstrcat_align_to_32
ldrb r2, [r0], #1
- cbz r2, strcat_r0_update
+ cbz r2, .Lstrcat_r0_update
-strcat_align_to_32:
- bcc strcat_align_to_64
+.Lstrcat_align_to_32:
+ bcc .Lstrcat_align_to_64
ldrb r2, [r0], #1
- cbz r2, strcat_r0_update
+ cbz r2, .Lstrcat_r0_update
ldrb r2, [r0], #1
- cbz r2, strcat_r0_update
+ cbz r2, .Lstrcat_r0_update
-strcat_align_to_64:
+.Lstrcat_align_to_64:
tst r3, #4
- beq strcat_mainloop
+ beq .Lstrcat_mainloop
ldr r3, [r0], #4
sub ip, r3, #0x01010101
bic ip, ip, r3
ands ip, ip, #0x80808080
- bne strcat_zero_in_second_register
- b strcat_mainloop
+ bne .Lstrcat_zero_in_second_register
+ b .Lstrcat_mainloop
-strcat_r0_update:
+.Lstrcat_r0_update:
sub r0, r0, #1
- b strcat_r0_scan_done
+ b .Lstrcat_r0_scan_done
END(strcat)
diff --git a/libc/arch-arm/cortex-a9/bionic/string_copy.S b/libc/arch-arm/cortex-a9/bionic/string_copy.S
index caf5a11..642db0f 100644
--- a/libc/arch-arm/cortex-a9/bionic/string_copy.S
+++ b/libc/arch-arm/cortex-a9/bionic/string_copy.S
@@ -244,13 +244,20 @@ ENTRY(strcpy)
.Lstringcopy_align_to_64:
tst r3, #4
beq .Lstringcopy_check_src_align
- ldr r2, [r1], #4
-
- sub ip, r2, #0x01010101
- bic ip, ip, r2
- ands ip, ip, #0x80808080
- bne .Lstringcopy_zero_in_first_register
- stmia r0!, {r2}
+ // Read one byte at a time since we don't have any idea about the alignment
+ // of the source and we don't want to read into a different page.
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
+ ldrb r2, [r1], #1
+ strb r2, [r0], #1
+ cbz r2, .Lstringcopy_complete
b .Lstringcopy_check_src_align
.Lstringcopy_complete:
diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk
index 7b38de1..db4bcc7 100644
--- a/libc/arch-arm/cortex-a9/cortex-a9.mk
+++ b/libc/arch-arm/cortex-a9/cortex-a9.mk
@@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \
arch-arm/cortex-a9/bionic/strlen.S \
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
libc_bionic_src_files_arm += \
diff --git a/libc/arch-arm/denver/denver.mk b/libc/arch-arm/denver/denver.mk
index 5fddf95..e81f8c7 100644
--- a/libc/arch-arm/denver/denver.mk
+++ b/libc/arch-arm/denver/denver.mk
@@ -1,4 +1,5 @@
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
arch-arm/denver/bionic/memcpy.S \
arch-arm/denver/bionic/memmove.S \
diff --git a/libc/arch-arm/generic/bionic/memchr.S b/libc/arch-arm/generic/bionic/memchr.S
new file mode 100644
index 0000000..cb00d82
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/memchr.S
@@ -0,0 +1,155 @@
+/* Copyright (c) 2010-2015, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This memchr routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors. It has a fast past for short sizes, and has
+ an optimised path for large data sets; the worst case is finding the
+ match early in a large data set.
+
+ */
+
+#include <private/bionic_asm.h>
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-07-14 david.gilbert@linaro.org
+@ Import endianness fix from local git ea786f1b
+@ 2011-12-07 david.gilbert@linaro.org
+@ Removed unneeded cbz from align loop
+
+ .syntax unified
+ .arch armv7-a
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+ .thumb_func
+ENTRY(memchr)
+ .p2align 4,,15
+ @ r0 = start of memory to scan
+ @ r1 = character to look for
+ @ r2 = length
+ @ returns r0 = pointer to character or NULL if not found
+ and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
+
+ cmp r2,#16 @ If it's short don't bother with anything clever
+ blt 20f
+
+ tst r0, #7 @ If it's already aligned skip the next bit
+ beq 10f
+
+ @ Work up to an aligned point
+5:
+ ldrb r3, [r0],#1
+ subs r2, r2, #1
+ cmp r3, r1
+ beq 50f @ If it matches exit found
+ tst r0, #7
+ bne 5b @ If not aligned yet then do next byte
+
+10:
+ @ At this point, we are aligned, we know we have at least 8 bytes to work with
+ push {r4,r5,r6,r7}
+ orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #16
+ bic r4, r2, #7 @ Number of double words to work with
+ mvns r7, #0 @ all F's
+ movs r3, #0
+
+15:
+ ldrd r5,r6,[r0],#8
+ subs r4, r4, #8
+ eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
+ eor r6,r6, r1
+ uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r5, r3, r7 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cbnz r6, 60f
+ bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
+
+ pop {r4,r5,r6,r7}
+ and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
+ and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
+
+20:
+ cbz r2, 40f @ 0 length or hit the end already then not found
+
+21: @ Post aligned section, or just a short call
+ ldrb r3,[r0],#1
+ subs r2,r2,#1
+ eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
+ cbz r3, 50f
+ bne 21b @ on r2 flags
+
+40:
+ movs r0,#0 @ not found
+ bx lr
+
+50:
+ subs r0,r0,#1 @ found
+ bx lr
+
+60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
+ @ r0 points to the start of the double word after the one that was tested
+ @ r5 has the 00/ff pattern for the first word, r6 has the chained value
+ cmp r5, #0
+ itte eq
+ moveq r5, r6 @ the end is in the 2nd word
+ subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
+ subne r0,r0,#7 @ or 2nd byte of 1st word
+
+ @ r0 currently points to the 3rd byte of the word containing the hit
+ tst r5, # CHARTSTMASK(0) @ 1st character
+ bne 61f
+ adds r0,r0,#1
+ tst r5, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r0,r0,#1
+ tsteq r5, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r0,r0,#1
+
+61:
+ pop {r4,r5,r6,r7}
+ subs r0,r0,#1
+ bx lr
+END(memchr)
diff --git a/libc/arch-arm/generic/bionic/memcmp.S b/libc/arch-arm/generic/bionic/memcmp.S
index c78dbd4..6643d55 100644
--- a/libc/arch-arm/generic/bionic/memcmp.S
+++ b/libc/arch-arm/generic/bionic/memcmp.S
@@ -221,8 +221,7 @@ ENTRY(memcmp)
bne 8b
9: /* restore registers and return */
- ldmfd sp!, {r4, lr}
- bx lr
+ ldmfd sp!, {r4, pc}
10: /* process less than 12 bytes */
cmp r2, #0
diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S
index ea5a399..65cba4c 100644
--- a/libc/arch-arm/generic/bionic/memcpy.S
+++ b/libc/arch-arm/generic/bionic/memcpy.S
@@ -194,8 +194,7 @@ ENTRY(memcpy)
/* we're done! restore everything and return */
1: ldmfd sp!, {r5-r11}
- ldmfd sp!, {r0, r4, lr}
- bx lr
+ ldmfd sp!, {r0, r4, pc}
/********************************************************************/
@@ -385,8 +384,7 @@ ENTRY(memcpy)
/* we're done! restore sp and spilled registers and return */
add sp, sp, #28
- ldmfd sp!, {r0, r4, lr}
- bx lr
+ ldmfd sp!, {r0, r4, pc}
END(memcpy)
// Only reached when the __memcpy_chk check fails.
diff --git a/libc/arch-arm/generic/bionic/memset.S b/libc/arch-arm/generic/bionic/memset.S
index d17a9c4..b8eabbf 100644
--- a/libc/arch-arm/generic/bionic/memset.S
+++ b/libc/arch-arm/generic/bionic/memset.S
@@ -82,8 +82,7 @@ ENTRY(memset)
strbcs r1, [r0], #1
strbmi r1, [r0], #1
subs r2, r2, r3
- popls {r0, r4-r7, lr} /* return */
- bxls lr
+ popls {r0, r4-r7, pc} /* return */
/* align the destination to a cache-line */
mov r12, r1
@@ -126,8 +125,7 @@ ENTRY(memset)
strhmi r1, [r0], #2
movs r2, r2, lsl #2
strbcs r1, [r0]
- ldmfd sp!, {r0, r4-r7, lr}
- bx lr
+ ldmfd sp!, {r0, r4-r7, pc}
END(memset)
.data
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index e49d6d2..016c882 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,5 @@
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
arch-arm/generic/bionic/memcpy.S \
arch-arm/generic/bionic/memset.S \
diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S
index 246f159..1a39c5b 100644
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -40,7 +40,7 @@
ENTRY(__strcat_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
push {r4, r5}
@@ -177,7 +177,7 @@ ENTRY(__strcat_chk)
.L_strlen_done:
add r2, r3, r4
cmp r2, lr
- bhi __strcat_chk_failed
+ bhi .L_strcat_chk_failed
// Set up the registers for the memcpy code.
mov r1, r5
@@ -185,20 +185,17 @@ ENTRY(__strcat_chk)
mov r2, r4
add r0, r0, r3
pop {r4, r5}
-END(__strcat_chk)
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r4
+ .cfi_restore r5
-#define MEMCPY_BASE __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcat_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
+ // Undo the above cfi directives.
.cfi_adjust_cfa_offset 8
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
-
+.L_strcat_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -208,7 +205,7 @@ error_code:
.word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcat_chk_failed)
+END(__strcat_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S
index db76686..00202f3 100644
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -39,7 +39,7 @@
ENTRY(__strcpy_chk)
pld [r0, #0]
push {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
@@ -149,21 +149,14 @@ ENTRY(__strcpy_chk)
pld [r1, #64]
ldr r0, [sp]
cmp r3, lr
- bhs __strcpy_chk_failed
+ bhs .L_strcpy_chk_failed
// Add 1 for copy length to get the string terminator.
add r2, r3, #1
-END(__strcpy_chk)
-#define MEMCPY_BASE __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__strcpy_chk_failed)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
+.L_strcpy_chk_failed:
ldr r0, error_message
ldr r1, error_code
1:
@@ -173,7 +166,7 @@ error_code:
.word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__strcpy_chk_failed)
+END(__strcpy_chk)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 9ff46a8..5d27b57 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -45,7 +45,7 @@
ENTRY(__memcpy_chk)
cmp r2, r3
- bhi __memcpy_chk_fail
+ bhi .L_memcpy_chk_fail
// Fall through to memcpy...
END(__memcpy_chk)
@@ -53,19 +53,20 @@ END(__memcpy_chk)
ENTRY(memcpy)
pld [r1, #64]
stmfd sp!, {r0, lr}
- .cfi_def_cfa_offset 8
+ .cfi_adjust_cfa_offset 8
.cfi_rel_offset r0, 0
.cfi_rel_offset lr, 4
-END(memcpy)
-#define MEMCPY_BASE __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
#include "memcpy_base.S"
-ENTRY_PRIVATE(__memcpy_chk_fail)
+ // Undo the cfi directives from above.
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r0
+ .cfi_restore lr
+.L_memcpy_chk_fail:
// Preserve lr for backtrace.
push {lr}
- .cfi_def_cfa_offset 4
+ .cfi_adjust_cfa_offset 4
.cfi_rel_offset lr, 0
ldr r0, error_message
@@ -77,7 +78,7 @@ error_code:
.word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
error_message:
.word error_string-(1b+4)
-END(__memcpy_chk_fail)
+END(memcpy)
.data
error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 035dcf1..76c5a84 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -1,123 +1,191 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-
-/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 32 byte
- * cache line.
- */
-
-// Assumes neon instructions and a cache line size of 32 bytes.
-
-ENTRY_PRIVATE(MEMCPY_BASE)
- .cfi_def_cfa_offset 8
- .cfi_rel_offset r0, 0
- .cfi_rel_offset lr, 4
-
- /* do we have at least 16-bytes to copy (needed for alignment below) */
- cmp r2, #16
- blo 5f
-
- /* align destination to cache-line for the write-buffer */
- rsb r3, r0, #0
- ands r3, r3, #0xF
- beq 2f
-
- /* copy up to 15-bytes (count in r3) */
- sub r2, r2, r3
- movs ip, r3, lsl #31
- itt mi
- ldrbmi lr, [r1], #1
- strbmi lr, [r0], #1
- itttt cs
- ldrbcs ip, [r1], #1
- ldrbcs lr, [r1], #1
- strbcs ip, [r0], #1
- strbcs lr, [r0], #1
- movs ip, r3, lsl #29
- bge 1f
- // copies 4 bytes, destination 32-bits aligned
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
-1: bcc 2f
- // copies 8 bytes, destination 64-bits aligned
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0, :64]!
-
-2: /* make sure we have at least 64 bytes to copy */
- subs r2, r2, #64
- blo 2f
-
-1: /* The main loop copies 64 bytes at a time */
- vld1.8 {d0 - d3}, [r1]!
- vld1.8 {d4 - d7}, [r1]!
- pld [r1, #(32*8)]
- subs r2, r2, #64
- vst1.8 {d0 - d3}, [r0, :128]!
- vst1.8 {d4 - d7}, [r0, :128]!
- bhs 1b
-
-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
- adds r2, r2, #32
- blo 4f
-
- /* Copy 32 bytes. These cache lines were already preloaded */
- vld1.8 {d0 - d3}, [r1]!
- sub r2, r2, #32
- vst1.8 {d0 - d3}, [r0, :128]!
-
-4: /* less than 32 left */
- add r2, r2, #32
- tst r2, #0x10
- beq 5f
- // copies 16 bytes, 128-bits aligned
- vld1.8 {d0, d1}, [r1]!
- vst1.8 {d0, d1}, [r0, :128]!
-
-5: /* copy up to 15-bytes (count in r2) */
- movs ip, r2, lsl #29
- bcc 1f
- vld1.8 {d0}, [r1]!
- vst1.8 {d0}, [r0]!
-1: bge 2f
- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2: movs ip, r2, lsl #31
- itt mi
- ldrbmi r3, [r1], #1
- strbmi r3, [r0], #1
- itttt cs
- ldrbcs ip, [r1], #1
- ldrbcs lr, [r1], #1
- strbcs ip, [r0], #1
- strbcs lr, [r0], #1
-
- ldmfd sp!, {r0, lr}
- bx lr
-END(MEMCPY_BASE)
+/***************************************************************************
+ Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/* Assumes neon instructions and a cache line size of 64 bytes. */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+#define PLDOFFS (10)
+#define PLDTHRESH (PLDOFFS)
+#define BBTHRESH (4096/64)
+#define PLDSIZE (64)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+ .text
+ .fpu neon
+
+.L_memcpy_base:
+ cmp r2, #4
+ blt .L_neon_lt4
+ cmp r2, #16
+ blt .L_neon_lt16
+ cmp r2, #32
+ blt .L_neon_16
+ cmp r2, #64
+ blt .L_neon_copy_32_a
+
+ mov r12, r2, lsr #6
+ cmp r12, #PLDTHRESH
+ ble .L_neon_copy_64_loop_nopld
+
+ push {r9, r10}
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset r9, 0
+ .cfi_rel_offset r10, 4
+
+ cmp r12, #BBTHRESH
+ ble .L_neon_prime_pump
+
+ add lr, r0, #0x400
+ add r9, r1, #(PLDOFFS*PLDSIZE)
+ sub lr, lr, r9
+ lsl lr, lr, #21
+ lsr lr, lr, #21
+ add lr, lr, #(PLDOFFS*PLDSIZE)
+ cmp r12, lr, lsr #6
+ ble .L_neon_prime_pump
+
+ itt gt
+ movgt r9, #(PLDOFFS)
+ rsbsgt r9, r9, lr, lsr #6
+ ble .L_neon_prime_pump
+
+ add r10, r1, lr
+ bic r10, #0x3F
+
+ sub r12, r12, lr, lsr #6
+
+ cmp r9, r12
+ itee le
+ suble r12, r12, r9
+ movgt r9, r12
+ movgt r12, #0
+
+ pld [r1, #((PLDOFFS-1)*PLDSIZE)]
+.L_neon_copy_64_loop_outer_doublepld:
+ pld [r1, #((PLDOFFS)*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r9, r9, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .L_neon_copy_64_loop_outer_doublepld
+ cmp r12, #0
+ beq .L_neon_pop_before_nopld
+
+ cmp r12, #(512*1024/64)
+ blt .L_neon_copy_64_loop_outer
+
+.L_neon_copy_64_loop_ddr:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ pld [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .L_neon_copy_64_loop_ddr
+ b .L_neon_pop_before_nopld
+
+.L_neon_prime_pump:
+ mov lr, #(PLDOFFS*PLDSIZE)
+ add r10, r1, #(PLDOFFS*PLDSIZE)
+ bic r10, #0x3F
+ sub r12, r12, #PLDOFFS
+ ldr r3, [r10, #(-1*PLDSIZE)]
+
+.L_neon_copy_64_loop_outer:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ ldr r3, [r10]
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ add r10, #64
+ bne .L_neon_copy_64_loop_outer
+
+.L_neon_pop_before_nopld:
+ mov r12, lr, lsr #6
+ pop {r9, r10}
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore r9
+ .cfi_restore r10
+
+.L_neon_copy_64_loop_nopld:
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .L_neon_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ beq .L_neon_exit
+
+.L_neon_copy_32_a:
+ movs r3, r2, lsl #27
+ bcc .L_neon_16
+ vld1.32 {q0,q1}, [r1]!
+ vst1.32 {q0,q1}, [r0]!
+
+.L_neon_16:
+ bpl .L_neon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ ands r2, r2, #0x0f
+ beq .L_neon_exit
+
+.L_neon_lt16:
+ movs r3, r2, lsl #29
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1:
+ bge .L_neon_lt4
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4:
+ movs r2, r2, lsl #31
+ itt cs
+ ldrhcs r3, [r1], #2
+ strhcs r3, [r0], #2
+ itt mi
+ ldrbmi r3, [r1]
+ strbmi r3, [r0]
+
+.L_neon_exit:
+ pop {r0, pc}
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S
new file mode 100644
index 0000000..aea7315
--- /dev/null
+++ b/libc/arch-arm/krait/bionic/memmove.S
@@ -0,0 +1,219 @@
+/***************************************************************************
+ Copyright (c) 2009-2014 The Linux Foundation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of The Linux Foundation nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: Attempts to do a memmove with Neon registers if possible,
+ * Inputs:
+ * dest: The destination buffer
+ * src: The source buffer
+ * n: The size of the buffer to transfer
+ * Outputs:
+ *
+ ***************************************************************************/
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+/*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_KRAIT_PLD_SET := true
+ * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
+ * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (10)
+#endif
+#ifndef PLDTHRESH
+#define PLDTHRESH (PLDOFFS)
+#endif
+#if (PLDOFFS < 5)
+#error Routine does not support offsets less than 5
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (64)
+#endif
+
+ .text
+ .syntax unified
+ .fpu neon
+ .thumb
+ .thumb_func
+
+//ENTRY(bcopy)
+// //.cfi_startproc
+// mov r12, r0
+// mov r0, r1
+// mov r1, r12
+// // Fall through to memmove
+// //.cfi_endproc
+//END(bcopy)
+
+ENTRY(memmove)
+_memmove_words:
+ //.cfi_startproc
+ .save {r0, lr}
+ cmp r2, #0
+ it ne
+ subsne r12, r0, r1 // Warning: do not combine these "it" blocks
+ it eq
+ bxeq lr
+// memmove only if r1 < r0 < r1+r2
+ cmp r0, r1
+ itt ge
+ addge r12, r1, r2
+ cmpge r12, r0
+ it le
+ ble memcpy
+ cmp r2, #4
+ it le
+ ble .Lneon_b2f_smallcopy_loop
+ push {r0, lr}
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #64
+ it ge
+ bge .Lneon_b2f_copy_64
+ cmp r2, #32
+ it ge
+ bge .Lneon_b2f_copy_32
+ cmp r2, #8
+ it ge
+ bge .Lneon_b2f_copy_8
+ b .Lneon_b2f_copy_1
+.Lneon_b2f_copy_64:
+ mov r12, r2, lsr #6
+ add r0, r0, #32
+ add r1, r1, #32
+ cmp r12, #PLDTHRESH
+ it le
+ ble .Lneon_b2f_copy_64_loop_nopld
+ sub r12, #PLDOFFS
+ sub lr, r1, #(PLDOFFS)*PLDSIZE
+.Lneon_b2f_copy_64_loop_outer:
+ pld [lr]
+ sub r1, r1, #96
+ sub r0, r0, #96
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]
+ sub lr, lr, #64
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]
+ it ne
+ bne .Lneon_b2f_copy_64_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_b2f_copy_64_loop_nopld:
+ sub r1, r1, #96
+ sub r0, r0, #96
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]
+ it ne
+ bne .Lneon_b2f_copy_64_loop_nopld
+ ands r2, r2, #0x3f
+ it eq
+ beq .Lneon_memmove_done
+ sub r1, r1, #32
+ sub r0, r0, #32
+ cmp r2, #32
+ it lt
+ blt .Lneon_b2f_copy_8
+.Lneon_b2f_copy_32:
+ sub r1, r1, #32
+ sub r0, r0, #32
+ vld1.32 {q0, q1}, [r1]
+ vst1.32 {q0, q1}, [r0]
+ ands r2, r2, #0x1f
+ it eq
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ it eq
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ it ne
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_1:
+ movs r12, r2, lsl #29
+ itttt mi
+ submi r1, r1, #4
+ submi r0, r0, #4
+ ldrmi r3, [r1]
+ strmi r3, [r0]
+ movs r2, r2, lsl #31
+ itttt cs
+ subcs r1, r1, #2
+ subcs r0, r0, #2
+ ldrhcs r3, [r1]
+ strhcs r3, [r0]
+ itttt mi
+ submi r1, r1, #1
+ submi r0, r0, #1
+ ldrbmi r12, [r1]
+ strbmi r12, [r0]
+.Lneon_memmove_done:
+ pop {r0, pc}
+.Lneon_b2f_smallcopy_loop:
+ // 4 bytes or less
+ add r1, r1, r2
+ add r0, r0, r2
+ movs r12, r2, lsl #29
+ itttt mi
+ submi r1, r1, #4
+ submi r0, r0, #4
+ ldrmi r3, [r1]
+ strmi r3, [r0]
+ movs r2, r2, lsl #31
+ itttt cs
+ subcs r1, r1, #2
+ subcs r0, r0, #2
+ ldrhcs r3, [r1]
+ strhcs r3, [r0]
+ itttt mi
+ submi r1, r1, #1
+ submi r0, r0, #1
+ ldrbmi r12, [r1]
+ strbmi r12, [r0]
+ bx lr
+// .cfi_endproc
+END(memmove)
+
diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S
index a4fbe17..ae05965 100644
--- a/libc/arch-arm/krait/bionic/memset.S
+++ b/libc/arch-arm/krait/bionic/memset.S
@@ -69,10 +69,7 @@ END(bzero)
/* memset() returns its first argument. */
ENTRY(memset)
- stmfd sp!, {r0}
- .cfi_def_cfa_offset 4
- .cfi_rel_offset r0, 0
-
+ mov r3, r0
vdup.8 q0, r1
/* make sure we have at least 32 bytes to write */
@@ -82,7 +79,7 @@ ENTRY(memset)
1: /* The main loop writes 32 bytes at a time */
subs r2, r2, #32
- vst1.8 {d0 - d3}, [r0]!
+ vst1.8 {d0 - d3}, [r3]!
bhs 1b
2: /* less than 32 left */
@@ -91,18 +88,17 @@ ENTRY(memset)
beq 3f
// writes 16 bytes, 128-bits aligned
- vst1.8 {d0, d1}, [r0]!
+ vst1.8 {d0, d1}, [r3]!
3: /* write up to 15-bytes (count in r2) */
movs ip, r2, lsl #29
bcc 1f
- vst1.8 {d0}, [r0]!
+ vst1.8 {d0}, [r3]!
1: bge 2f
- vst1.32 {d0[0]}, [r0]!
+ vst1.32 {d0[0]}, [r3]!
2: movs ip, r2, lsl #31
- strbmi r1, [r0], #1
- strbcs r1, [r0], #1
- strbcs r1, [r0], #1
- ldmfd sp!, {r0}
+ strbmi r1, [r3], #1
+ strbcs r1, [r3], #1
+ strbcs r1, [r3], #1
bx lr
END(memset)
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 88b4d66..5f5b414 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -1,9 +1,19 @@
libc_bionic_src_files_arm += \
- arch-arm/krait/bionic/memcpy.S \
arch-arm/krait/bionic/memset.S \
arch-arm/krait/bionic/strcmp.S \
arch-arm/krait/bionic/__strcat_chk.S \
arch-arm/krait/bionic/__strcpy_chk.S \
+ arch-arm/krait/bionic/memmove.S
+
+#For some targets we don't need this optimization.
+#Corresponding flag is defined in device specific folder.
+ifeq ($(TARGET_CPU_MEMCPY_BASE_OPT_DISABLE),true)
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S
+else
+libc_bionic_src_files_arm += \
+ arch-arm/krait/bionic/memcpy.S
+endif
# Use cortex-a15 versions of strcat/strcpy/strlen and standard memmove
libc_bionic_src_files_arm += \
@@ -13,7 +23,7 @@ libc_bionic_src_files_arm += \
arch-arm/cortex-a15/bionic/strlen.S \
libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
arch-arm/generic/bionic/memcmp.S \
-libc_bionic_src_files_arm += \
- arch-arm/denver/bionic/memmove.S \
+
diff --git a/libc/arch-arm/scorpion/scorpion.mk b/libc/arch-arm/scorpion/scorpion.mk
new file mode 100644
index 0000000..ce18a7e
--- /dev/null
+++ b/libc/arch-arm/scorpion/scorpion.mk
@@ -0,0 +1,18 @@
+# Use krait versions of memset/strcmp/memmove
+libc_bionic_src_files_arm += \
+ arch-arm/krait/bionic/memset.S \
+ arch-arm/krait/bionic/strcmp.S \
+ arch-arm/krait/bionic/memmove.S
+
+libc_bionic_src_files_arm += \
+ arch-arm/cortex-a15/bionic/memcpy.S \
+ arch-arm/cortex-a15/bionic/stpcpy.S \
+ arch-arm/cortex-a15/bionic/strcat.S \
+ arch-arm/cortex-a15/bionic/__strcat_chk.S \
+ arch-arm/cortex-a15/bionic/strcpy.S \
+ arch-arm/cortex-a15/bionic/__strcpy_chk.S \
+ arch-arm/cortex-a15/bionic/strlen.S
+
+libc_bionic_src_files_arm += \
+ arch-arm/generic/bionic/memchr.S \
+ arch-arm/generic/bionic/memcmp.S
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 470a038..1b8d534 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -8,7 +8,6 @@ libc_bionic_src_files_arm64 += \
bionic/__memset_chk.cpp \
bionic/__strcpy_chk.cpp \
bionic/__strcat_chk.cpp \
- bionic/strrchr.cpp \
libc_freebsd_src_files_arm64 += \
upstream-freebsd/lib/libc/string/wcscat.c \
diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk
index d619c11..3c453bb 100644
--- a/libc/arch-arm64/denver64/denver64.mk
+++ b/libc/arch-arm64/denver64/denver64.mk
@@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \
arch-arm64/generic/bionic/strlen.S \
arch-arm64/generic/bionic/strncmp.S \
arch-arm64/generic/bionic/strnlen.S \
+ arch-arm64/generic/bionic/strrchr.S \
arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S
index c5d42ce..f850624 100644
--- a/libc/arch-arm64/generic/bionic/memcpy_base.S
+++ b/libc/arch-arm64/generic/bionic/memcpy_base.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,158 +22,196 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses.
*
*/
+#include <private/bionic_asm.h>
+
#define dstin x0
#define src x1
#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
-
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
-
- mov dst, dstin
- cmp count, #64
- b.ge .Lcpy_not_short
- cmp count, #15
- b.le .Ltail15tiny
-
- /* Deal with small copies quickly by dropping straight into the
- * exit block. */
-.Ltail63:
- /* Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-
-.Ltail15:
- ands count, count, #15
- beq 1f
- add src, src, count
- ldp A_l, A_h, [src, #-16]
- add dst, dst, count
- stp A_l, A_h, [dst, #-16]
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define A_hw w7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x9
+
+#define L(l) .L ## l
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled. Large copies
+ of more than 96 bytes align the destination and use an unrolled loop
+ processing 64 bytes per iteration.
+ Small and medium copies read all data before writing, allowing any
+ kind of overlap, and memmove tailcalls memcpy for these cases as
+ well as non-overlapping copies.
+*/
+
+ prfm PLDL1KEEP, [src]
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
+ cmp count, 96
+ b.hi L(copy_long)
+
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
-.Ltail15tiny:
- /* Copy up to 15 bytes of data. Does not assume additional data
- being copied. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
+ .p2align 4
+
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
1:
+ tbz count, 2, 1f
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
ret
-.Lcpy_not_short:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Copy more data than needed; it's faster than jumping
- * around copying sub-Quadword quantities. We know that
- * it can't overrun. */
- ldp A_l, A_h, [src]
- add src, src, tmp2
- stp A_l, A_h, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63
-2:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /* Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne .Ltail63
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+1:
+ cbz count, 2f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+2: ret
+
+ .p2align 4
+ /* Copy 64..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32]
+ ldp F_l, F_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
ret
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lcpy_body_large:
- /* There are at least 128 bytes to copy. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
+ /* Align DST to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls 2f
1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne .Ltail63
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+2:
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
ret
diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S
index 8b366a3..c50112d 100644
--- a/libc/arch-arm64/generic/bionic/memmove.S
+++ b/libc/arch-arm64/generic/bionic/memmove.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,319 +22,131 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
- * wchar_t is 4 bytes
+ * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes
*/
#include <private/bionic_asm.h>
/* Parameters and result. */
-#ifdef BCOPY
-#define origdstin x1
-#define origsrc x0
-#endif
#define dstin x0
#define src x1
#define count x2
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define tmp3 x5
-#define tmp3w w5
-#define dst x6
-
-#define A_l x7
-#define A_h x8
-#define B_l x9
-#define B_h x10
-#define C_l x11
-#define C_h x12
-#define D_l x13
-#define D_h x14
+#define srcend x3
+#define dstend x4
+#define tmp1 x5
+#define A_l x6
+#define A_h x7
+#define B_l x8
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l count
+#define E_h tmp1
+
+/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+ Larger backwards copies are also handled by memcpy. The only remaining
+ case is forward large copies. The destination is aligned, and an
+ unrolled loop processes 64 bytes per iteration.
+*/
-#ifdef BCOPY
-ENTRY(bcopy)
- /* Swap src and dst so that a branch to memcpy doesn't cause issues. */
- mov tmp1, origsrc
- mov origsrc, origdstin
- mov origdstin, tmp1
-#elif defined(WMEMMOVE)
+#if defined(WMEMMOVE)
ENTRY(wmemmove)
lsl count, count, #2
#else
ENTRY(memmove)
#endif
- cmp dstin, src
- b.lo .Ldownwards
- add tmp1, src, count
- cmp dstin, tmp1
- b.hs memcpy /* No overlap. */
-
- /* Upwards move with potential overlap.
- * Need to move from the tail backwards. SRC and DST point one
- * byte beyond the remaining data to move. */
- add dst, dstin, count
- add src, src, count
- cmp count, #64
- b.ge .Lmov_not_short_up
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-.Ltail63up:
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15up
- sub dst, dst, tmp1
- sub src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #32]
- stp A_l, A_h, [dst, #32]
-1:
- ldp A_l, A_h, [src, #16]
- stp A_l, A_h, [dst, #16]
-2:
- ldp A_l, A_h, [src]
- stp A_l, A_h, [dst]
-.Ltail15up:
- /* Move up to 15 bytes of data. Does not assume additional data
- * being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src, #-1]
- strb tmp1w, [dst, #-1]
-1:
- ret
-
-.Lmov_not_short_up:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- ands tmp2, src, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src, #-8]!
- str tmp1, [dst, #-8]!
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src, #-4]!
- str tmp1w, [dst, #-4]!
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src, #-2]!
- strh tmp1w, [dst, #-2]!
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src, #-1]!
- strb tmp1w, [dst, #-1]!
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63up
+ sub tmp1, dstin, src
+ cmp count, 96
+ ccmp tmp1, count, 2, hi
+ b.hs memcpy
+
+ cbz tmp1, 3f
+ add dstend, dstin, count
+ add srcend, src, count
+
+ /* Align dstend to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 64 bytes per iteration and prefetches one iteration ahead. */
+
+ and tmp1, dstend, 15
+ ldp D_l, D_h, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+ nop
+1:
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
2:
- subs count, count, #128
- b.ge .Lmov_body_large_up
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src, #-64]!
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst, #-64]!
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- b.ne .Ltail63up
- ret
-
- /* Critical loop. Start at a new Icache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lmov_body_large_up:
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #-16]
- ldp B_l, B_h, [src, #-32]
- ldp C_l, C_h, [src, #-48]
- ldp D_l, D_h, [src, #-64]!
-1:
- stp A_l, A_h, [dst, #-16]
- ldp A_l, A_h, [src, #-16]
- stp B_l, B_h, [dst, #-32]
- ldp B_l, B_h, [src, #-32]
- stp C_l, C_h, [dst, #-48]
- ldp C_l, C_h, [src, #-48]
- stp D_l, D_h, [dst, #-64]!
- ldp D_l, D_h, [src, #-64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #-16]
- stp B_l, B_h, [dst, #-32]
- stp C_l, C_h, [dst, #-48]
- stp D_l, D_h, [dst, #-64]!
- tst count, #0x3f
- b.ne .Ltail63up
- ret
-
-
-.Ldownwards:
- /* For a downwards move we can safely use memcpy provided that
- * DST is more than 16 bytes away from SRC. */
- sub tmp1, src, #16
- cmp dstin, tmp1
- b.ls memcpy /* May overlap, but not critically. */
-
- mov dst, dstin /* Preserve DSTIN for return value. */
- cmp count, #64
- b.ge .Lmov_not_short_down
-
- /* Deal with small moves quickly by dropping straight into the
- * exit block. */
-.Ltail63down:
- /* Move up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate. */
- ands tmp1, count, #0x30
- b.eq .Ltail15down
- add dst, dst, tmp1
- add src, src, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp A_l, A_h, [src, #-48]
- stp A_l, A_h, [dst, #-48]
-1:
- ldp A_l, A_h, [src, #-32]
- stp A_l, A_h, [dst, #-32]
-2:
- ldp A_l, A_h, [src, #-16]
- stp A_l, A_h, [dst, #-16]
-.Ltail15down:
- /* Move up to 15 bytes of data. Does not assume additional data
- being moved. */
- tbz count, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz count, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz count, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz count, #0, 1f
- ldrb tmp1w, [src]
- strb tmp1w, [dst]
-1:
- ret
-
-.Lmov_not_short_down:
- /* We don't much care about the alignment of DST, but we want SRC
- * to be 128-bit (16 byte) aligned so that we don't cross cache line
- * boundaries on both loads and stores. */
- neg tmp2, src
- ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
- b.eq 2f
- sub count, count, tmp2
- /* Move enough data to reach alignment; unlike memcpy, we have to
- * be aware of the overlap, which means we can't move data twice. */
- tbz tmp2, #3, 1f
- ldr tmp1, [src], #8
- str tmp1, [dst], #8
-1:
- tbz tmp2, #2, 1f
- ldr tmp1w, [src], #4
- str tmp1w, [dst], #4
-1:
- tbz tmp2, #1, 1f
- ldrh tmp1w, [src], #2
- strh tmp1w, [dst], #2
-1:
- tbz tmp2, #0, 1f
- ldrb tmp1w, [src], #1
- strb tmp1w, [dst], #1
-1:
-
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63down
-2:
- subs count, count, #128
- b.ge .Lmov_body_large_down
- /* Less than 128 bytes to move, so handle 64 here and then jump
- * to the tail. */
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]
- stp A_l, A_h, [dst]
- stp B_l, B_h, [dst, #16]
- stp C_l, C_h, [dst, #32]
- stp D_l, D_h, [dst, #48]
- tst count, #0x3f
- add src, src, #64
- add dst, dst, #64
- b.ne .Ltail63down
- ret
-
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line. */
- .p2align 6
-.Lmov_body_large_down:
- /* There are at least 128 bytes to move. */
- ldp A_l, A_h, [src, #0]
- sub dst, dst, #16 /* Pre-bias. */
- ldp B_l, B_h, [src, #16]
- ldp C_l, C_h, [src, #32]
- ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
-1:
- stp A_l, A_h, [dst, #16]
- ldp A_l, A_h, [src, #16]
- stp B_l, B_h, [dst, #32]
- ldp B_l, B_h, [src, #32]
- stp C_l, C_h, [dst, #48]
- ldp C_l, C_h, [src, #48]
- stp D_l, D_h, [dst, #64]!
- ldp D_l, D_h, [src, #64]!
- subs count, count, #64
- b.ge 1b
- stp A_l, A_h, [dst, #16]
- stp B_l, B_h, [dst, #32]
- stp C_l, C_h, [dst, #48]
- stp D_l, D_h, [dst, #64]
- add src, src, #16
- add dst, dst, #64 + 16
- tst count, #0x3f
- b.ne .Ltail63down
- ret
-#ifdef BCOPY
-END(bcopy)
-#elif defined(WMEMMOVE)
+ ldp E_l, E_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp E_l, E_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+3: ret
+
+#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S
index 7c204b4..4b3b17b 100644
--- a/libc/arch-arm64/generic/bionic/memset.S
+++ b/libc/arch-arm64/generic/bionic/memset.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, Linaro Limited
+/* Copyright (c) 2012-2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -22,226 +22,207 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/*
+ * Copyright (c) 2015 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Unaligned accesses
+ * ARMv8-a, AArch64, unaligned accesses
*
*/
#include <private/bionic_asm.h>
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use).
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
-#define dstin x0
-#ifdef BZERO
-#define count x1
-#else
-#define count x2
-#endif
-#define val w1
-#define tmp1 x3
-#define tmp1w w3
-#define tmp2 x4
-#define tmp2w w4
-#define zva_len_x x5
-#define zva_len w5
-#define zva_bits_x x6
-
-#define A_l x7
-#define A_lw w7
-#define dst x8
-#define tmp3w w9
-
-#ifdef BZERO
-ENTRY(bzero)
-#else
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp1w w5
+#define tmp2 x6
+#define tmp2w w6
+#define zva_len x7
+#define zva_lenw w7
+
+#define L(l) .L ## l
+
ENTRY(memset)
-#endif
-
- mov dst, dstin /* Preserve return value. */
-#ifdef BZERO
- b .Lzero_mem
-#endif
- ands A_lw, val, #255
- b.eq .Lzero_mem
- orr A_lw, A_lw, A_lw, lsl #8
- orr A_lw, A_lw, A_lw, lsl #16
- orr A_l, A_l, A_l, lsl #32
-.Ltail_maybe_long:
- cmp count, #64
- b.ge .Lnot_short
-.Ltail_maybe_tiny:
- cmp count, #15
- b.le .Ltail15tiny
-.Ltail63:
- ands tmp1, count, #0x30
- b.eq .Ltail15
- add dst, dst, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- stp A_l, A_l, [dst, #-48]
-1:
- stp A_l, A_l, [dst, #-32]
-2:
- stp A_l, A_l, [dst, #-16]
-
-.Ltail15:
- and count, count, #15
- add dst, dst, count
- stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
- ret
-.Ltail15tiny:
- /* Set up to 15 bytes. Does not assume earlier memory
- being set. */
- tbz count, #3, 1f
- str A_l, [dst], #8
-1:
- tbz count, #2, 1f
- str A_lw, [dst], #4
-1:
- tbz count, #1, 1f
- strh A_lw, [dst], #2
-1:
- tbz count, #0, 1f
- strb A_lw, [dst]
-1:
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ nop
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
ret
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line, this ensures the entire loop is in one line. */
- .p2align 6
-.Lnot_short:
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 2f
- /* Bring DST to 128-bit (16-byte) alignment. We know that there's
- * more than that to set, so we simply store 16 bytes and advance by
- * the amount required to reach alignment. */
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le .Ltail63
-2:
- sub dst, dst, #16 /* Pre-bias. */
- sub count, count, #64
-1:
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- stp A_l, A_l, [dst, #48]
- stp A_l, A_l, [dst, #64]!
- subs count, count, #64
- b.ge 1b
- tst count, #0x3f
- add dst, dst, #16
- b.ne .Ltail63
+ .p2align 3
+ nop
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 256
+ ccmp valw, 0, 0, cs
+ b.eq L(try_zva)
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ add dst, dst, 16
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+L(tail64):
+ subs count, count, 64
+ b.hi 1b
+2: stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
- /* For zeroing memory, check to see if we can use the ZVA feature to
- * zero entire 'cache' lines. */
-.Lzero_mem:
- mov A_l, #0
- cmp count, #63
- b.le .Ltail_maybe_tiny
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 1f
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- cmp count, #63
- b.le .Ltail63
-1:
- /* For zeroing small amounts of memory, it's not worth setting up
- * the line-clear code. */
- cmp count, #128
- b.lt .Lnot_short
-#ifdef MAYBE_VIRT
- /* For efficiency when virtualized, we cache the ZVA capability. */
- adrp tmp2, .Lcache_clear
- ldr zva_len, [tmp2, #:lo12:.Lcache_clear]
- tbnz zva_len, #31, .Lnot_short
- cbnz zva_len, .Lzero_by_line
+ .p2align 3
+L(try_zva):
mrs tmp1, dczid_el0
- tbz tmp1, #4, 1f
- /* ZVA not available. Remember this for next time. */
- mov zva_len, #~0
- str zva_len, [tmp2, #:lo12:.Lcache_clear]
- b .Lnot_short
-1:
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
- str zva_len, [tmp2, #:lo12:.Lcache_clear]
-#else
- mrs tmp1, dczid_el0
- tbnz tmp1, #4, .Lnot_short
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
-#endif
-
-.Lzero_by_line:
- /* Compute how far we need to go to become suitably aligned. We're
- * already at quad-word alignment. */
- cmp count, zva_len_x
- b.lt .Lnot_short /* Not enough to reach alignment. */
- sub zva_bits_x, zva_len_x, #1
- neg tmp2, dst
- ands tmp2, tmp2, zva_bits_x
- b.eq 1f /* Already aligned. */
- /* Not aligned, check that there's enough to copy after alignment. */
- sub tmp1, count, tmp2
- cmp tmp1, #64
- ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
- b.lt .Lnot_short
- /* We know that there's at least 64 bytes to zero and that it's safe
- * to overrun by 64 bytes. */
- mov count, tmp1
-2:
- stp A_l, A_l, [dst]
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- subs tmp2, tmp2, #64
- stp A_l, A_l, [dst, #48]
- add dst, dst, #64
- b.ge 2b
- /* We've overrun a bit, so adjust dst downwards. */
- add dst, dst, tmp2
-1:
- sub count, count, zva_len_x
-3:
- dc zva, dst
- add dst, dst, zva_len_x
- subs count, count, zva_len_x
- b.ge 3b
- ands count, count, zva_bits_x
- b.ne .Ltail_maybe_long
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
+L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+ nop
+1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi 1b
+ stp q0, q0, [dst, 0]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
ret
-#ifdef BZERO
-END(bzero)
-#else
+
+ .p2align 3
+L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+ b.ne L(zva_other)
+
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]
+ stp q0, q0, [dst, 96]
+ bic dst, dst, 127
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+128 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+1: dc zva, dst
+ add dst, dst, 128
+ subs count, count, 128
+ b.hi 1b
+ stp q0, q0, [dstend, -128]
+ stp q0, q0, [dstend, -96]
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(zva_other):
+ mov tmp2w, 4
+ lsl zva_lenw, tmp2w, tmp1w
+ add tmp1, zva_len, 64 /* Max alignment bytes written. */
+ cmp count, tmp1
+ blo L(no_zva)
+
+ sub tmp2, zva_len, 1
+ add tmp1, dst, zva_len
+ add dst, dst, 16
+ subs count, tmp1, dst /* Actual alignment bytes to write. */
+ bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
+ beq 2f
+1: stp q0, q0, [dst], 64
+ stp q0, q0, [dst, -32]
+ subs count, count, 64
+ b.hi 1b
+2: mov dst, tmp1
+ sub count, dstend, tmp1 /* Remaining bytes to write. */
+ subs count, count, zva_len
+ b.lo 4f
+3: dc zva, dst
+ add dst, dst, zva_len
+ subs count, count, zva_len
+ b.hs 3b
+4: add count, count, zva_len
+ b L(tail64)
+
END(memset)
-#endif
-
-#ifdef MAYBE_VIRT
- .bss
- .p2align 2
-.Lcache_clear:
- .space 4
-#endif
diff --git a/libc/arch-arm64/generic/bionic/strlen.S b/libc/arch-arm64/generic/bionic/strlen.S
index 3bd9809..6e540fc 100644
--- a/libc/arch-arm64/generic/bionic/strlen.S
+++ b/libc/arch-arm64/generic/bionic/strlen.S
@@ -1,16 +1,16 @@
-/* Copyright (c) 2014, Linaro Limited
+/* Copyright (c) 2013-2015, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
+ notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
* Neither the name of the Linaro nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@@ -22,16 +22,19 @@
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
#include <private/bionic_asm.h>
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
+
/* Arguments and results. */
#define srcin x0
#define len x0
@@ -40,87 +43,185 @@
#define src x1
#define data1 x2
#define data2 x3
-#define data2a x4
-#define has_nul1 x5
-#define has_nul2 x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define tmp4 x10
-#define zeroones x11
-#define pos x12
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define L(l) .L ## l
+
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. A faster check
+ (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
+ false hits for characters 129..255. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
- /* Start of critial section -- keep to one 64Byte cache line. */
+#ifdef TEST_PAGE_CROSS
+# define MIN_PAGE_SIZE 15
+#else
+# define MIN_PAGE_SIZE 4096
+#endif
+
+ /* Since strings are short on average, we check the first 16 bytes
+ of the string for a NUL character. In order to do an unaligned ldp
+ safely we have to do a page cross check first. If there is a NUL
+ byte we calculate the length from the 2 8-byte words using
+ conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 16 bytes, we align src so don't need
+ further page cross checks, and process 32 bytes per iteration
+ using the fast NUL check. If we encounter non-ASCII characters,
+ fallback to a second loop using the full NUL check.
+
+ If the page cross check fails, we read 16 bytes from an aligned
+ address, remove any characters before the string, and continue
+ in the main loop using aligned loads. Since strings crossing a
+ page in the first 16 bytes are rare (probability of
+ 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
+
+ AArch64 systems have a minimum page size of 4k. We don't bother
+ checking for larger page sizes - the cost of setting up the correct
+ page size is just not worth the extra gain from a small reduction in
+ the cases taking the slow path. Note that we only care about
+ whether the first fetch, which may be misaligned, crosses a page
+ boundary. */
+
ENTRY(strlen)
- mov zeroones, #REP8_01
- bic src, srcin, #15
- ands tmp1, srcin, #15
- b.ne .Lmisaligned
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-.Lloop:
- ldp data1, data2, [src], #16
-.Lrealigned:
+ and tmp1, srcin, MIN_PAGE_SIZE - 1
+ mov zeroones, REP8_01
+ cmp tmp1, MIN_PAGE_SIZE - 16
+ b.gt L(page_cross)
+ ldp data1, data2, [srcin]
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul1/2 directly.
+ Since we expect strings to be small and early-exit,
+ byte-swap the data now so has_null1/2 will be correct. */
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+ orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lloop
- /* End of critical section -- keep to one 64Byte cache line. */
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(main_loop_entry)
- sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
-#ifdef __AARCH64EB__
- mov data2, data1
-#endif
- sub len, len, #8
- mov has_nul2, has_nul1
-.Lnul_in_data2:
+ /* Enter with C = has_nul1 == 0. */
+ csel has_nul1, has_nul1, has_nul2, cc
+ mov len, 8
+ rev has_nul1, has_nul1
+ clz tmp1, has_nul1
+ csel len, xzr, len, cc
+ add len, len, tmp1, lsr 3
+ ret
+
+ /* The inner loop processes 32 bytes per iteration and uses the fast
+ NUL check. If we encounter non-ASCII characters, use a second
+ loop with the accurate NUL check. */
+ .p2align 4
+L(main_loop_entry):
+ bic src, srcin, 15
+ sub src, src, 16
+L(main_loop):
+ ldp data1, data2, [src, 32]!
+.Lpage_cross_entry:
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ bne 1f
+ ldp data1, data2, [src, 16]
+ sub tmp1, data1, zeroones
+ sub tmp3, data2, zeroones
+ orr tmp2, tmp1, tmp3
+ tst tmp2, zeroones, lsl 7
+ beq L(main_loop)
+ add src, src, 16
+1:
+ /* The fast check failed, so do the slower, accurate NUL check. */
+ orr tmp2, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+
+ /* Enter with C = has_nul1 == 0. */
+L(tail):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
+ string is 0x01) means we cannot use has_nul1/2 directly. The
easiest way to get the correct byte is to byte-swap the data
and calculate the syndrome a second time. */
- rev data2, data2
- sub tmp1, data2, zeroones
- orr tmp2, data2, #REP8_7f
- bic has_nul2, tmp1, tmp2
+ csel data1, data1, data2, cc
+ rev data1, data1
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ bic has_nul1, tmp1, tmp2
+#else
+ csel has_nul1, has_nul1, has_nul2, cc
#endif
- sub len, len, #8
- rev has_nul2, has_nul2
- clz pos, has_nul2
- add len, len, pos, lsr #3 /* Bits to bytes. */
+ sub len, src, srcin
+ rev has_nul1, has_nul1
+ add tmp2, len, 8
+ clz tmp1, has_nul1
+ csel len, len, tmp2, cc
+ add len, len, tmp1, lsr 3
ret
-.Lmisaligned:
- cmp tmp1, #8
- neg tmp1, tmp1
- ldp data1, data2, [src], #16
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- mov tmp2, #~0
+L(nonascii_loop):
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ bne L(tail)
+ ldp data1, data2, [src, 16]!
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, REP8_7f
+ bics has_nul1, tmp1, tmp2
+ bic has_nul2, tmp3, tmp4
+ ccmp has_nul2, 0, 0, eq
+ beq L(nonascii_loop)
+ b L(tail)
+
+ /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
+ srcin to 0x7f, so we ignore any NUL bytes before the string.
+ Then continue in the aligned loop. */
+L(page_cross):
+ bic src, srcin, 15
+ ldp data1, data2, [src]
+ lsl tmp1, srcin, 3
+ mov tmp4, -1
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ /* Big-endian. Early bytes are at MSB. */
+ lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#else
/* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
#endif
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- csinv data1, data1, xzr, le
- csel data2, data2, data2a, le
- b .Lrealigned
+ orr tmp1, tmp1, REP8_80
+ orn data1, data1, tmp1
+ orn tmp2, data2, tmp1
+ tst srcin, 8
+ csel data1, data1, tmp4, eq
+ csel data2, data2, tmp2, eq
+ b L(page_cross_entry)
END(strlen)
diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S
new file mode 100644
index 0000000..46b5031
--- /dev/null
+++ b/libc/arch-arm64/generic/bionic/strrchr.S
@@ -0,0 +1,171 @@
+/*
+ strrchr - find last instance of a character in a string
+
+ Copyright (c) 2014, ARM Limited
+ All rights Reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include <private/bionic_asm.h>
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+/* Locals and temporaries. */
+
+ENTRY(strrchr)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq .Laligned
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vhas_nul1.2d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ mov chr_match, vhas_chr1.2d[0]
+ lsr tmp3, const_m1, tmp1
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, .Ltail
+
+.Lloop:
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+.Laligned:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.2d[0]
+ mov chr_match, vhas_chr1.2d[0]
+ cbz nul_match, .Lloop
+
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.2d[0]
+
+.Ltail:
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END(strrchr)
diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk
index 1b595aa..4512dc5 100644
--- a/libc/arch-arm64/generic/generic.mk
+++ b/libc/arch-arm64/generic/generic.mk
@@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \
arch-arm64/generic/bionic/strlen.S \
arch-arm64/generic/bionic/strncmp.S \
arch-arm64/generic/bionic/strnlen.S \
+ arch-arm64/generic/bionic/strrchr.S \
arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-arm64/kryo/bionic/memcpy.S b/libc/arch-arm64/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..87e1b3b
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy.S
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ENTRY(__memcpy_chk)
+ cmp x2, x3
+ b.hi __memcpy_chk_fail
+
+ // Fall through to memcpy...
+ b memcpy
+END(__memcpy_chk)
+
+ .align 6
+ENTRY(memcpy)
+ #include "memcpy_base.S"
+END(memcpy)
+
+ENTRY_PRIVATE(__memcpy_chk_fail)
+ // Preserve for accurate backtrace.
+ stp x29, x30, [sp, -16]!
+ .cfi_def_cfa_offset 16
+ .cfi_rel_offset x29, 0
+ .cfi_rel_offset x30, 8
+
+ adrp x0, error_string
+ add x0, x0, :lo12:error_string
+ ldr x1, error_code
+ bl __fortify_chk_fail
+error_code:
+ .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+END(__memcpy_chk_fail)
+
+ .data
+ .align 2
+error_string:
+ .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm64/kryo/bionic/memcpy_base.S b/libc/arch-arm64/kryo/bionic/memcpy_base.S
new file mode 100644
index 0000000..0096bb7
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy_base.S
@@ -0,0 +1,244 @@
+/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of The Linux Foundation nor the names of its contributors may
+ * be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef PLDOFFS
+#undef PLDOFFS
+#endif
+#define PLDOFFS (16)
+
+#ifdef PLDTHRESH
+#undef PLDTHRESH
+#endif
+#define PLDTHRESH (PLDOFFS)
+
+#ifdef BBTHRESH
+#undef BBTHRESH
+#endif
+#define BBTHRESH (2048/128)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+#ifdef PLDSIZE
+#undef PLDSIZE
+#endif
+#define PLDSIZE (128)
+
+kryo_bb_memcpy:
+ mov x11, x0
+ cmp x2, #4
+ blo kryo_bb_lt4
+ cmp x2, #16
+ blo kryo_bb_lt16
+ cmp x2, #32
+ blo kryo_bb_16
+ cmp x2, #64
+ blo kryo_bb_copy_32_a
+ cmp x2, #128
+ blo kryo_bb_copy_64_a
+
+ // we have at least 127 bytes to achieve 128-byte alignment
+ neg x3, x1 // calculate count to get SOURCE aligned
+ ands x3, x3, #0x7F
+ b.eq kryo_bb_source_aligned // already aligned
+ // alignment fixup, small to large (favorable alignment)
+ tbz x3, #0, 1f
+ ldrb w5, [x1], #1
+ strb w5, [x0], #1
+1: tbz x3, #1, 2f
+ ldrh w6, [x1], #2
+ strh w6, [x0], #2
+2: tbz x3, #2, 3f
+ ldr w8, [x1], #4
+ str w8, [x0], #4
+3: tbz x3, #3, 4f
+ ldr x9, [x1], #8
+ str x9, [x0], #8
+4: tbz x3, #4, 5f
+ ldr q7, [x1], #16
+ str q7, [x0], #16
+5: tbz x3, #5, 55f
+ ldp q0, q1, [x1], #32
+ stp q0, q1, [x0], #32
+55: tbz x3, #6, 6f
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+6: subs x2, x2, x3 // fixup count after alignment
+ b.eq kryo_bb_exit
+ cmp x2, #128
+ blo kryo_bb_copy_64_a
+kryo_bb_source_aligned:
+ lsr x12, x2, #7
+ cmp x12, #PLDTHRESH
+ bls kryo_bb_copy_128_loop_nopld
+
+ cmp x12, #BBTHRESH
+ bls kryo_bb_prime_pump
+
+ add x14, x0, #0x400
+ add x9, x1, #(PLDOFFS*PLDSIZE)
+ sub x14, x14, x9
+ lsl x14, x14, #(21+32)
+ lsr x14, x14, #(21+32)
+ add x14, x14, #(PLDOFFS*PLDSIZE)
+ cmp x12, x14, lsr #7
+ bls kryo_bb_prime_pump
+
+ mov x9, #(PLDOFFS)
+ lsr x13, x14, #7
+ subs x9, x13, x9
+ bls kryo_bb_prime_pump
+
+ add x10, x1, x14
+ bic x10, x10, #0x7F // Round to multiple of PLDSIZE
+
+ sub x12, x12, x14, lsr #7
+ cmp x9, x12
+ sub x13, x12, x9
+ csel x12, x13, x12, LS
+ csel x9, x12, x9, HI
+ csel x12, xzr, x12, HI
+
+ prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
+ prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
+kryo_bb_copy_128_loop_outer_doublepld:
+ prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
+ prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
+ subs x9, x9, #1
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ prfm PLDL1KEEP, [x10]
+ prfm PLDL1KEEP, [x10, #64]
+ add x10, x10, #128
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_outer_doublepld
+ cmp x12, #0
+ beq kryo_bb_pop_before_nopld
+ cmp x12, #(448*1024/128)
+ bls kryo_bb_copy_128_loop_outer
+
+kryo_bb_copy_128_loop_ddr:
+ subs x12, x12, #1
+ ldr x3, [x10], #128
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_ddr
+ b kryo_bb_pop_before_nopld
+
+kryo_bb_prime_pump:
+ mov x14, #(PLDOFFS*PLDSIZE)
+ add x10, x1, #(PLDOFFS*PLDSIZE)
+ bic x10, x10, #0x7F
+ sub x12, x12, #PLDOFFS
+ prfm PLDL1KEEP, [x10, #(-1*PLDSIZE)]
+ prfm PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
+ cmp x12, #(448*1024/128)
+ bhi kryo_bb_copy_128_loop_ddr
+
+kryo_bb_copy_128_loop_outer:
+ subs x12, x12, #1
+ prfm PLDL1KEEP, [x10]
+ prfm PLDL1KEEP, [x10, #64]
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ add x10, x10, #128
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_outer
+
+kryo_bb_pop_before_nopld:
+ lsr x12, x14, #7
+kryo_bb_copy_128_loop_nopld:
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ ldp q4, q5, [x1], #32
+ ldp q6, q7, [x1], #32
+ subs x12, x12, #1
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+ stp q4, q5, [x0], #32
+ stp q6, q7, [x0], #32
+ bne kryo_bb_copy_128_loop_nopld
+ ands x2, x2, #0x7f
+ beq kryo_bb_exit
+
+kryo_bb_copy_64_a:
+ tbz x2, #6, kryo_bb_copy_32_a
+ ldp q0, q1, [x1], #32
+ ldp q2, q3, [x1], #32
+ stp q0, q1, [x0], #32
+ stp q2, q3, [x0], #32
+kryo_bb_copy_32_a:
+ tbz x2, #5, kryo_bb_16
+ ldp q0, q1, [x1], #32
+ stp q0, q1, [x0], #32
+kryo_bb_16:
+ tbz x2, #4, kryo_bb_lt16
+ ldr q7, [x1], #16
+ str q7, [x0], #16
+ ands x2, x2, #0x0f
+ beq kryo_bb_exit
+kryo_bb_lt16:
+ tbz x2, #3, kryo_bb_lt8
+ ldr x3, [x1], #8
+ str x3, [x0], #8
+kryo_bb_lt8:
+ tbz x2, #2, kryo_bb_lt4
+ ldr w3, [x1], #4
+ str w3, [x0], #4
+kryo_bb_lt4:
+ tbz x2, #1, kryo_bb_lt2
+ ldrh w3, [x1], #2
+ strh w3, [x0], #2
+kryo_bb_lt2:
+ tbz x2, #0, kryo_bb_exit
+ ldrb w3, [x1], #1
+ strb w3, [x0], #1
+kryo_bb_exit:
+ mov x0, x11
+ ret
+
diff --git a/libc/arch-arm64/kryo/kryo.mk b/libc/arch-arm64/kryo/kryo.mk
new file mode 100644
index 0000000..1d901d0
--- /dev/null
+++ b/libc/arch-arm64/kryo/kryo.mk
@@ -0,0 +1,15 @@
+libc_bionic_src_files_arm64 += \
+ arch-arm64/generic/bionic/memchr.S \
+ arch-arm64/generic/bionic/memcmp.S \
+ arch-arm64/kryo/bionic/memcpy.S \
+ arch-arm64/generic/bionic/memmove.S \
+ arch-arm64/generic/bionic/memset.S \
+ arch-arm64/generic/bionic/stpcpy.S \
+ arch-arm64/generic/bionic/strchr.S \
+ arch-arm64/generic/bionic/strcmp.S \
+ arch-arm64/generic/bionic/strcpy.S \
+ arch-arm64/generic/bionic/strlen.S \
+ arch-arm64/generic/bionic/strncmp.S \
+ arch-arm64/generic/bionic/strnlen.S \
+ arch-arm64/generic/bionic/strrchr.S \
+ arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 0dbffad..6a5afd6 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -91,9 +91,6 @@ name: \
.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
ENTRANCE
-#ifdef USE_AS_BCOPY
- xchg %rsi, %rdi
-#endif
mov %rdi, %rax
/* Check whether we should copy backward or forward. */
diff --git a/libc/bionic/malloc_debug_check.cpp b/libc/bionic/malloc_debug_check.cpp
index dee03fa..ad0e613 100644
--- a/libc/bionic/malloc_debug_check.cpp
+++ b/libc/bionic/malloc_debug_check.cpp
@@ -45,6 +45,7 @@
#include <time.h>
#include <unistd.h>
#include <unwind.h>
+#include <signal.h>
#include "debug_mapinfo.h"
#include "debug_stacktrace.h"
@@ -55,6 +56,14 @@
#include "private/libc_logging.h"
#include "private/ScopedPthreadMutexLocker.h"
+static unsigned int malloc_sig_enabled = 0;
+static unsigned int min_allocation_report_limit;
+static unsigned int max_allocation_limit;
+static const char* process_name;
+static size_t total_count = 0;
+static bool isDumped = false;
+static bool sigHandled = false;
+
#define MAX_BACKTRACE_DEPTH 16
#define ALLOCATION_TAG 0x1ee7d00d
#define BACKLOG_TAG 0xbabecafe
@@ -63,6 +72,11 @@
#define FRONT_GUARD_LEN (1<<5)
#define REAR_GUARD 0xbb
#define REAR_GUARD_LEN (1<<5)
+#define FRONT_GUARD_SS 0xab
+#define DEBUG_SIGNAL SIGWINCH
+
+static void malloc_sigaction(int signum, siginfo_t * sg, void * cxt);
+static struct sigaction default_sa;
static void log_message(const char* format, ...) {
va_list args;
@@ -135,9 +149,14 @@ static inline void init_front_guard(hdr_t* hdr) {
memset(hdr->front_guard, FRONT_GUARD, FRONT_GUARD_LEN);
}
+static inline void set_snapshot(hdr_t* hdr) {
+ memset(hdr->front_guard, FRONT_GUARD_SS, FRONT_GUARD_LEN);
+}
+
static inline bool is_front_guard_valid(hdr_t* hdr) {
for (size_t i = 0; i < FRONT_GUARD_LEN; i++) {
- if (hdr->front_guard[i] != FRONT_GUARD) {
+ if (!((hdr->front_guard[i] == FRONT_GUARD) ||
+ (hdr->front_guard[i] == FRONT_GUARD_SS))) {
return false;
}
}
@@ -171,6 +190,9 @@ static inline bool is_rear_guard_valid(hdr_t* hdr) {
}
static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+ if (hdr->tag == ALLOCATION_TAG) {
+ total_count += hdr->size;
+ }
hdr->prev = NULL;
hdr->next = *head;
if (*head)
@@ -181,6 +203,9 @@ static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
}
static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+ if (hdr->tag == ALLOCATION_TAG) {
+ total_count -= hdr->size;
+ }
if (hdr->prev) {
hdr->prev->next = hdr->next;
} else {
@@ -194,6 +219,25 @@ static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
return 0;
}
+static void snapshot_report_leaked_nodes() {
+ log_message("%s: %s\n", __FILE__, __FUNCTION__);
+ hdr_t * iterator = head;
+ size_t total_size = 0;
+ do {
+ if (iterator->front_guard[0] == FRONT_GUARD &&
+ iterator->size >= min_allocation_report_limit) {
+ log_message("obj %p, size %d", iterator, iterator->size);
+ total_size += iterator->size;
+ log_backtrace(iterator->bt, iterator->bt_depth);
+ log_message("------------------------------"); // as an end marker
+ // Marking the node as we do not want to print it again.
+ set_snapshot(iterator);
+ }
+ iterator = iterator->next;
+ } while (iterator);
+ log_message("Total Pending allocations after last snapshot: %d", total_size);
+}
+
static inline void add(hdr_t* hdr, size_t size) {
ScopedPthreadMutexLocker locker(&lock);
hdr->tag = ALLOCATION_TAG;
@@ -202,6 +246,11 @@ static inline void add(hdr_t* hdr, size_t size) {
init_rear_guard(hdr);
++g_allocated_block_count;
add_locked(hdr, &tail, &head);
+ if ((total_count >= max_allocation_limit) && !isDumped && malloc_sig_enabled) {
+ isDumped = true;
+ sigHandled = true; // Need to bypass the snapshot
+ kill(getpid(), DEBUG_SIGNAL);
+ }
}
static inline int del(hdr_t* hdr) {
@@ -233,7 +282,8 @@ static bool was_used_after_free(hdr_t* hdr) {
static inline int check_guards(hdr_t* hdr, int* safe) {
*safe = 1;
if (!is_front_guard_valid(hdr)) {
- if (hdr->front_guard[0] == FRONT_GUARD) {
+ if ((hdr->front_guard[0] == FRONT_GUARD) ||
+ ((hdr->front_guard[0] == FRONT_GUARD_SS))) {
log_message("+++ ALLOCATION %p SIZE %d HAS A CORRUPTED FRONT GUARD\n",
user(hdr), hdr->size);
} else {
@@ -656,6 +706,42 @@ extern "C" bool malloc_debug_initialize(HashTable* hash_table, const MallocDebug
__libc_format_log(ANDROID_LOG_INFO, "libc", "not gathering backtrace information\n");
}
+ if (__system_property_get("libc.debug.malloc", env)) {
+ if(atoi(env) == 40) malloc_sig_enabled = 1;
+ }
+
+ if (malloc_sig_enabled) {
+ char debug_proc_size[PROP_VALUE_MAX];
+ if (__system_property_get("libc.debug.malloc.maxprocsize", debug_proc_size))
+ max_allocation_limit = atoi(debug_proc_size);
+ else
+ max_allocation_limit = 30 * 1024 * 1024; // In Bytes [Default is 30 MB]
+ if (__system_property_get("libc.debug.malloc.minalloclim", debug_proc_size))
+ min_allocation_report_limit = atoi(debug_proc_size);
+ else
+ min_allocation_report_limit = 10 * 1024; // In Bytes [Default is 10 KB]
+ process_name = getprogname();
+ }
+
+/* Initializes malloc debugging framework.
+ * See comments on MallocDebugInit in malloc_debug_common.h
+ */
+ if (malloc_sig_enabled) {
+ struct sigaction sa; //local or static?
+ sa.sa_handler = NULL;
+ sa.sa_sigaction = malloc_sigaction;
+ sigemptyset(&sa.sa_mask);
+ sigaddset(&sa.sa_mask, DEBUG_SIGNAL);
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_restorer = NULL;
+ if (sigaction(DEBUG_SIGNAL, &sa, &default_sa) < 0) {
+ log_message("Failed to register signal handler w/ errno %s", strerror(errno));
+ malloc_sig_enabled = 0;
+ } else {
+ log_message("Registered signal handler");
+ sigHandled = false;
+ }
+ }
if (g_backtrace_enabled) {
backtrace_startup();
}
@@ -668,9 +754,66 @@ extern "C" void malloc_debug_finalize(int malloc_debug_level) {
if (malloc_debug_level == 10) {
ReportMemoryLeaks();
}
+ if (malloc_sig_enabled) {
+ log_message("Deregister %d signal handler", DEBUG_SIGNAL);
+ sigaction(DEBUG_SIGNAL, &default_sa, NULL);
+ malloc_sig_enabled = 0;
+ sigHandled = false;
+ }
if (g_backtrace_enabled) {
backtrace_shutdown();
}
pthread_setspecific(g_debug_calls_disabled, NULL);
}
+
+static void snapshot_nodes_locked() {
+ log_message("%s: %s\n", __FILE__, __FUNCTION__);
+ hdr_t * iterator = head;
+ do {
+ if (iterator->front_guard[0] == FRONT_GUARD) {
+ set_snapshot(iterator);
+ }
+ iterator = iterator->next;
+ } while (iterator);
+}
+
+static void malloc_sigaction(int signum, siginfo_t * info, void * context)
+{
+ log_message("%s: %s\n", __FILE__, __FUNCTION__);
+ log_message("%s got %d signal from PID: %d (context:%x)\n",
+ __func__, signum, info->si_pid, context);
+
+ if (signum != DEBUG_SIGNAL) {
+ log_message("RECEIVED %d instead of %d\n", signum, DEBUG_SIGNAL);
+ return;
+ }
+
+ log_message("Process under observation:%s", process_name);
+ log_message("Maximum process size limit:%d Bytes", max_allocation_limit);
+ log_message("Won't print allocation below %d Bytes", min_allocation_report_limit);
+ log_message("Total count: %d\n", total_count);
+
+ if (!head) {
+ log_message("No allocations?");
+ return;
+ }
+ // If sigHandled is false, meaning it's being handled first time
+ if (!sigHandled) {
+ sigHandled = true;
+ // Marking the nodes assuming that they should not be leaked nodes.
+ snapshot_nodes_locked();
+ } else {
+ // We need to print new allocations now
+ log_message("Start dumping allocations of the process %s", process_name);
+ log_message("+++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ ***\n");
+
+ // Print allocations of the process
+ if (g_backtrace_enabled)
+ snapshot_report_leaked_nodes();
+
+ log_message("*** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++\n");
+ log_message("Completed dumping allocations of the process %s", process_name);
+ }
+ return;
+}
diff --git a/libc/bionic/malloc_debug_common.cpp b/libc/bionic/malloc_debug_common.cpp
index ee796c6..12fc6dd 100644
--- a/libc/bionic/malloc_debug_common.cpp
+++ b/libc/bionic/malloc_debug_common.cpp
@@ -396,6 +396,9 @@ static void malloc_init_impl() {
}
so_name = "libc_malloc_debug_qemu.so";
break;
+ case 40:
+ so_name = "libc_malloc_debug_leak.so";
+ break;
default:
error_log("%s: Debug level %d is unknown\n", getprogname(), g_malloc_debug_level);
return;
@@ -456,6 +459,9 @@ static void malloc_init_impl() {
case 20:
InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "qemu_instrumented");
break;
+ case 40:
+ InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "chk");
+ break;
default:
break;
}
diff --git a/libc/bionic/mmap.cpp b/libc/bionic/mmap.cpp
index 8f25a89..53e8b46 100644
--- a/libc/bionic/mmap.cpp
+++ b/libc/bionic/mmap.cpp
@@ -36,6 +36,11 @@
extern "C" void* __mmap2(void*, size_t, int, int, int, size_t);
#define MMAP2_SHIFT 12 // 2**12 == 4096
+#ifdef LEGACY_MMAP
+#define TO_64(a) ((a) & 0x00000000ffffffff)
+#else
+#define TO_64(a) (a)
+#endif
static bool kernel_has_MADV_MERGEABLE = true;
@@ -60,5 +65,5 @@ void* mmap64(void* addr, size_t size, int prot, int flags, int fd, off64_t offse
}
void* mmap(void* addr, size_t size, int prot, int flags, int fd, off_t offset) {
- return mmap64(addr, size, prot, flags, fd, static_cast<off64_t>(offset));
+ return mmap64(addr, size, prot, flags, fd, TO_64(static_cast<off64_t>(offset)));
}
diff --git a/libc/include/paths.h b/libc/include/paths.h
index 82c2804..7700cdd 100644
--- a/libc/include/paths.h
+++ b/libc/include/paths.h
@@ -33,6 +33,7 @@
#define _PATHS_H_
#define _PATH_BSHELL "/system/bin/sh"
+#define _PATH_BSHELL2 "/sbin/sh"
#define _PATH_CONSOLE "/dev/console"
#define _PATH_DEFPATH "/sbin:/vendor/bin:/system/sbin:/system/bin:/system/xbin"
#define _PATH_DEV "/dev/"
diff --git a/libc/include/regex.h b/libc/include/regex.h
index aec38e3..b06a515 100644
--- a/libc/include/regex.h
+++ b/libc/include/regex.h
@@ -42,8 +42,9 @@
#include <sys/cdefs.h>
#include <sys/types.h>
-/* types */
-typedef off_t regoff_t;
+/* POSIX says regoff_t is at least as large as the larger of ptrdiff_t and
+ * ssize_t. BSD uses off_t, but that interacts badly with _FILE_OFFSET_BITS. */
+typedef ssize_t regoff_t;
typedef struct {
int re_magic;
diff --git a/libc/kernel/uapi/linux/android_alarm.h b/libc/kernel/uapi/linux/android_alarm.h
index 801a01e..9f2de28 100644
--- a/libc/kernel/uapi/linux/android_alarm.h
+++ b/libc/kernel/uapi/linux/android_alarm.h
@@ -28,28 +28,31 @@ enum android_alarm_type {
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
ANDROID_ALARM_ELAPSED_REALTIME,
ANDROID_ALARM_SYSTEMTIME,
+ ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
ANDROID_ALARM_TYPE_COUNT,
-};
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+};
enum android_alarm_return_flags {
ANDROID_ALARM_RTC_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_WAKEUP,
ANDROID_ALARM_RTC_MASK = 1U << ANDROID_ALARM_RTC,
- ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+ ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP,
ANDROID_ALARM_ELAPSED_REALTIME_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME,
ANDROID_ALARM_SYSTEMTIME_MASK = 1U << ANDROID_ALARM_SYSTEMTIME,
+ ANDROID_ALARM_RTC_POWEROFF_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_POWEROFF_WAKEUP,
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
ANDROID_ALARM_TIME_CHANGE_MASK = 1U << 16
};
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_CLEAR(type) _IO('a', 0 | ((type) << 4))
#define ANDROID_ALARM_WAIT _IO('a', 1)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ALARM_IOW(c,type,size) _IOW('a', (c) | ((type) << 4), size)
#define ANDROID_ALARM_SET(type) ALARM_IOW(2, type, struct timespec)
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_SET_AND_WAIT(type) ALARM_IOW(3, type, struct timespec)
#define ANDROID_ALARM_GET_TIME(type) ALARM_IOW(4, type, struct timespec)
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_SET_RTC _IOW('a', 5, struct timespec)
#define ANDROID_ALARM_BASE_CMD(cmd) (cmd & ~(_IOC(0, 0, 0xf0, 0)))
-/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define ANDROID_ALARM_IOCTL_TO_TYPE(cmd) (_IOC_NR(cmd) >> 4)
#endif
+/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
diff --git a/libc/kernel/uapi/linux/time.h b/libc/kernel/uapi/linux/time.h
index bf245fc..5690d27 100644
--- a/libc/kernel/uapi/linux/time.h
+++ b/libc/kernel/uapi/linux/time.h
@@ -67,9 +67,10 @@ struct itimerval {
#define CLOCK_SGI_CYCLE 10
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
#define CLOCK_TAI 11
+#define CLOCK_POWEROFF_ALARM 12
#define MAX_CLOCKS 16
#define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC)
-#define CLOCKS_MONO CLOCK_MONOTONIC
/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */
+#define CLOCKS_MONO CLOCK_MONOTONIC
#define TIMER_ABSTIME 0x01
#endif
diff --git a/libc/upstream-netbsd/lib/libc/gen/popen.c b/libc/upstream-netbsd/lib/libc/gen/popen.c
index 593e346..b6ce47c 100644
--- a/libc/upstream-netbsd/lib/libc/gen/popen.c
+++ b/libc/upstream-netbsd/lib/libc/gen/popen.c
@@ -152,6 +152,8 @@ popen(const char *command, const char *type)
}
execl(_PATH_BSHELL, "sh", "-c", command, NULL);
+ if (errno == ENOENT)
+ execl(_PATH_BSHELL2, "sh", "-c", command, NULL);
_exit(127);
/* NOTREACHED */
}