diff options
Diffstat (limited to 'libc')
60 files changed, 3330 insertions, 1681 deletions
diff --git a/libc/Android.mk b/libc/Android.mk index f0c5e9f..f7f2adc 100644 --- a/libc/Android.mk +++ b/libc/Android.mk @@ -611,6 +611,10 @@ ifneq ($(BOARD_MALLOC_ALIGNMENT),) libc_common_cflags += -DMALLOC_ALIGNMENT=$(BOARD_MALLOC_ALIGNMENT) endif +ifeq ($(BOARD_USES_LEGACY_MMAP),true) + libc_common_cflags += -DLEGACY_MMAP +endif + # Define some common conlyflags libc_common_conlyflags := \ -std=gnu99 @@ -1394,6 +1398,9 @@ LOCAL_SRC_FILES_arm += \ LOCAL_ADDRESS_SANITIZER := false LOCAL_NATIVE_COVERAGE := $(bionic_coverage) +# Allow devices to provide additional symbols +LOCAL_WHOLE_STATIC_LIBRARIES += $(BOARD_PROVIDES_ADDITIONAL_BIONIC_STATIC_LIBS) + include $(BUILD_SHARED_LIBRARY) diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk index d72a160..c2b80c5 100644 --- a/libc/arch-arm/arm.mk +++ b/libc/arch-arm/arm.mk @@ -20,7 +20,6 @@ libc_freebsd_src_files_arm += \ upstream-freebsd/lib/libc/string/wmemmove.c \ libc_openbsd_src_files_arm += \ - upstream-openbsd/lib/libc/string/memchr.c \ upstream-openbsd/lib/libc/string/memrchr.c \ upstream-openbsd/lib/libc/string/stpncpy.c \ upstream-openbsd/lib/libc/string/strlcat.c \ @@ -52,7 +51,7 @@ ifeq ($(strip $(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)),) endif cpu_variant_mk := $(LOCAL_PATH)/arch-arm/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT)/$(TARGET_$(my_2nd_arch_prefix)CPU_VARIANT).mk ifeq ($(wildcard $(cpu_variant_mk)),) -$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, denver. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.") +$(error "TARGET_$(my_2nd_arch_prefix)CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait, scorpion, denver. 
Use generic for devices that do not have a CPU similar to any of the supported cpu variants.") endif include $(cpu_variant_mk) libc_common_additional_dependencies += $(cpu_variant_mk) diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S index a2e9c22..3692f04 100644 --- a/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S +++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 The Android Open Source Project + * Copyright (C) 2015 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,191 +26,7 @@ * SUCH DAMAGE. */ -#include <private/bionic_asm.h> -#include <private/libc_events.h> +// Indicate which memcpy base file to include. +#define MEMCPY_BASE "memcpy_base.S" - .syntax unified - - .thumb - .thumb_func - -// Get the length of src string, then get the source of the dst string. -// Check that the two lengths together don't exceed the threshold, then -// do a memcpy of the data. -ENTRY(__strcat_chk) - pld [r0, #0] - push {r0, lr} - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - push {r4, r5} - .cfi_adjust_cfa_offset 8 - .cfi_rel_offset r4, 0 - .cfi_rel_offset r5, 4 - - mov lr, r2 - - // Save the dst register to r5 - mov r5, r0 - - // Zero out r4 - eor r4, r4, r4 - - // r1 contains the address of the string to count. -.L_strlen_start: - mov r0, r1 - ands r3, r1, #7 - beq .L_mainloop - - // Align to a double word (64 bits). 
- rsb r3, r3, #8 - lsls ip, r3, #31 - beq .L_align_to_32 - - ldrb r2, [r1], #1 - cbz r2, .L_update_count_and_finish - -.L_align_to_32: - bcc .L_align_to_64 - ands ip, r3, #2 - beq .L_align_to_64 - - ldrb r2, [r1], #1 - cbz r2, .L_update_count_and_finish - ldrb r2, [r1], #1 - cbz r2, .L_update_count_and_finish - -.L_align_to_64: - tst r3, #4 - beq .L_mainloop - ldr r3, [r1], #4 - - sub ip, r3, #0x01010101 - bic ip, ip, r3 - ands ip, ip, #0x80808080 - bne .L_zero_in_second_register - - .p2align 2 -.L_mainloop: - ldrd r2, r3, [r1], #8 - - pld [r1, #64] - - sub ip, r2, #0x01010101 - bic ip, ip, r2 - ands ip, ip, #0x80808080 - bne .L_zero_in_first_register - - sub ip, r3, #0x01010101 - bic ip, ip, r3 - ands ip, ip, #0x80808080 - bne .L_zero_in_second_register - b .L_mainloop - -.L_update_count_and_finish: - sub r3, r1, r0 - sub r3, r3, #1 - b .L_finish - -.L_zero_in_first_register: - sub r3, r1, r0 - lsls r2, ip, #17 - bne .L_sub8_and_finish - bcs .L_sub7_and_finish - lsls ip, ip, #1 - bne .L_sub6_and_finish - - sub r3, r3, #5 - b .L_finish - -.L_sub8_and_finish: - sub r3, r3, #8 - b .L_finish - -.L_sub7_and_finish: - sub r3, r3, #7 - b .L_finish - -.L_sub6_and_finish: - sub r3, r3, #6 - b .L_finish - -.L_zero_in_second_register: - sub r3, r1, r0 - lsls r2, ip, #17 - bne .L_sub4_and_finish - bcs .L_sub3_and_finish - lsls ip, ip, #1 - bne .L_sub2_and_finish - - sub r3, r3, #1 - b .L_finish - -.L_sub4_and_finish: - sub r3, r3, #4 - b .L_finish - -.L_sub3_and_finish: - sub r3, r3, #3 - b .L_finish - -.L_sub2_and_finish: - sub r3, r3, #2 - -.L_finish: - cmp r4, #0 - bne .L_strlen_done - - // Time to get the dst string length. - mov r1, r5 - - // Save the original source address to r5. - mov r5, r0 - - // Save the current length (adding 1 for the terminator). - add r4, r3, #1 - b .L_strlen_start - - // r0 holds the pointer to the dst string. - // r3 holds the dst string length. - // r4 holds the src string length + 1. 
-.L_strlen_done: - add r2, r3, r4 - cmp r2, lr - bhi __strcat_chk_failed - - // Set up the registers for the memcpy code. - mov r1, r5 - pld [r1, #64] - mov r2, r4 - add r0, r0, r3 - pop {r4, r5} -END(__strcat_chk) - -#define MEMCPY_BASE __strcat_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned - -#include "memcpy_base.S" - -ENTRY_PRIVATE(__strcat_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - .cfi_adjust_cfa_offset 8 - .cfi_rel_offset r4, 0 - .cfi_rel_offset r5, 4 - - ldr r0, error_message - ldr r1, error_code -1: - add r0, pc - bl __fortify_chk_fail -error_code: - .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW -error_message: - .word error_string-(1b+4) -END(__strcat_chk_failed) - - .data -error_string: - .string "strcat: prevented write past end of buffer" +#include "__strcat_chk_common.S" diff --git a/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S new file mode 100644 index 0000000..de66967 --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/__strcat_chk_common.S @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <private/bionic_asm.h> +#include <private/libc_events.h> + + .syntax unified + + .thumb + .thumb_func + +// Get the length of src string, then get the source of the dst string. +// Check that the two lengths together don't exceed the threshold, then +// do a memcpy of the data. +ENTRY(__strcat_chk) + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + push {r4, r5} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + + mov lr, r2 + + // Save the dst register to r5 + mov r5, r0 + + // Zero out r4 + eor r4, r4, r4 + + // r1 contains the address of the string to count. +.L_strlen_start: + mov r0, r1 + ands r3, r1, #7 + beq .L_mainloop + + // Align to a double word (64 bits). 
+ rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_32: + bcc .L_align_to_64 + ands ip, r3, #2 + beq .L_align_to_64 + + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + ldrb r2, [r1], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r3, [r1], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + + .p2align 2 +.L_mainloop: + ldrd r2, r3, [r1], #8 + + pld [r1, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_update_count_and_finish: + sub r3, r1, r0 + sub r3, r3, #1 + b .L_finish + +.L_zero_in_first_register: + sub r3, r1, r0 + lsls r2, ip, #17 + bne .L_sub8_and_finish + bcs .L_sub7_and_finish + lsls ip, ip, #1 + bne .L_sub6_and_finish + + sub r3, r3, #5 + b .L_finish + +.L_sub8_and_finish: + sub r3, r3, #8 + b .L_finish + +.L_sub7_and_finish: + sub r3, r3, #7 + b .L_finish + +.L_sub6_and_finish: + sub r3, r3, #6 + b .L_finish + +.L_zero_in_second_register: + sub r3, r1, r0 + lsls r2, ip, #17 + bne .L_sub4_and_finish + bcs .L_sub3_and_finish + lsls ip, ip, #1 + bne .L_sub2_and_finish + + sub r3, r3, #1 + b .L_finish + +.L_sub4_and_finish: + sub r3, r3, #4 + b .L_finish + +.L_sub3_and_finish: + sub r3, r3, #3 + b .L_finish + +.L_sub2_and_finish: + sub r3, r3, #2 + +.L_finish: + cmp r4, #0 + bne .L_strlen_done + + // Time to get the dst string length. + mov r1, r5 + + // Save the original source address to r5. + mov r5, r0 + + // Save the current length (adding 1 for the terminator). + add r4, r3, #1 + b .L_strlen_start + + // r0 holds the pointer to the dst string. + // r3 holds the dst string length. + // r4 holds the src string length + 1. 
+.L_strlen_done: + add r2, r3, r4 + cmp r2, lr + bhi .L_strcat_chk_failed + + // Set up the registers for the memcpy code. + mov r1, r5 + pld [r1, #64] + mov r2, r4 + add r0, r0, r3 + pop {r4, r5} + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 + +#include MEMCPY_BASE + + // Undo the above cfi directives + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 +.L_strcat_chk_failed: + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) +END(__strcat_chk) + + .data +error_string: + .string "strcat: prevented write past end of buffer" diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S index db76686..d8cb3d9 100644 --- a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S +++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 The Android Open Source Project + * Copyright (C) 2015 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,155 +26,7 @@ * SUCH DAMAGE. */ -#include <private/bionic_asm.h> -#include <private/libc_events.h> +// Indicate which memcpy base file to include. +#define MEMCPY_BASE "memcpy_base.S" - .syntax unified - - .thumb - .thumb_func - -// Get the length of the source string first, then do a memcpy of the data -// instead of a strcpy. -ENTRY(__strcpy_chk) - pld [r0, #0] - push {r0, lr} - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - - mov lr, r2 - mov r0, r1 - - ands r3, r1, #7 - beq .L_mainloop - - // Align to a double word (64 bits). 
- rsb r3, r3, #8 - lsls ip, r3, #31 - beq .L_align_to_32 - - ldrb r2, [r0], #1 - cbz r2, .L_update_count_and_finish - -.L_align_to_32: - bcc .L_align_to_64 - ands ip, r3, #2 - beq .L_align_to_64 - - ldrb r2, [r0], #1 - cbz r2, .L_update_count_and_finish - ldrb r2, [r0], #1 - cbz r2, .L_update_count_and_finish - -.L_align_to_64: - tst r3, #4 - beq .L_mainloop - ldr r3, [r0], #4 - - sub ip, r3, #0x01010101 - bic ip, ip, r3 - ands ip, ip, #0x80808080 - bne .L_zero_in_second_register - - .p2align 2 -.L_mainloop: - ldrd r2, r3, [r0], #8 - - pld [r0, #64] - - sub ip, r2, #0x01010101 - bic ip, ip, r2 - ands ip, ip, #0x80808080 - bne .L_zero_in_first_register - - sub ip, r3, #0x01010101 - bic ip, ip, r3 - ands ip, ip, #0x80808080 - bne .L_zero_in_second_register - b .L_mainloop - -.L_update_count_and_finish: - sub r3, r0, r1 - sub r3, r3, #1 - b .L_check_size - -.L_zero_in_first_register: - sub r3, r0, r1 - lsls r2, ip, #17 - bne .L_sub8_and_finish - bcs .L_sub7_and_finish - lsls ip, ip, #1 - bne .L_sub6_and_finish - - sub r3, r3, #5 - b .L_check_size - -.L_sub8_and_finish: - sub r3, r3, #8 - b .L_check_size - -.L_sub7_and_finish: - sub r3, r3, #7 - b .L_check_size - -.L_sub6_and_finish: - sub r3, r3, #6 - b .L_check_size - -.L_zero_in_second_register: - sub r3, r0, r1 - lsls r2, ip, #17 - bne .L_sub4_and_finish - bcs .L_sub3_and_finish - lsls ip, ip, #1 - bne .L_sub2_and_finish - - sub r3, r3, #1 - b .L_check_size - -.L_sub4_and_finish: - sub r3, r3, #4 - b .L_check_size - -.L_sub3_and_finish: - sub r3, r3, #3 - b .L_check_size - -.L_sub2_and_finish: - sub r3, r3, #2 - -.L_check_size: - pld [r1, #0] - pld [r1, #64] - ldr r0, [sp] - cmp r3, lr - bhs __strcpy_chk_failed - - // Add 1 for copy length to get the string terminator. 
- add r2, r3, #1 -END(__strcpy_chk) - -#define MEMCPY_BASE __strcpy_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned -#include "memcpy_base.S" - -ENTRY_PRIVATE(__strcpy_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - - ldr r0, error_message - ldr r1, error_code -1: - add r0, pc - bl __fortify_chk_fail -error_code: - .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW -error_message: - .word error_string-(1b+4) -END(__strcpy_chk_failed) - - .data -error_string: - .string "strcpy: prevented write past end of buffer" +#include "__strcpy_chk_common.S" diff --git a/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S new file mode 100644 index 0000000..69ebcb4 --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/__strcpy_chk_common.S @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <private/bionic_asm.h> +#include <private/libc_events.h> + + .syntax unified + + .thumb + .thumb_func + +// Get the length of the source string first, then do a memcpy of the data +// instead of a strcpy. +ENTRY(__strcpy_chk) + pld [r0, #0] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + + mov lr, r2 + mov r0, r1 + + ands r3, r1, #7 + beq .L_mainloop + + // Align to a double word (64 bits). + rsb r3, r3, #8 + lsls ip, r3, #31 + beq .L_align_to_32 + + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_32: + bcc .L_align_to_64 + ands ip, r3, #2 + beq .L_align_to_64 + + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + ldrb r2, [r0], #1 + cbz r2, .L_update_count_and_finish + +.L_align_to_64: + tst r3, #4 + beq .L_mainloop + ldr r3, [r0], #4 + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + + .p2align 2 +.L_mainloop: + ldrd r2, r3, [r0], #8 + + pld [r0, #64] + + sub ip, r2, #0x01010101 + bic ip, ip, r2 + ands ip, ip, #0x80808080 + bne .L_zero_in_first_register + + sub ip, r3, #0x01010101 + bic ip, ip, r3 + ands ip, ip, #0x80808080 + bne .L_zero_in_second_register + b .L_mainloop + +.L_update_count_and_finish: + sub r3, r0, r1 + sub r3, r3, #1 + b .L_check_size + +.L_zero_in_first_register: + sub r3, r0, r1 + lsls r2, ip, #17 + bne .L_sub8_and_finish + bcs .L_sub7_and_finish + lsls ip, ip, #1 + 
bne .L_sub6_and_finish + + sub r3, r3, #5 + b .L_check_size + +.L_sub8_and_finish: + sub r3, r3, #8 + b .L_check_size + +.L_sub7_and_finish: + sub r3, r3, #7 + b .L_check_size + +.L_sub6_and_finish: + sub r3, r3, #6 + b .L_check_size + +.L_zero_in_second_register: + sub r3, r0, r1 + lsls r2, ip, #17 + bne .L_sub4_and_finish + bcs .L_sub3_and_finish + lsls ip, ip, #1 + bne .L_sub2_and_finish + + sub r3, r3, #1 + b .L_check_size + +.L_sub4_and_finish: + sub r3, r3, #4 + b .L_check_size + +.L_sub3_and_finish: + sub r3, r3, #3 + b .L_check_size + +.L_sub2_and_finish: + sub r3, r3, #2 + +.L_check_size: + pld [r1, #0] + pld [r1, #64] + ldr r0, [sp] + cmp r3, lr + bhs .L_strcpy_chk_failed + + // Add 1 for copy length to get the string terminator. + add r2, r3, #1 + +#include MEMCPY_BASE + +.L_strcpy_chk_failed: + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+4) +END(__strcpy_chk) + + .data +error_string: + .string "strcpy: prevented write past end of buffer" diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S index 410b663..537f3de 100644 --- a/libc/arch-arm/cortex-a15/bionic/memcpy.S +++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2008 The Android Open Source Project + * Copyright (C) 2015 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,79 +25,8 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -/* - * Copyright (c) 2013 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -// Prototype: void *memcpy (void *dst, const void *src, size_t count). - -#include <private/bionic_asm.h> -#include <private/libc_events.h> - - .text - .syntax unified - .fpu neon - -ENTRY(__memcpy_chk) - cmp r2, r3 - bhi __memcpy_chk_fail - - // Fall through to memcpy... -END(__memcpy_chk) - -ENTRY(memcpy) - pld [r1, #64] - push {r0, lr} - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 -END(memcpy) - -#define MEMCPY_BASE __memcpy_base -#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned -#include "memcpy_base.S" - -ENTRY_PRIVATE(__memcpy_chk_fail) - // Preserve lr for backtrace. 
- push {lr} - .cfi_def_cfa_offset 4 - .cfi_rel_offset lr, 0 - ldr r0, error_message - ldr r1, error_code -1: - add r0, pc - bl __fortify_chk_fail -error_code: - .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW -error_message: - .word error_string-(1b+8) -END(__memcpy_chk_fail) +// Indicate which memcpy base file to include. +#define MEMCPY_BASE "memcpy_base.S" - .data -error_string: - .string "memcpy: prevented write past end of buffer" +#include "memcpy_common.S" diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S index 2a73852..aac737d 100644 --- a/libc/arch-arm/cortex-a15/bionic/memcpy_base.S +++ b/libc/arch-arm/cortex-a15/bionic/memcpy_base.S @@ -53,11 +53,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -ENTRY_PRIVATE(MEMCPY_BASE) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - +.L_memcpy_base: // Assumes that n >= 0, and dst, src are valid pointers. // For any sizes less than 832 use the neon code that doesn't // care about the src alignment. This avoids any checks @@ -168,12 +164,6 @@ ENTRY_PRIVATE(MEMCPY_BASE) eor r3, r0, r1 ands r3, r3, #0x3 bne .L_copy_unknown_alignment -END(MEMCPY_BASE) - -ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 // To try and improve performance, stack layout changed, // i.e., not keeping the stack looking like users expect @@ -185,7 +175,7 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED) strd r6, r7, [sp, #-8]! .cfi_adjust_cfa_offset 8 .cfi_rel_offset r6, 0 - .cfi_rel_offset r7, 0 + .cfi_rel_offset r7, 4 strd r8, r9, [sp, #-8]! 
.cfi_adjust_cfa_offset 8 .cfi_rel_offset r8, 0 @@ -291,10 +281,28 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED) // Restore registers: optimized pop {r0, pc} ldrd r8, r9, [sp], #8 + .cfi_adjust_cfa_offset -8 + .cfi_restore r8 + .cfi_restore r9 ldrd r6, r7, [sp], #8 + .cfi_adjust_cfa_offset -8 + .cfi_restore r6 + .cfi_restore r7 ldrd r4, r5, [sp], #8 + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 pop {r0, pc} + // Put the cfi directives back for the below instructions. + .cfi_adjust_cfa_offset 24 + .cfi_rel_offset r4, 0 + .cfi_rel_offset r5, 4 + .cfi_rel_offset r6, 8 + .cfi_rel_offset r7, 12 + .cfi_rel_offset r8, 16 + .cfi_rel_offset r9, 20 + .L_dst_not_word_aligned: // Align dst to word. rsb ip, ip, #4 @@ -315,4 +323,12 @@ ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED) // Src is guaranteed to be at least word aligned by this point. b .L_word_aligned -END(MEMCPY_BASE_ALIGNED) + + // Undo any cfi directives from above. + .cfi_adjust_cfa_offset -24 + .cfi_restore r4 + .cfi_restore r5 + .cfi_restore r6 + .cfi_restore r7 + .cfi_restore r8 + .cfi_restore r9 diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy_common.S b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S new file mode 100644 index 0000000..464fb46 --- /dev/null +++ b/libc/arch-arm/cortex-a15/bionic/memcpy_common.S @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <private/bionic_asm.h> +#include <private/libc_events.h> + + .text + .syntax unified + .fpu neon + +ENTRY(__memcpy_chk) + cmp r2, r3 + bhi .L_memcpy_chk_fail + + // Fall through to memcpy... +END(__memcpy_chk) + +// Prototype: void *memcpy (void *dst, const void *src, size_t count). +ENTRY(memcpy) + pld [r1, #64] + push {r0, lr} + .cfi_def_cfa_offset 8 + .cfi_rel_offset r0, 0 + .cfi_rel_offset lr, 4 + +#include MEMCPY_BASE + + // Undo the cfi instructions from above. + .cfi_def_cfa_offset 0 + .cfi_restore r0 + .cfi_restore lr +.L_memcpy_chk_fail: + // Preserve lr for backtrace. + push {lr} + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset lr, 0 + + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+8) +END(memcpy) + + .data +error_string: + .string "memcpy: prevented write past end of buffer" diff --git a/libc/arch-arm/cortex-a15/bionic/strcat.S b/libc/arch-arm/cortex-a15/bionic/strcat.S index b95be94..157cc9f 100644 --- a/libc/arch-arm/cortex-a15/bionic/strcat.S +++ b/libc/arch-arm/cortex-a15/bionic/strcat.S @@ -70,7 +70,7 @@ .macro m_scan_byte ldrb r3, [r0] - cbz r3, strcat_r0_scan_done + cbz r3, .L_strcat_r0_scan_done add r0, #1 .endm // m_scan_byte @@ -84,10 +84,10 @@ ENTRY(strcat) // Quick check to see if src is empty. 
ldrb r2, [r1] pld [r1, #0] - cbnz r2, strcat_continue + cbnz r2, .L_strcat_continue bx lr -strcat_continue: +.L_strcat_continue: // To speed up really small dst strings, unroll checking the first 4 bytes. m_push m_scan_byte @@ -96,95 +96,102 @@ strcat_continue: m_scan_byte ands r3, r0, #7 - beq strcat_mainloop + beq .L_strcat_mainloop // Align to a double word (64 bits). rsb r3, r3, #8 lsls ip, r3, #31 - beq strcat_align_to_32 + beq .L_strcat_align_to_32 ldrb r5, [r0] - cbz r5, strcat_r0_scan_done + cbz r5, .L_strcat_r0_scan_done add r0, r0, #1 -strcat_align_to_32: - bcc strcat_align_to_64 +.L_strcat_align_to_32: + bcc .L_strcat_align_to_64 ldrb r2, [r0] - cbz r2, strcat_r0_scan_done + cbz r2, .L_strcat_r0_scan_done add r0, r0, #1 ldrb r4, [r0] - cbz r4, strcat_r0_scan_done + cbz r4, .L_strcat_r0_scan_done add r0, r0, #1 -strcat_align_to_64: +.L_strcat_align_to_64: tst r3, #4 - beq strcat_mainloop + beq .L_strcat_mainloop ldr r3, [r0], #4 sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcat_zero_in_second_register - b strcat_mainloop + bne .L_strcat_zero_in_second_register + b .L_strcat_mainloop -strcat_r0_scan_done: +.L_strcat_r0_scan_done: // For short copies, hard-code checking the first 8 bytes since this // new code doesn't win until after about 8 bytes. 
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue - -strcpy_finish: + m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r5, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r2, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=.L_strcpy_finish + m_copy_byte reg=r5, cmd=cbnz, label=.L_strcpy_continue + +.L_strcpy_finish: m_pop -strcpy_continue: +.L_strcpy_continue: ands r3, r0, #7 - beq strcpy_check_src_align + beq .L_strcpy_check_src_align // Align to a double word (64 bits). rsb r3, r3, #8 lsls ip, r3, #31 - beq strcpy_align_to_32 + beq .L_strcpy_align_to_32 ldrb r2, [r1], #1 strb r2, [r0], #1 - cbz r2, strcpy_complete + cbz r2, .L_strcpy_complete -strcpy_align_to_32: - bcc strcpy_align_to_64 +.L_strcpy_align_to_32: + bcc .L_strcpy_align_to_64 ldrb r2, [r1], #1 strb r2, [r0], #1 - cbz r2, strcpy_complete + cbz r2, .L_strcpy_complete ldrb r2, [r1], #1 strb r2, [r0], #1 - cbz r2, strcpy_complete + cbz r2, .L_strcpy_complete -strcpy_align_to_64: +.L_strcpy_align_to_64: tst r3, #4 - beq strcpy_check_src_align - ldr r2, [r1], #4 - - sub ip, r2, #0x01010101 - bic ip, ip, r2 - ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register - str r2, [r0], #4 + beq .L_strcpy_check_src_align + // Read one byte at a time since we don't know the src alignment + // and we don't want to read into a different page. 
+ ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .L_strcpy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .L_strcpy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .L_strcpy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .L_strcpy_complete -strcpy_check_src_align: +.L_strcpy_check_src_align: // At this point dst is aligned to a double word, check if src // is also aligned to a double word. ands r3, r1, #7 - bne strcpy_unaligned_copy + bne .L_strcpy_unaligned_copy .p2align 2 -strcpy_mainloop: +.L_strcpy_mainloop: ldrd r2, r3, [r1], #8 pld [r1, #64] @@ -192,128 +199,128 @@ strcpy_mainloop: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .L_strcpy_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .L_strcpy_zero_in_second_register strd r2, r3, [r0], #8 - b strcpy_mainloop + b .L_strcpy_mainloop -strcpy_complete: +.L_strcpy_complete: m_pop -strcpy_zero_in_first_register: +.L_strcpy_zero_in_first_register: lsls lr, ip, #17 - bne strcpy_copy1byte - bcs strcpy_copy2bytes + bne .L_strcpy_copy1byte + bcs .L_strcpy_copy2bytes lsls ip, ip, #1 - bne strcpy_copy3bytes + bne .L_strcpy_copy3bytes -strcpy_copy4bytes: +.L_strcpy_copy4bytes: // Copy 4 bytes to the destiniation. str r2, [r0] m_pop -strcpy_copy1byte: +.L_strcpy_copy1byte: strb r2, [r0] m_pop -strcpy_copy2bytes: +.L_strcpy_copy2bytes: strh r2, [r0] m_pop -strcpy_copy3bytes: +.L_strcpy_copy3bytes: strh r2, [r0], #2 lsr r2, #16 strb r2, [r0] m_pop -strcpy_zero_in_second_register: +.L_strcpy_zero_in_second_register: lsls lr, ip, #17 - bne strcpy_copy5bytes - bcs strcpy_copy6bytes + bne .L_strcpy_copy5bytes + bcs .L_strcpy_copy6bytes lsls ip, ip, #1 - bne strcpy_copy7bytes + bne .L_strcpy_copy7bytes // Copy 8 bytes to the destination. 
strd r2, r3, [r0] m_pop -strcpy_copy5bytes: +.L_strcpy_copy5bytes: str r2, [r0], #4 strb r3, [r0] m_pop -strcpy_copy6bytes: +.L_strcpy_copy6bytes: str r2, [r0], #4 strh r3, [r0] m_pop -strcpy_copy7bytes: +.L_strcpy_copy7bytes: str r2, [r0], #4 strh r3, [r0], #2 lsr r3, #16 strb r3, [r0] m_pop -strcpy_unaligned_copy: +.L_strcpy_unaligned_copy: // Dst is aligned to a double word, while src is at an unknown alignment. // There are 7 different versions of the unaligned copy code // to prevent overreading the src. The mainloop of every single version // will store 64 bits per loop. The difference is how much of src can // be read without potentially crossing a page boundary. tbb [pc, r3] -strcpy_unaligned_branchtable: +.L_strcpy_unaligned_branchtable: .byte 0 - .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign7 - .L_strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign6 - .L_strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign5 - .L_strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign4 - .L_strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign3 - .L_strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign2 - .L_strcpy_unaligned_branchtable)/2) + .byte ((.L_strcpy_unalign1 - .L_strcpy_unaligned_branchtable)/2) .p2align 2 // Can read 7 bytes before possibly crossing a page. 
-strcpy_unalign7: +.L_strcpy_unalign7: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .L_strcpy_zero_in_first_register ldrb r3, [r1] - cbz r3, strcpy_unalign7_copy5bytes + cbz r3, .L_strcpy_unalign7_copy5bytes ldrb r4, [r1, #1] - cbz r4, strcpy_unalign7_copy6bytes + cbz r4, .L_strcpy_unalign7_copy6bytes ldrb r5, [r1, #2] - cbz r5, strcpy_unalign7_copy7bytes + cbz r5, .L_strcpy_unalign7_copy7bytes ldr r3, [r1], #4 pld [r1, #64] lsrs ip, r3, #24 strd r2, r3, [r0], #8 - beq strcpy_unalign_return - b strcpy_unalign7 + beq .L_strcpy_unalign_return + b .L_strcpy_unalign7 -strcpy_unalign7_copy5bytes: +.L_strcpy_unalign7_copy5bytes: str r2, [r0], #4 strb r3, [r0] -strcpy_unalign_return: +.L_strcpy_unalign_return: m_pop -strcpy_unalign7_copy6bytes: +.L_strcpy_unalign7_copy6bytes: str r2, [r0], #4 strb r3, [r0], #1 strb r4, [r0], #1 m_pop -strcpy_unalign7_copy7bytes: +.L_strcpy_unalign7_copy7bytes: str r2, [r0], #4 strb r3, [r0], #1 strb r4, [r0], #1 @@ -322,41 +329,41 @@ strcpy_unalign7_copy7bytes: .p2align 2 // Can read 6 bytes before possibly crossing a page. -strcpy_unalign6: +.L_strcpy_unalign6: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .L_strcpy_zero_in_first_register ldrb r4, [r1] - cbz r4, strcpy_unalign_copy5bytes + cbz r4, .L_strcpy_unalign_copy5bytes ldrb r5, [r1, #1] - cbz r5, strcpy_unalign_copy6bytes + cbz r5, .L_strcpy_unalign_copy6bytes ldr r3, [r1], #4 pld [r1, #64] tst r3, #0xff0000 - beq strcpy_copy7bytes + beq .L_strcpy_copy7bytes lsrs ip, r3, #24 strd r2, r3, [r0], #8 - beq strcpy_unalign_return - b strcpy_unalign6 + beq .L_strcpy_unalign_return + b .L_strcpy_unalign6 .p2align 2 // Can read 5 bytes before possibly crossing a page. 
-strcpy_unalign5: +.L_strcpy_unalign5: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .L_strcpy_zero_in_first_register ldrb r4, [r1] - cbz r4, strcpy_unalign_copy5bytes + cbz r4, .L_strcpy_unalign_copy5bytes ldr r3, [r1], #4 @@ -365,17 +372,17 @@ strcpy_unalign5: sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .L_strcpy_zero_in_second_register strd r2, r3, [r0], #8 - b strcpy_unalign5 + b .L_strcpy_unalign5 -strcpy_unalign_copy5bytes: +.L_strcpy_unalign_copy5bytes: str r2, [r0], #4 strb r4, [r0] m_pop -strcpy_unalign_copy6bytes: +.L_strcpy_unalign_copy6bytes: str r2, [r0], #4 strb r4, [r0], #1 strb r5, [r0] @@ -383,13 +390,13 @@ strcpy_unalign_copy6bytes: .p2align 2 // Can read 4 bytes before possibly crossing a page. -strcpy_unalign4: +.L_strcpy_unalign4: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .L_strcpy_zero_in_first_register ldr r3, [r1], #4 pld [r1, #64] @@ -397,20 +404,20 @@ strcpy_unalign4: sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .L_strcpy_zero_in_second_register strd r2, r3, [r0], #8 - b strcpy_unalign4 + b .L_strcpy_unalign4 .p2align 2 // Can read 3 bytes before possibly crossing a page. 
-strcpy_unalign3: +.L_strcpy_unalign3: ldrb r2, [r1] - cbz r2, strcpy_unalign3_copy1byte + cbz r2, .L_strcpy_unalign3_copy1byte ldrb r3, [r1, #1] - cbz r3, strcpy_unalign3_copy2bytes + cbz r3, .L_strcpy_unalign3_copy2bytes ldrb r4, [r1, #2] - cbz r4, strcpy_unalign3_copy3bytes + cbz r4, .L_strcpy_unalign3_copy3bytes ldr r2, [r1], #4 ldr r3, [r1], #4 @@ -418,26 +425,26 @@ strcpy_unalign3: pld [r1, #64] lsrs lr, r2, #24 - beq strcpy_copy4bytes + beq .L_strcpy_copy4bytes sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .L_strcpy_zero_in_second_register strd r2, r3, [r0], #8 - b strcpy_unalign3 + b .L_strcpy_unalign3 -strcpy_unalign3_copy1byte: +.L_strcpy_unalign3_copy1byte: strb r2, [r0] m_pop -strcpy_unalign3_copy2bytes: +.L_strcpy_unalign3_copy2bytes: strb r2, [r0], #1 strb r3, [r0] m_pop -strcpy_unalign3_copy3bytes: +.L_strcpy_unalign3_copy3bytes: strb r2, [r0], #1 strb r3, [r0], #1 strb r4, [r0] @@ -445,34 +452,34 @@ strcpy_unalign3_copy3bytes: .p2align 2 // Can read 2 bytes before possibly crossing a page. -strcpy_unalign2: +.L_strcpy_unalign2: ldrb r2, [r1] - cbz r2, strcpy_unalign_copy1byte + cbz r2, .L_strcpy_unalign_copy1byte ldrb r4, [r1, #1] - cbz r4, strcpy_unalign_copy2bytes + cbz r4, .L_strcpy_unalign_copy2bytes ldr r2, [r1], #4 ldr r3, [r1], #4 pld [r1, #64] tst r2, #0xff0000 - beq strcpy_copy3bytes + beq .L_strcpy_copy3bytes lsrs ip, r2, #24 - beq strcpy_copy4bytes + beq .L_strcpy_copy4bytes sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .L_strcpy_zero_in_second_register strd r2, r3, [r0], #8 - b strcpy_unalign2 + b .L_strcpy_unalign2 .p2align 2 // Can read 1 byte before possibly crossing a page. 
-strcpy_unalign1: +.L_strcpy_unalign1: ldrb r2, [r1] - cbz r2, strcpy_unalign_copy1byte + cbz r2, .L_strcpy_unalign_copy1byte ldr r2, [r1], #4 ldr r3, [r1], #4 @@ -482,27 +489,27 @@ strcpy_unalign1: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .L_strcpy_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .L_strcpy_zero_in_second_register strd r2, r3, [r0], #8 - b strcpy_unalign1 + b .L_strcpy_unalign1 -strcpy_unalign_copy1byte: +.L_strcpy_unalign_copy1byte: strb r2, [r0] m_pop -strcpy_unalign_copy2bytes: +.L_strcpy_unalign_copy2bytes: strb r2, [r0], #1 strb r4, [r0] m_pop .p2align 2 -strcat_mainloop: +.L_strcat_mainloop: ldrd r2, r3, [r0], #8 pld [r0, #64] @@ -510,59 +517,59 @@ strcat_mainloop: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcat_zero_in_first_register + bne .L_strcat_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcat_zero_in_second_register - b strcat_mainloop + bne .L_strcat_zero_in_second_register + b .L_strcat_mainloop -strcat_zero_in_first_register: +.L_strcat_zero_in_first_register: // Prefetch the src now, it's going to be used soon. pld [r1, #0] lsls lr, ip, #17 - bne strcat_sub8 - bcs strcat_sub7 + bne .L_strcat_sub8 + bcs .L_strcat_sub7 lsls ip, ip, #1 - bne strcat_sub6 + bne .L_strcat_sub6 sub r0, r0, #5 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_sub8: +.L_strcat_sub8: sub r0, r0, #8 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_sub7: +.L_strcat_sub7: sub r0, r0, #7 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_sub6: +.L_strcat_sub6: sub r0, r0, #6 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_zero_in_second_register: +.L_strcat_zero_in_second_register: // Prefetch the src now, it's going to be used soon. 
pld [r1, #0] lsls lr, ip, #17 - bne strcat_sub4 - bcs strcat_sub3 + bne .L_strcat_sub4 + bcs .L_strcat_sub3 lsls ip, ip, #1 - bne strcat_sub2 + bne .L_strcat_sub2 sub r0, r0, #1 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_sub4: +.L_strcat_sub4: sub r0, r0, #4 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_sub3: +.L_strcat_sub3: sub r0, r0, #3 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done -strcat_sub2: +.L_strcat_sub2: sub r0, r0, #2 - b strcat_r0_scan_done + b .L_strcat_r0_scan_done END(strcat) diff --git a/libc/arch-arm/cortex-a15/bionic/string_copy.S b/libc/arch-arm/cortex-a15/bionic/string_copy.S index 20f0e91..92d1c98 100644 --- a/libc/arch-arm/cortex-a15/bionic/string_copy.S +++ b/libc/arch-arm/cortex-a15/bionic/string_copy.S @@ -149,13 +149,20 @@ ENTRY(strcpy) .Lstringcopy_align_to_64: tst r3, #4 beq .Lstringcopy_check_src_align - ldr r2, [r1], #4 - - sub ip, r2, #0x01010101 - bic ip, ip, r2 - ands ip, ip, #0x80808080 - bne .Lstringcopy_zero_in_first_register - str r2, [r0], #4 + // Read one byte at a time since we don't have any idea about the alignment + // of the source and we don't want to read into a different page. + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete .Lstringcopy_check_src_align: // At this point dst is aligned to a double word, check if src diff --git a/libc/arch-arm/cortex-a15/bionic/strlen.S b/libc/arch-arm/cortex-a15/bionic/strlen.S index 9a0ce62..4fd6284 100644 --- a/libc/arch-arm/cortex-a15/bionic/strlen.S +++ b/libc/arch-arm/cortex-a15/bionic/strlen.S @@ -65,38 +65,38 @@ ENTRY(strlen) mov r1, r0 ands r3, r0, #7 - beq mainloop + beq .L_mainloop // Align to a double word (64 bits). 
rsb r3, r3, #8 lsls ip, r3, #31 - beq align_to_32 + beq .L_align_to_32 ldrb r2, [r1], #1 - cbz r2, update_count_and_return + cbz r2, .L_update_count_and_return -align_to_32: - bcc align_to_64 +.L_align_to_32: + bcc .L_align_to_64 ands ip, r3, #2 - beq align_to_64 + beq .L_align_to_64 ldrb r2, [r1], #1 - cbz r2, update_count_and_return + cbz r2, .L_update_count_and_return ldrb r2, [r1], #1 - cbz r2, update_count_and_return + cbz r2, .L_update_count_and_return -align_to_64: +.L_align_to_64: tst r3, #4 - beq mainloop + beq .L_mainloop ldr r3, [r1], #4 sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne zero_in_second_register + bne .L_zero_in_second_register .p2align 2 -mainloop: +.L_mainloop: ldrd r2, r3, [r1], #8 pld [r1, #64] @@ -104,62 +104,62 @@ mainloop: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne zero_in_first_register + bne .L_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne zero_in_second_register - b mainloop + bne .L_zero_in_second_register + b .L_mainloop -update_count_and_return: +.L_update_count_and_return: sub r0, r1, r0 sub r0, r0, #1 bx lr -zero_in_first_register: +.L_zero_in_first_register: sub r0, r1, r0 lsls r3, ip, #17 - bne sub8_and_return - bcs sub7_and_return + bne .L_sub8_and_return + bcs .L_sub7_and_return lsls ip, ip, #1 - bne sub6_and_return + bne .L_sub6_and_return sub r0, r0, #5 bx lr -sub8_and_return: +.L_sub8_and_return: sub r0, r0, #8 bx lr -sub7_and_return: +.L_sub7_and_return: sub r0, r0, #7 bx lr -sub6_and_return: +.L_sub6_and_return: sub r0, r0, #6 bx lr -zero_in_second_register: +.L_zero_in_second_register: sub r0, r1, r0 lsls r3, ip, #17 - bne sub4_and_return - bcs sub3_and_return + bne .L_sub4_and_return + bcs .L_sub3_and_return lsls ip, ip, #1 - bne sub2_and_return + bne .L_sub2_and_return sub r0, r0, #1 bx lr -sub4_and_return: +.L_sub4_and_return: sub r0, r0, #4 bx lr -sub3_and_return: +.L_sub3_and_return: sub r0, r0, #3 bx lr 
-sub2_and_return: +.L_sub2_and_return: sub r0, r0, #2 bx lr END(strlen) diff --git a/libc/arch-arm/cortex-a15/cortex-a15.mk b/libc/arch-arm/cortex-a15/cortex-a15.mk index 6fa3270..202a3bf 100644 --- a/libc/arch-arm/cortex-a15/cortex-a15.mk +++ b/libc/arch-arm/cortex-a15/cortex-a15.mk @@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \ arch-arm/cortex-a15/bionic/strlen.S \ libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ arch-arm/generic/bionic/memcmp.S \ libc_bionic_src_files_arm += \ diff --git a/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk new file mode 100644 index 0000000..5d7efc6 --- /dev/null +++ b/libc/arch-arm/cortex-a53.a57/cortex-a53.a57.mk @@ -0,0 +1,22 @@ +# This file represents the best optimized routines that are the middle +# ground when running on a big/little system that is cortex-a57/cortex-a53. +# The cortex-a7 optimized routines, and the cortex-a53 optimized routines +# decrease performance on cortex-a57 processors by as much as 20%. + +libc_bionic_src_files_arm += \ + arch-arm/cortex-a15/bionic/memcpy.S \ + arch-arm/cortex-a15/bionic/memset.S \ + arch-arm/cortex-a15/bionic/stpcpy.S \ + arch-arm/cortex-a15/bionic/strcat.S \ + arch-arm/cortex-a15/bionic/__strcat_chk.S \ + arch-arm/cortex-a15/bionic/strcmp.S \ + arch-arm/cortex-a15/bionic/strcpy.S \ + arch-arm/cortex-a15/bionic/__strcpy_chk.S \ + arch-arm/cortex-a15/bionic/strlen.S \ + +libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memcmp.S \ + arch-arm/generic/bionic/memchr.S + +libc_bionic_src_files_arm += \ + arch-arm/denver/bionic/memmove.S \ diff --git a/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S new file mode 100644 index 0000000..c5bc98a --- /dev/null +++ b/libc/arch-arm/cortex-a53/bionic/__strcat_chk.S @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +// Indicate which memcpy base file to include. +#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S" + +#include "arch-arm/cortex-a15/bionic/__strcat_chk_common.S" diff --git a/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S new file mode 100644 index 0000000..1f8945d --- /dev/null +++ b/libc/arch-arm/cortex-a53/bionic/__strcpy_chk.S @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +// Indicate which memcpy base file to include. +#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S" + +#include "arch-arm/cortex-a15/bionic/__strcpy_chk_common.S" diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy.S b/libc/arch-arm/cortex-a53/bionic/memcpy.S new file mode 100644 index 0000000..664f574 --- /dev/null +++ b/libc/arch-arm/cortex-a53/bionic/memcpy.S @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +// Indicate which memcpy base file to include. +#define MEMCPY_BASE "arch-arm/cortex-a53/bionic/memcpy_base.S" + +#include "arch-arm/cortex-a15/bionic/memcpy_common.S" diff --git a/libc/arch-arm/cortex-a53/bionic/memcpy_base.S b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S new file mode 100644 index 0000000..2749fc8 --- /dev/null +++ b/libc/arch-arm/cortex-a53/bionic/memcpy_base.S @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 2013 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +.L_memcpy_base: + // Assumes that n >= 0, and dst, src are valid pointers. + cmp r2, #16 + blo .L_copy_less_than_16_unknown_align + +.L_copy_unknown_alignment: + // Unknown alignment of src and dst. + // Assumes that the first few bytes have already been prefetched. + + // Align destination to 128 bits. The mainloop store instructions + // require this alignment or they will throw an exception. + rsb r3, r0, #0 + ands r3, r3, #0xF + beq 2f + + // Copy up to 15 bytes (count in r3). + sub r2, r2, r3 + movs ip, r3, lsl #31 + + itt mi + ldrbmi lr, [r1], #1 + strbmi lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1], #1 + strbcs ip, [r0], #1 + strbcs lr, [r0], #1 + + movs ip, r3, lsl #29 + bge 1f + // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after. + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! +1: bcc 2f + // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after. + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0, :64]! + +2: // Make sure we have at least 64 bytes to copy. 
+ subs r2, r2, #64 + blo 2f + +1: // The main loop copies 64 bytes at a time. + vld1.8 {d0 - d3}, [r1]! + vld1.8 {d4 - d7}, [r1]! + subs r2, r2, #64 + vstmia r0!, {d0 - d7} + pld [r1, #(64*10)] + bhs 1b + +2: // Fix-up the remaining count and make sure we have >= 32 bytes left. + adds r2, r2, #32 + blo 3f + + // 32 bytes. These cache lines were already preloaded. + vld1.8 {d0 - d3}, [r1]! + sub r2, r2, #32 + vst1.8 {d0 - d3}, [r0, :128]! +3: // Less than 32 left. + add r2, r2, #32 + tst r2, #0x10 + beq .L_copy_less_than_16_unknown_align + // Copies 16 bytes, destination 128 bits aligned. + vld1.8 {d0, d1}, [r1]! + vst1.8 {d0, d1}, [r0, :128]! + +.L_copy_less_than_16_unknown_align: + // Copy up to 15 bytes (count in r2). + movs ip, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: bge 2f + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! + +2: // Copy 0 to 4 bytes. + lsls r2, r2, #31 + itt ne + ldrbne lr, [r1], #1 + strbne lr, [r0], #1 + itttt cs + ldrbcs ip, [r1], #1 + ldrbcs lr, [r1] + strbcs ip, [r0], #1 + strbcs lr, [r0] + + pop {r0, pc} diff --git a/libc/arch-arm/cortex-a53/cortex-a53.mk b/libc/arch-arm/cortex-a53/cortex-a53.mk index b5c337c..14aaa71 100644 --- a/libc/arch-arm/cortex-a53/cortex-a53.mk +++ b/libc/arch-arm/cortex-a53/cortex-a53.mk @@ -1 +1,21 @@ -include bionic/libc/arch-arm/cortex-a7/cortex-a7.mk +libc_bionic_src_files_arm += \ + arch-arm/cortex-a53/bionic/memcpy.S \ + arch-arm/cortex-a53/bionic/__strcat_chk.S \ + arch-arm/cortex-a53/bionic/__strcpy_chk.S \ + +libc_bionic_src_files_arm += \ + arch-arm/cortex-a7/bionic/memset.S \ + +libc_bionic_src_files_arm += \ + arch-arm/cortex-a15/bionic/stpcpy.S \ + arch-arm/cortex-a15/bionic/strcat.S \ + arch-arm/cortex-a15/bionic/strcmp.S \ + arch-arm/cortex-a15/bionic/strcpy.S \ + arch-arm/cortex-a15/bionic/strlen.S \ + +libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ + arch-arm/generic/bionic/memcmp.S \ + 
+libc_bionic_src_files_arm += \ + arch-arm/denver/bionic/memmove.S \ diff --git a/libc/arch-arm/cortex-a7/bionic/memset.S b/libc/arch-arm/cortex-a7/bionic/memset.S new file mode 100644 index 0000000..6365b06 --- /dev/null +++ b/libc/arch-arm/cortex-a7/bionic/memset.S @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <machine/cpu-features.h> +#include <private/bionic_asm.h> +#include <private/libc_events.h> + + /* + * Optimized memset() for ARM. + * + * memset() returns its first argument. 
+ */ + + .fpu neon + .syntax unified + +ENTRY(__memset_chk) + cmp r2, r3 + bls .L_done + + // Preserve lr for backtrace. + push {lr} + .cfi_def_cfa_offset 4 + .cfi_rel_offset lr, 0 + + ldr r0, error_message + ldr r1, error_code +1: + add r0, pc + bl __fortify_chk_fail +error_code: + .word BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW +error_message: + .word error_string-(1b+8) +END(__memset_chk) + +ENTRY(bzero) + mov r2, r1 + mov r1, #0 +.L_done: + // Fall through to memset... +END(bzero) + +ENTRY(memset) + mov r3, r0 + // At this point only d0, d1 are going to be used below. + vdup.8 q0, r1 + cmp r2, #16 + blo .L_set_less_than_16_unknown_align + +.L_check_alignment: + // Align destination to a double word to avoid the store crossing + // a cache line boundary. + ands ip, r3, #7 + bne .L_do_double_word_align + +.L_double_word_aligned: + // Duplicate since the less than 64 can use d2, d3. + vmov q1, q0 + subs r2, #64 + blo .L_set_less_than_64 + + // Duplicate the copy value so that we can store 64 bytes at a time. + vmov q2, q0 + vmov q3, q0 + +1: // Main loop stores 64 bytes at a time. + subs r2, #64 + vstmia r3!, {d0 - d7} + bge 1b + +.L_set_less_than_64: + // Restore r2 to the count of bytes left to set. + add r2, #64 + lsls ip, r2, #27 + bcc .L_set_less_than_32 + // Set 32 bytes. + vstmia r3!, {d0 - d3} + +.L_set_less_than_32: + bpl .L_set_less_than_16 + // Set 16 bytes. + vstmia r3!, {d0, d1} + +.L_set_less_than_16: + // Less than 16 bytes to set. + lsls ip, r2, #29 + bcc .L_set_less_than_8 + + // Set 8 bytes. + vstmia r3!, {d0} + +.L_set_less_than_8: + bpl .L_set_less_than_4 + // Set 4 bytes + vst1.32 {d0[0]}, [r3]! + +.L_set_less_than_4: + lsls ip, r2, #31 + it ne + strbne r1, [r3], #1 + itt cs + strbcs r1, [r3], #1 + strbcs r1, [r3] + bx lr + +.L_do_double_word_align: + rsb ip, ip, #8 + sub r2, r2, ip + + // Do this comparison now, otherwise we'll need to save a + // register to the stack since we've used all available + // registers. 
+ cmp ip, #4 + blo 1f + + // Need to do a four byte copy. + movs ip, ip, lsl #31 + it mi + strbmi r1, [r3], #1 + itt cs + strbcs r1, [r3], #1 + strbcs r1, [r3], #1 + vst1.32 {d0[0]}, [r3]! + b .L_double_word_aligned + +1: + // No four byte copy. + movs ip, ip, lsl #31 + it mi + strbmi r1, [r3], #1 + itt cs + strbcs r1, [r3], #1 + strbcs r1, [r3], #1 + b .L_double_word_aligned + +.L_set_less_than_16_unknown_align: + // Set up to 15 bytes. + movs ip, r2, lsl #29 + bcc 1f + vst1.8 {d0}, [r3]! +1: bge 2f + vst1.32 {d0[0]}, [r3]! +2: movs ip, r2, lsl #31 + it mi + strbmi r1, [r3], #1 + itt cs + strbcs r1, [r3], #1 + strbcs r1, [r3], #1 + bx lr +END(memset) + + .data +error_string: + .string "memset: prevented write past end of buffer" diff --git a/libc/arch-arm/cortex-a7/cortex-a7.mk b/libc/arch-arm/cortex-a7/cortex-a7.mk index 9af03d9..3629a57 100644 --- a/libc/arch-arm/cortex-a7/cortex-a7.mk +++ b/libc/arch-arm/cortex-a7/cortex-a7.mk @@ -1 +1,19 @@ -include bionic/libc/arch-arm/cortex-a15/cortex-a15.mk +libc_bionic_src_files_arm += \ + arch-arm/cortex-a7/bionic/memset.S \ + +libc_bionic_src_files_arm += \ + arch-arm/cortex-a15/bionic/memcpy.S \ + arch-arm/cortex-a15/bionic/stpcpy.S \ + arch-arm/cortex-a15/bionic/strcat.S \ + arch-arm/cortex-a15/bionic/__strcat_chk.S \ + arch-arm/cortex-a15/bionic/strcmp.S \ + arch-arm/cortex-a15/bionic/strcpy.S \ + arch-arm/cortex-a15/bionic/__strcpy_chk.S \ + arch-arm/cortex-a15/bionic/strlen.S \ + +libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ + arch-arm/generic/bionic/memcmp.S \ + +libc_bionic_src_files_arm += \ + arch-arm/denver/bionic/memmove.S \ diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S index 5e81305..6ab5a69 100644 --- a/libc/arch-arm/cortex-a9/bionic/memcpy_base.S +++ b/libc/arch-arm/cortex-a9/bionic/memcpy_base.S @@ -133,8 +133,7 @@ ENTRY_PRIVATE(MEMCPY_BASE) strbcs ip, [r0], #1 strbcs lr, [r0], #1 - ldmfd sp!, {r0, lr} - bx lr + ldmfd sp!, 
{r0, pc} END(MEMCPY_BASE) ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED) diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S index 8ee6ac2..b39fcc4 100644 --- a/libc/arch-arm/cortex-a9/bionic/memset.S +++ b/libc/arch-arm/cortex-a9/bionic/memset.S @@ -69,12 +69,9 @@ END(bzero) ENTRY(memset) // The neon memset only wins for less than 132. cmp r2, #132 - bhi __memset_large_copy - - stmfd sp!, {r0} - .cfi_def_cfa_offset 4 - .cfi_rel_offset r0, 0 + bhi .L_memset_large_copy + mov r3, r0 vdup.8 q0, r1 /* make sure we have at least 32 bytes to write */ @@ -84,7 +81,7 @@ ENTRY(memset) 1: /* The main loop writes 32 bytes at a time */ subs r2, r2, #32 - vst1.8 {d0 - d3}, [r0]! + vst1.8 {d0 - d3}, [r3]! bhs 1b 2: /* less than 32 left */ @@ -93,22 +90,20 @@ ENTRY(memset) beq 3f // writes 16 bytes, 128-bits aligned - vst1.8 {d0, d1}, [r0]! + vst1.8 {d0, d1}, [r3]! 3: /* write up to 15-bytes (count in r2) */ movs ip, r2, lsl #29 bcc 1f - vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r3]! 1: bge 2f - vst1.32 {d0[0]}, [r0]! + vst1.32 {d0[0]}, [r3]! 
2: movs ip, r2, lsl #31 - strbmi r1, [r0], #1 - strbcs r1, [r0], #1 - strbcs r1, [r0], #1 - ldmfd sp!, {r0} + strbmi r1, [r3], #1 + strbcs r1, [r3], #1 + strbcs r1, [r3], #1 bx lr -END(memset) -ENTRY_PRIVATE(__memset_large_copy) +.L_memset_large_copy: /* compute the offset to align the destination * offset = (4-(src&3))&3 = -src & 3 */ @@ -136,8 +131,7 @@ ENTRY_PRIVATE(__memset_large_copy) strbcs r1, [r0], #1 strbmi r1, [r0], #1 subs r2, r2, r3 - popls {r0, r4-r7, lr} /* return */ - bxls lr + popls {r0, r4-r7, pc} /* return */ /* align the destination to a cache-line */ mov r12, r1 @@ -180,9 +174,8 @@ ENTRY_PRIVATE(__memset_large_copy) strhmi r1, [r0], #2 movs r2, r2, lsl #2 strbcs r1, [r0] - ldmfd sp!, {r0, r4-r7, lr} - bx lr -END(__memset_large_copy) + ldmfd sp!, {r0, r4-r7, pc} +END(memset) .data error_string: diff --git a/libc/arch-arm/cortex-a9/bionic/strcat.S b/libc/arch-arm/cortex-a9/bionic/strcat.S index f5a855e..9077a74 100644 --- a/libc/arch-arm/cortex-a9/bionic/strcat.S +++ b/libc/arch-arm/cortex-a9/bionic/strcat.S @@ -70,7 +70,7 @@ .macro m_scan_byte ldrb r3, [r0] - cbz r3, strcat_r0_scan_done + cbz r3, .Lstrcat_r0_scan_done add r0, #1 .endm // m_scan_byte @@ -84,10 +84,10 @@ ENTRY(strcat) // Quick check to see if src is empty. ldrb r2, [r1] pld [r1, #0] - cbnz r2, strcat_continue + cbnz r2, .Lstrcat_continue bx lr -strcat_continue: +.Lstrcat_continue: // To speed up really small dst strings, unroll checking the first 4 bytes. 
m_push m_scan_byte @@ -96,10 +96,10 @@ strcat_continue: m_scan_byte ands r3, r0, #7 - bne strcat_align_src + bne .Lstrcat_align_src .p2align 2 -strcat_mainloop: +.Lstrcat_mainloop: ldmia r0!, {r2, r3} pld [r0, #64] @@ -107,28 +107,28 @@ strcat_mainloop: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcat_zero_in_first_register + bne .Lstrcat_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcat_zero_in_second_register - b strcat_mainloop + bne .Lstrcat_zero_in_second_register + b .Lstrcat_mainloop -strcat_zero_in_first_register: +.Lstrcat_zero_in_first_register: sub r0, r0, #4 -strcat_zero_in_second_register: +.Lstrcat_zero_in_second_register: // Check for zero in byte 0. tst ip, #0x80 it ne subne r0, r0, #4 - bne strcat_r0_scan_done + bne .Lstrcat_r0_scan_done // Check for zero in byte 1. tst ip, #0x8000 it ne subne r0, r0, #3 - bne strcat_r0_scan_done + bne .Lstrcat_r0_scan_done // Check for zero in byte 2. tst ip, #0x800000 it ne @@ -137,33 +137,33 @@ strcat_zero_in_second_register: // Zero is in byte 3. subeq r0, r0, #1 -strcat_r0_scan_done: +.Lstrcat_r0_scan_done: // Unroll the first 8 bytes that will be copied. 
- m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r5, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r2, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r3, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r4, cmd=cbz, label=strcpy_finish - m_copy_byte reg=r5, cmd=cbnz, label=strcpy_continue - -strcpy_finish: + m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r5, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r2, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r3, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r4, cmd=cbz, label=.Lstrcpy_finish + m_copy_byte reg=r5, cmd=cbnz, label=.Lstrcpy_continue + +.Lstrcpy_finish: m_ret inst=pop -strcpy_continue: +.Lstrcpy_continue: pld [r1, #0] ands r3, r0, #7 - bne strcpy_align_dst + bne .Lstrcpy_align_dst -strcpy_check_src_align: +.Lstrcpy_check_src_align: // At this point dst is aligned to a double word, check if src // is also aligned to a double word. 
ands r3, r1, #7 - bne strcpy_unaligned_copy + bne .Lstrcpy_unaligned_copy .p2align 2 -strcpy_mainloop: +.Lstrcpy_mainloop: ldmia r1!, {r2, r3} pld [r1, #64] @@ -171,17 +171,17 @@ strcpy_mainloop: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .Lstrcpy_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .Lstrcpy_zero_in_second_register stmia r0!, {r2, r3} - b strcpy_mainloop + b .Lstrcpy_mainloop -strcpy_zero_in_first_register: +.Lstrcpy_zero_in_first_register: lsls lr, ip, #17 itt ne strbne r2, [r0] @@ -198,7 +198,7 @@ strcpy_zero_in_first_register: strb r3, [r0] m_ret inst=pop -strcpy_zero_in_second_register: +.Lstrcpy_zero_in_second_register: lsls lr, ip, #17 ittt ne stmiane r0!, {r2} @@ -218,18 +218,18 @@ strcpy_zero_in_second_register: strb r4, [r0] m_ret inst=pop -strcpy_align_dst: +.Lstrcpy_align_dst: // Align to a double word (64 bits). rsb r3, r3, #8 lsls ip, r3, #31 - beq strcpy_align_to_32 + beq .Lstrcpy_align_to_32 ldrb r2, [r1], #1 strb r2, [r0], #1 - cbz r2, strcpy_complete + cbz r2, .Lstrcpy_complete -strcpy_align_to_32: - bcc strcpy_align_to_64 +.Lstrcpy_align_to_32: + bcc .Lstrcpy_align_to_64 ldrb r4, [r1], #1 strb r4, [r0], #1 @@ -242,76 +242,83 @@ strcpy_align_to_32: it eq m_ret inst=popeq -strcpy_align_to_64: +.Lstrcpy_align_to_64: tst r3, #4 - beq strcpy_check_src_align - ldr r2, [r1], #4 - - sub ip, r2, #0x01010101 - bic ip, ip, r2 - ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register - stmia r0!, {r2} - b strcpy_check_src_align + beq .Lstrcpy_check_src_align + // Read one byte at a time since we don't know the src alignment + // and we don't want to read into a different page. 
+ ldrb r4, [r1], #1 + strb r4, [r0], #1 + cbz r4, .Lstrcpy_complete + ldrb r5, [r1], #1 + strb r5, [r0], #1 + cbz r5, .Lstrcpy_complete + ldrb r4, [r1], #1 + strb r4, [r0], #1 + cbz r4, .Lstrcpy_complete + ldrb r5, [r1], #1 + strb r5, [r0], #1 + cbz r5, .Lstrcpy_complete + b .Lstrcpy_check_src_align -strcpy_complete: +.Lstrcpy_complete: m_ret inst=pop -strcpy_unaligned_copy: +.Lstrcpy_unaligned_copy: // Dst is aligned to a double word, while src is at an unknown alignment. // There are 7 different versions of the unaligned copy code // to prevent overreading the src. The mainloop of every single version // will store 64 bits per loop. The difference is how much of src can // be read without potentially crossing a page boundary. tbb [pc, r3] -strcpy_unaligned_branchtable: +.Lstrcpy_unaligned_branchtable: .byte 0 - .byte ((strcpy_unalign7 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign6 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign5 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign4 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign3 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign2 - strcpy_unaligned_branchtable)/2) - .byte ((strcpy_unalign1 - strcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign7 - .Lstrcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign6 - .Lstrcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign5 - .Lstrcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign4 - .Lstrcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign3 - .Lstrcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign2 - .Lstrcpy_unaligned_branchtable)/2) + .byte ((.Lstrcpy_unalign1 - .Lstrcpy_unaligned_branchtable)/2) .p2align 2 // Can read 7 bytes before possibly crossing a page. 
-strcpy_unalign7: +.Lstrcpy_unalign7: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .Lstrcpy_zero_in_first_register ldrb r3, [r1] - cbz r3, strcpy_unalign7_copy5bytes + cbz r3, .Lstrcpy_unalign7_copy5bytes ldrb r4, [r1, #1] - cbz r4, strcpy_unalign7_copy6bytes + cbz r4, .Lstrcpy_unalign7_copy6bytes ldrb r5, [r1, #2] - cbz r5, strcpy_unalign7_copy7bytes + cbz r5, .Lstrcpy_unalign7_copy7bytes ldr r3, [r1], #4 pld [r1, #64] lsrs ip, r3, #24 stmia r0!, {r2, r3} - beq strcpy_unalign_return - b strcpy_unalign7 + beq .Lstrcpy_unalign_return + b .Lstrcpy_unalign7 -strcpy_unalign7_copy5bytes: +.Lstrcpy_unalign7_copy5bytes: stmia r0!, {r2} strb r3, [r0] -strcpy_unalign_return: +.Lstrcpy_unalign_return: m_ret inst=pop -strcpy_unalign7_copy6bytes: +.Lstrcpy_unalign7_copy6bytes: stmia r0!, {r2} strb r3, [r0], #1 strb r4, [r0], #1 m_ret inst=pop -strcpy_unalign7_copy7bytes: +.Lstrcpy_unalign7_copy7bytes: stmia r0!, {r2} strb r3, [r0], #1 strb r4, [r0], #1 @@ -320,30 +327,30 @@ strcpy_unalign7_copy7bytes: .p2align 2 // Can read 6 bytes before possibly crossing a page. -strcpy_unalign6: +.Lstrcpy_unalign6: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .Lstrcpy_zero_in_first_register ldrb r4, [r1] - cbz r4, strcpy_unalign_copy5bytes + cbz r4, .Lstrcpy_unalign_copy5bytes ldrb r5, [r1, #1] - cbz r5, strcpy_unalign_copy6bytes + cbz r5, .Lstrcpy_unalign_copy6bytes ldr r3, [r1], #4 pld [r1, #64] tst r3, #0xff0000 - beq strcpy_unalign6_copy7bytes + beq .Lstrcpy_unalign6_copy7bytes lsrs ip, r3, #24 stmia r0!, {r2, r3} - beq strcpy_unalign_return - b strcpy_unalign6 + beq .Lstrcpy_unalign_return + b .Lstrcpy_unalign6 -strcpy_unalign6_copy7bytes: +.Lstrcpy_unalign6_copy7bytes: stmia r0!, {r2} strh r3, [r0], #2 lsr r3, #16 @@ -352,16 +359,16 @@ strcpy_unalign6_copy7bytes: .p2align 2 // Can read 5 bytes before possibly crossing a page. 
-strcpy_unalign5: +.Lstrcpy_unalign5: ldr r2, [r1], #4 sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .Lstrcpy_zero_in_first_register ldrb r4, [r1] - cbz r4, strcpy_unalign_copy5bytes + cbz r4, .Lstrcpy_unalign_copy5bytes ldr r3, [r1], #4 @@ -370,17 +377,17 @@ strcpy_unalign5: sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .Lstrcpy_zero_in_second_register stmia r0!, {r2, r3} - b strcpy_unalign5 + b .Lstrcpy_unalign5 -strcpy_unalign_copy5bytes: +.Lstrcpy_unalign_copy5bytes: stmia r0!, {r2} strb r4, [r0] m_ret inst=pop -strcpy_unalign_copy6bytes: +.Lstrcpy_unalign_copy6bytes: stmia r0!, {r2} strb r4, [r0], #1 strb r5, [r0] @@ -388,13 +395,13 @@ strcpy_unalign_copy6bytes: .p2align 2 // Can read 4 bytes before possibly crossing a page. -strcpy_unalign4: +.Lstrcpy_unalign4: ldmia r1!, {r2} sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .Lstrcpy_zero_in_first_register ldmia r1!, {r3} pld [r1, #64] @@ -402,20 +409,20 @@ strcpy_unalign4: sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .Lstrcpy_zero_in_second_register stmia r0!, {r2, r3} - b strcpy_unalign4 + b .Lstrcpy_unalign4 .p2align 2 // Can read 3 bytes before possibly crossing a page. 
-strcpy_unalign3: +.Lstrcpy_unalign3: ldrb r2, [r1] - cbz r2, strcpy_unalign3_copy1byte + cbz r2, .Lstrcpy_unalign3_copy1byte ldrb r3, [r1, #1] - cbz r3, strcpy_unalign3_copy2bytes + cbz r3, .Lstrcpy_unalign3_copy2bytes ldrb r4, [r1, #2] - cbz r4, strcpy_unalign3_copy3bytes + cbz r4, .Lstrcpy_unalign3_copy3bytes ldr r2, [r1], #4 ldr r3, [r1], #4 @@ -423,26 +430,26 @@ strcpy_unalign3: pld [r1, #64] lsrs lr, r2, #24 - beq strcpy_unalign_copy4bytes + beq .Lstrcpy_unalign_copy4bytes sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .Lstrcpy_zero_in_second_register stmia r0!, {r2, r3} - b strcpy_unalign3 + b .Lstrcpy_unalign3 -strcpy_unalign3_copy1byte: +.Lstrcpy_unalign3_copy1byte: strb r2, [r0] m_ret inst=pop -strcpy_unalign3_copy2bytes: +.Lstrcpy_unalign3_copy2bytes: strb r2, [r0], #1 strb r3, [r0] m_ret inst=pop -strcpy_unalign3_copy3bytes: +.Lstrcpy_unalign3_copy3bytes: strb r2, [r0], #1 strb r3, [r0], #1 strb r4, [r0] @@ -450,34 +457,34 @@ strcpy_unalign3_copy3bytes: .p2align 2 // Can read 2 bytes before possibly crossing a page. -strcpy_unalign2: +.Lstrcpy_unalign2: ldrb r2, [r1] - cbz r2, strcpy_unalign_copy1byte + cbz r2, .Lstrcpy_unalign_copy1byte ldrb r3, [r1, #1] - cbz r3, strcpy_unalign_copy2bytes + cbz r3, .Lstrcpy_unalign_copy2bytes ldr r2, [r1], #4 ldr r3, [r1], #4 pld [r1, #64] tst r2, #0xff0000 - beq strcpy_unalign_copy3bytes + beq .Lstrcpy_unalign_copy3bytes lsrs ip, r2, #24 - beq strcpy_unalign_copy4bytes + beq .Lstrcpy_unalign_copy4bytes sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .Lstrcpy_zero_in_second_register stmia r0!, {r2, r3} - b strcpy_unalign2 + b .Lstrcpy_unalign2 .p2align 2 // Can read 1 byte before possibly crossing a page. 
-strcpy_unalign1: +.Lstrcpy_unalign1: ldrb r2, [r1] - cbz r2, strcpy_unalign_copy1byte + cbz r2, .Lstrcpy_unalign_copy1byte ldr r2, [r1], #4 ldr r3, [r1], #4 @@ -487,62 +494,62 @@ strcpy_unalign1: sub ip, r2, #0x01010101 bic ip, ip, r2 ands ip, ip, #0x80808080 - bne strcpy_zero_in_first_register + bne .Lstrcpy_zero_in_first_register sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcpy_zero_in_second_register + bne .Lstrcpy_zero_in_second_register stmia r0!, {r2, r3} - b strcpy_unalign1 + b .Lstrcpy_unalign1 -strcpy_unalign_copy1byte: +.Lstrcpy_unalign_copy1byte: strb r2, [r0] m_ret inst=pop -strcpy_unalign_copy2bytes: +.Lstrcpy_unalign_copy2bytes: strb r2, [r0], #1 strb r3, [r0] m_ret inst=pop -strcpy_unalign_copy3bytes: +.Lstrcpy_unalign_copy3bytes: strh r2, [r0], #2 lsr r2, #16 strb r2, [r0] m_ret inst=pop -strcpy_unalign_copy4bytes: +.Lstrcpy_unalign_copy4bytes: stmia r0, {r2} m_ret inst=pop -strcat_align_src: +.Lstrcat_align_src: // Align to a double word (64 bits). 
rsb r3, r3, #8 lsls ip, r3, #31 - beq strcat_align_to_32 + beq .Lstrcat_align_to_32 ldrb r2, [r0], #1 - cbz r2, strcat_r0_update + cbz r2, .Lstrcat_r0_update -strcat_align_to_32: - bcc strcat_align_to_64 +.Lstrcat_align_to_32: + bcc .Lstrcat_align_to_64 ldrb r2, [r0], #1 - cbz r2, strcat_r0_update + cbz r2, .Lstrcat_r0_update ldrb r2, [r0], #1 - cbz r2, strcat_r0_update + cbz r2, .Lstrcat_r0_update -strcat_align_to_64: +.Lstrcat_align_to_64: tst r3, #4 - beq strcat_mainloop + beq .Lstrcat_mainloop ldr r3, [r0], #4 sub ip, r3, #0x01010101 bic ip, ip, r3 ands ip, ip, #0x80808080 - bne strcat_zero_in_second_register - b strcat_mainloop + bne .Lstrcat_zero_in_second_register + b .Lstrcat_mainloop -strcat_r0_update: +.Lstrcat_r0_update: sub r0, r0, #1 - b strcat_r0_scan_done + b .Lstrcat_r0_scan_done END(strcat) diff --git a/libc/arch-arm/cortex-a9/bionic/string_copy.S b/libc/arch-arm/cortex-a9/bionic/string_copy.S index caf5a11..642db0f 100644 --- a/libc/arch-arm/cortex-a9/bionic/string_copy.S +++ b/libc/arch-arm/cortex-a9/bionic/string_copy.S @@ -244,13 +244,20 @@ ENTRY(strcpy) .Lstringcopy_align_to_64: tst r3, #4 beq .Lstringcopy_check_src_align - ldr r2, [r1], #4 - - sub ip, r2, #0x01010101 - bic ip, ip, r2 - ands ip, ip, #0x80808080 - bne .Lstringcopy_zero_in_first_register - stmia r0!, {r2} + // Read one byte at a time since we don't have any idea about the alignment + // of the source and we don't want to read into a different page. 
+ ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete + ldrb r2, [r1], #1 + strb r2, [r0], #1 + cbz r2, .Lstringcopy_complete b .Lstringcopy_check_src_align .Lstringcopy_complete: diff --git a/libc/arch-arm/cortex-a9/cortex-a9.mk b/libc/arch-arm/cortex-a9/cortex-a9.mk index 7b38de1..db4bcc7 100644 --- a/libc/arch-arm/cortex-a9/cortex-a9.mk +++ b/libc/arch-arm/cortex-a9/cortex-a9.mk @@ -10,6 +10,7 @@ libc_bionic_src_files_arm += \ arch-arm/cortex-a9/bionic/strlen.S \ libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ arch-arm/generic/bionic/memcmp.S \ libc_bionic_src_files_arm += \ diff --git a/libc/arch-arm/denver/denver.mk b/libc/arch-arm/denver/denver.mk index 5fddf95..e81f8c7 100644 --- a/libc/arch-arm/denver/denver.mk +++ b/libc/arch-arm/denver/denver.mk @@ -1,4 +1,5 @@ libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ arch-arm/generic/bionic/memcmp.S \ arch-arm/denver/bionic/memcpy.S \ arch-arm/denver/bionic/memmove.S \ diff --git a/libc/arch-arm/generic/bionic/memchr.S b/libc/arch-arm/generic/bionic/memchr.S new file mode 100644 index 0000000..cb00d82 --- /dev/null +++ b/libc/arch-arm/generic/bionic/memchr.S @@ -0,0 +1,155 @@ +/* Copyright (c) 2010-2015, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Written by Dave Gilbert <david.gilbert@linaro.org> + + This memchr routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. It has a fast path for short sizes, and has + an optimised path for large data sets; the worst case is finding the + match early in a large data set. 
+ + */ + +#include <private/bionic_asm.h> + +@ 2011-02-07 david.gilbert@linaro.org +@ Extracted from local git a5b438d861 +@ 2011-07-14 david.gilbert@linaro.org +@ Import endianness fix from local git ea786f1b +@ 2011-12-07 david.gilbert@linaro.org +@ Removed unneeded cbz from align loop + + .syntax unified + .arch armv7-a + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + .text + .thumb + +@ --------------------------------------------------------------------------- + .thumb_func +ENTRY(memchr) + .p2align 4,,15 + @ r0 = start of memory to scan + @ r1 = character to look for + @ r2 = length + @ returns r0 = pointer to character or NULL if not found + and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char + + cmp r2,#16 @ If it's short don't bother with anything clever + blt 20f + + tst r0, #7 @ If it's already aligned skip the next bit + beq 10f + + @ Work up to an aligned point +5: + ldrb r3, [r0],#1 + subs r2, r2, #1 + cmp r3, r1 + beq 50f @ If it matches exit found + tst r0, #7 + bne 5b @ If not aligned yet then do next byte + +10: + @ At this point, we are aligned, we know we have at least 8 bytes to work with + push {r4,r5,r6,r7} + orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes + orr r1, r1, r1, lsl #16 + bic r4, r2, #7 @ Number of bytes in the whole double words to work with + mvns r7, #0 @ all F's + movs r3, #0 + +15: + ldrd r5,r6,[r0],#8 + subs r4, r4, #8 + eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target + eor r6,r6, r1 + uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r5, r3, r7 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r6, r5, r7 @ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION + cbnz 
r6, 60f + bne 15b @ (Flags from the subs above) If not run out of bytes then go around again + + pop {r4,r5,r6,r7} + and r1,r1,#0xff @ Get r1 back to a single character from the expansion above + and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done + +20: + cbz r2, 40f @ 0 length or hit the end already then not found + +21: @ Post aligned section, or just a short call + ldrb r3,[r0],#1 + subs r2,r2,#1 + eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub + cbz r3, 50f + bne 21b @ on r2 flags + +40: + movs r0,#0 @ not found + bx lr + +50: + subs r0,r0,#1 @ found + bx lr + +60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was + @ r0 points to the start of the double word after the one that was tested + @ r5 has the 00/ff pattern for the first word, r6 has the chained value + cmp r5, #0 + itte eq + moveq r5, r6 @ the end is in the 2nd word + subeq r0,r0,#3 @ Points to 2nd byte of 2nd word + subne r0,r0,#7 @ or 2nd byte of 1st word + + @ r0 currently points to the 2nd byte of the word containing the hit + tst r5, # CHARTSTMASK(0) @ 1st character + bne 61f + adds r0,r0,#1 + tst r5, # CHARTSTMASK(1) @ 2nd character + ittt eq + addeq r0,r0,#1 + tsteq r5, # (3<<15) @ 2nd & 3rd character + @ If not the 3rd must be the last one + addeq r0,r0,#1 + +61: + pop {r4,r5,r6,r7} + subs r0,r0,#1 + bx lr +END(memchr) diff --git a/libc/arch-arm/generic/bionic/memcmp.S b/libc/arch-arm/generic/bionic/memcmp.S index c78dbd4..6643d55 100644 --- a/libc/arch-arm/generic/bionic/memcmp.S +++ b/libc/arch-arm/generic/bionic/memcmp.S @@ -221,8 +221,7 @@ ENTRY(memcmp) bne 8b 9: /* restore registers and return */ - ldmfd sp!, {r4, lr} - bx lr + ldmfd sp!, {r4, pc} 10: /* process less than 12 bytes */ cmp r2, #0 diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S index ea5a399..65cba4c 100644 --- a/libc/arch-arm/generic/bionic/memcpy.S +++ 
b/libc/arch-arm/generic/bionic/memcpy.S @@ -194,8 +194,7 @@ ENTRY(memcpy) /* we're done! restore everything and return */ 1: ldmfd sp!, {r5-r11} - ldmfd sp!, {r0, r4, lr} - bx lr + ldmfd sp!, {r0, r4, pc} /********************************************************************/ @@ -385,8 +384,7 @@ ENTRY(memcpy) /* we're done! restore sp and spilled registers and return */ add sp, sp, #28 - ldmfd sp!, {r0, r4, lr} - bx lr + ldmfd sp!, {r0, r4, pc} END(memcpy) // Only reached when the __memcpy_chk check fails. diff --git a/libc/arch-arm/generic/bionic/memset.S b/libc/arch-arm/generic/bionic/memset.S index d17a9c4..b8eabbf 100644 --- a/libc/arch-arm/generic/bionic/memset.S +++ b/libc/arch-arm/generic/bionic/memset.S @@ -82,8 +82,7 @@ ENTRY(memset) strbcs r1, [r0], #1 strbmi r1, [r0], #1 subs r2, r2, r3 - popls {r0, r4-r7, lr} /* return */ - bxls lr + popls {r0, r4-r7, pc} /* return */ /* align the destination to a cache-line */ mov r12, r1 @@ -126,8 +125,7 @@ ENTRY(memset) strhmi r1, [r0], #2 movs r2, r2, lsl #2 strbcs r1, [r0] - ldmfd sp!, {r0, r4-r7, lr} - bx lr + ldmfd sp!, {r0, r4-r7, pc} END(memset) .data diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk index e49d6d2..016c882 100644 --- a/libc/arch-arm/generic/generic.mk +++ b/libc/arch-arm/generic/generic.mk @@ -1,4 +1,5 @@ libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ arch-arm/generic/bionic/memcmp.S \ arch-arm/generic/bionic/memcpy.S \ arch-arm/generic/bionic/memset.S \ diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S index 246f159..1a39c5b 100644 --- a/libc/arch-arm/krait/bionic/__strcat_chk.S +++ b/libc/arch-arm/krait/bionic/__strcat_chk.S @@ -40,7 +40,7 @@ ENTRY(__strcat_chk) pld [r0, #0] push {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 push {r4, r5} @@ -177,7 +177,7 @@ ENTRY(__strcat_chk) .L_strlen_done: add r2, r3, r4 cmp r2, lr - bhi 
__strcat_chk_failed + bhi .L_strcat_chk_failed // Set up the registers for the memcpy code. mov r1, r5 @@ -185,20 +185,17 @@ ENTRY(__strcat_chk) mov r2, r4 add r0, r0, r3 pop {r4, r5} -END(__strcat_chk) + .cfi_adjust_cfa_offset -8 + .cfi_restore r4 + .cfi_restore r5 -#define MEMCPY_BASE __strcat_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__strcat_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 + // Undo the above cfi directives. .cfi_adjust_cfa_offset 8 .cfi_rel_offset r4, 0 .cfi_rel_offset r5, 4 - +.L_strcat_chk_failed: ldr r0, error_message ldr r1, error_code 1: @@ -208,7 +205,7 @@ error_code: .word BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__strcat_chk_failed) +END(__strcat_chk) .data error_string: diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S index db76686..00202f3 100644 --- a/libc/arch-arm/krait/bionic/__strcpy_chk.S +++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__strcpy_chk) pld [r0, #0] push {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 @@ -149,21 +149,14 @@ ENTRY(__strcpy_chk) pld [r1, #64] ldr r0, [sp] cmp r3, lr - bhs __strcpy_chk_failed + bhs .L_strcpy_chk_failed // Add 1 for copy length to get the string terminator. 
add r2, r3, #1 -END(__strcpy_chk) -#define MEMCPY_BASE __strcpy_chk_memcpy_base -#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__strcpy_chk_failed) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - +.L_strcpy_chk_failed: ldr r0, error_message ldr r1, error_code 1: @@ -173,7 +166,7 @@ error_code: .word BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__strcpy_chk_failed) +END(__strcpy_chk) .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S index 9ff46a8..5d27b57 100644 --- a/libc/arch-arm/krait/bionic/memcpy.S +++ b/libc/arch-arm/krait/bionic/memcpy.S @@ -45,7 +45,7 @@ ENTRY(__memcpy_chk) cmp r2, r3 - bhi __memcpy_chk_fail + bhi .L_memcpy_chk_fail // Fall through to memcpy... END(__memcpy_chk) @@ -53,19 +53,20 @@ END(__memcpy_chk) ENTRY(memcpy) pld [r1, #64] stmfd sp!, {r0, lr} - .cfi_def_cfa_offset 8 + .cfi_adjust_cfa_offset 8 .cfi_rel_offset r0, 0 .cfi_rel_offset lr, 4 -END(memcpy) -#define MEMCPY_BASE __memcpy_base -#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned #include "memcpy_base.S" -ENTRY_PRIVATE(__memcpy_chk_fail) + // Undo the cfi directives from above. + .cfi_adjust_cfa_offset -8 + .cfi_restore r0 + .cfi_restore lr +.L_memcpy_chk_fail: // Preserve lr for backtrace. push {lr} - .cfi_def_cfa_offset 4 + .cfi_adjust_cfa_offset 4 .cfi_rel_offset lr, 0 ldr r0, error_message @@ -77,7 +78,7 @@ error_code: .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW error_message: .word error_string-(1b+4) -END(__memcpy_chk_fail) +END(memcpy) .data error_string: diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S index 035dcf1..76c5a84 100644 --- a/libc/arch-arm/krait/bionic/memcpy_base.S +++ b/libc/arch-arm/krait/bionic/memcpy_base.S @@ -1,123 +1,191 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - - -/* - * This code assumes it is running on a processor that supports all arm v7 - * instructions, that supports neon instructions, and that has a 32 byte - * cache line. - */ - -// Assumes neon instructions and a cache line size of 32 bytes. 
- -ENTRY_PRIVATE(MEMCPY_BASE) - .cfi_def_cfa_offset 8 - .cfi_rel_offset r0, 0 - .cfi_rel_offset lr, 4 - - /* do we have at least 16-bytes to copy (needed for alignment below) */ - cmp r2, #16 - blo 5f - - /* align destination to cache-line for the write-buffer */ - rsb r3, r0, #0 - ands r3, r3, #0xF - beq 2f - - /* copy up to 15-bytes (count in r3) */ - sub r2, r2, r3 - movs ip, r3, lsl #31 - itt mi - ldrbmi lr, [r1], #1 - strbmi lr, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1], #1 - strbcs ip, [r0], #1 - strbcs lr, [r0], #1 - movs ip, r3, lsl #29 - bge 1f - // copies 4 bytes, destination 32-bits aligned - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! -1: bcc 2f - // copies 8 bytes, destination 64-bits aligned - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0, :64]! - -2: /* make sure we have at least 64 bytes to copy */ - subs r2, r2, #64 - blo 2f - -1: /* The main loop copies 64 bytes at a time */ - vld1.8 {d0 - d3}, [r1]! - vld1.8 {d4 - d7}, [r1]! - pld [r1, #(32*8)] - subs r2, r2, #64 - vst1.8 {d0 - d3}, [r0, :128]! - vst1.8 {d4 - d7}, [r0, :128]! - bhs 1b - -2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ - adds r2, r2, #32 - blo 4f - - /* Copy 32 bytes. These cache lines were already preloaded */ - vld1.8 {d0 - d3}, [r1]! - sub r2, r2, #32 - vst1.8 {d0 - d3}, [r0, :128]! - -4: /* less than 32 left */ - add r2, r2, #32 - tst r2, #0x10 - beq 5f - // copies 16 bytes, 128-bits aligned - vld1.8 {d0, d1}, [r1]! - vst1.8 {d0, d1}, [r0, :128]! - -5: /* copy up to 15-bytes (count in r2) */ - movs ip, r2, lsl #29 - bcc 1f - vld1.8 {d0}, [r1]! - vst1.8 {d0}, [r0]! -1: bge 2f - vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! 
-2: movs ip, r2, lsl #31 - itt mi - ldrbmi r3, [r1], #1 - strbmi r3, [r0], #1 - itttt cs - ldrbcs ip, [r1], #1 - ldrbcs lr, [r1], #1 - strbcs ip, [r0], #1 - strbcs lr, [r0], #1 - - ldmfd sp!, {r0, lr} - bx lr -END(MEMCPY_BASE) +/*************************************************************************** + Copyright (c) 2009-2013 The Linux Foundation. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of The Linux Foundation nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ + +/* Assumes neon instructions and a cache line size of 64 bytes. 
*/ + +#include <machine/cpu-features.h> +#include <machine/asm.h> + +#define PLDOFFS (10) +#define PLDTHRESH (PLDOFFS) +#define BBTHRESH (4096/64) +#define PLDSIZE (64) + +#if (PLDOFFS < 1) +#error Routine does not support offsets less than 1 +#endif + +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif + + .text + .fpu neon + +.L_memcpy_base: + cmp r2, #4 + blt .L_neon_lt4 + cmp r2, #16 + blt .L_neon_lt16 + cmp r2, #32 + blt .L_neon_16 + cmp r2, #64 + blt .L_neon_copy_32_a + + mov r12, r2, lsr #6 + cmp r12, #PLDTHRESH + ble .L_neon_copy_64_loop_nopld + + push {r9, r10} + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset r9, 0 + .cfi_rel_offset r10, 4 + + cmp r12, #BBTHRESH + ble .L_neon_prime_pump + + add lr, r0, #0x400 + add r9, r1, #(PLDOFFS*PLDSIZE) + sub lr, lr, r9 + lsl lr, lr, #21 + lsr lr, lr, #21 + add lr, lr, #(PLDOFFS*PLDSIZE) + cmp r12, lr, lsr #6 + ble .L_neon_prime_pump + + itt gt + movgt r9, #(PLDOFFS) + rsbsgt r9, r9, lr, lsr #6 + ble .L_neon_prime_pump + + add r10, r1, lr + bic r10, #0x3F + + sub r12, r12, lr, lsr #6 + + cmp r9, r12 + itee le + suble r12, r12, r9 + movgt r9, r12 + movgt r12, #0 + + pld [r1, #((PLDOFFS-1)*PLDSIZE)] +.L_neon_copy_64_loop_outer_doublepld: + pld [r1, #((PLDOFFS)*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r9, r9, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .L_neon_copy_64_loop_outer_doublepld + cmp r12, #0 + beq .L_neon_pop_before_nopld + + cmp r12, #(512*1024/64) + blt .L_neon_copy_64_loop_outer + +.L_neon_copy_64_loop_ddr: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + pld [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! 
+ add r10, #64 + bne .L_neon_copy_64_loop_ddr + b .L_neon_pop_before_nopld + +.L_neon_prime_pump: + mov lr, #(PLDOFFS*PLDSIZE) + add r10, r1, #(PLDOFFS*PLDSIZE) + bic r10, #0x3F + sub r12, r12, #PLDOFFS + ldr r3, [r10, #(-1*PLDSIZE)] + +.L_neon_copy_64_loop_outer: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + ldr r3, [r10] + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + add r10, #64 + bne .L_neon_copy_64_loop_outer + +.L_neon_pop_before_nopld: + mov r12, lr, lsr #6 + pop {r9, r10} + .cfi_adjust_cfa_offset -8 + .cfi_restore r9 + .cfi_restore r10 + +.L_neon_copy_64_loop_nopld: + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .L_neon_copy_64_loop_nopld + ands r2, r2, #0x3f + beq .L_neon_exit + +.L_neon_copy_32_a: + movs r3, r2, lsl #27 + bcc .L_neon_16 + vld1.32 {q0,q1}, [r1]! + vst1.32 {q0,q1}, [r0]! + +.L_neon_16: + bpl .L_neon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! + ands r2, r2, #0x0f + beq .L_neon_exit + +.L_neon_lt16: + movs r3, r2, lsl #29 + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: + bge .L_neon_lt4 + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! + +.L_neon_lt4: + movs r2, r2, lsl #31 + itt cs + ldrhcs r3, [r1], #2 + strhcs r3, [r0], #2 + itt mi + ldrbmi r3, [r1] + strbmi r3, [r0] + +.L_neon_exit: + pop {r0, pc} diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S new file mode 100644 index 0000000..aea7315 --- /dev/null +++ b/libc/arch-arm/krait/bionic/memmove.S @@ -0,0 +1,219 @@ +/*************************************************************************** + Copyright (c) 2009-2014 The Linux Foundation. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of The Linux Foundation nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ ***************************************************************************/ + +/*************************************************************************** + * Neon memmove: Attempts to do a memmove with Neon registers if possible, + * Inputs: + * dest: The destination buffer + * src: The source buffer + * n: The size of the buffer to transfer + * Outputs: + * + ***************************************************************************/ + +#include <private/bionic_asm.h> +#include <private/libc_events.h> +/* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true + * TARGET_USE_KRAIT_PLD_SET := true + * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> + * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> + * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> + */ +#ifndef PLDOFFS +#define PLDOFFS (10) +#endif +#ifndef PLDTHRESH +#define PLDTHRESH (PLDOFFS) +#endif +#if (PLDOFFS < 5) +#error Routine does not support offsets less than 5 +#endif +#if (PLDTHRESH < PLDOFFS) +#error PLD threshold must be greater than or equal to the PLD offset +#endif +#ifndef PLDSIZE +#define PLDSIZE (64) +#endif + + .text + .syntax unified + .fpu neon + .thumb + .thumb_func + +//ENTRY(bcopy) +// //.cfi_startproc +// mov r12, r0 +// mov r0, r1 +// mov r1, r12 +// // Fall through to memmove +// //.cfi_endproc +//END(bcopy) + +ENTRY(memmove) +_memmove_words: + //.cfi_startproc + .save {r0, lr} + cmp r2, #0 + it ne + subsne r12, r0, r1 // Warning: do not combine these "it" blocks + it eq + bxeq lr +// memmove only if r1 < r0 < r1+r2 + cmp r0, r1 + itt ge + addge r12, r1, r2 + cmpge r12, r0 + it le + ble memcpy + cmp r2, #4 + it le + ble .Lneon_b2f_smallcopy_loop + push {r0, lr} + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #64 + it ge + bge .Lneon_b2f_copy_64 + cmp r2, #32 + it ge + bge .Lneon_b2f_copy_32 + cmp r2, #8 + it ge + bge .Lneon_b2f_copy_8 + b .Lneon_b2f_copy_1 +.Lneon_b2f_copy_64: + mov r12, r2, 
lsr #6 + add r0, r0, #32 + add r1, r1, #32 + cmp r12, #PLDTHRESH + it le + ble .Lneon_b2f_copy_64_loop_nopld + sub r12, #PLDOFFS + sub lr, r1, #(PLDOFFS)*PLDSIZE +.Lneon_b2f_copy_64_loop_outer: + pld [lr] + sub r1, r1, #96 + sub r0, r0, #96 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + sub lr, lr, #64 + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0] + it ne + bne .Lneon_b2f_copy_64_loop_outer + mov r12, #PLDOFFS +.Lneon_b2f_copy_64_loop_nopld: + sub r1, r1, #96 + sub r0, r0, #96 + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1] + subs r12, r12, #1 + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0] + it ne + bne .Lneon_b2f_copy_64_loop_nopld + ands r2, r2, #0x3f + it eq + beq .Lneon_memmove_done + sub r1, r1, #32 + sub r0, r0, #32 + cmp r2, #32 + it lt + blt .Lneon_b2f_copy_8 +.Lneon_b2f_copy_32: + sub r1, r1, #32 + sub r0, r0, #32 + vld1.32 {q0, q1}, [r1] + vst1.32 {q0, q1}, [r0] + ands r2, r2, #0x1f + it eq + beq .Lneon_memmove_done +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + it eq + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + it ne + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_1: + movs r12, r2, lsl #29 + itttt mi + submi r1, r1, #4 + submi r0, r0, #4 + ldrmi r3, [r1] + strmi r3, [r0] + movs r2, r2, lsl #31 + itttt cs + subcs r1, r1, #2 + subcs r0, r0, #2 + ldrhcs r3, [r1] + strhcs r3, [r0] + itttt mi + submi r1, r1, #1 + submi r0, r0, #1 + ldrbmi r12, [r1] + strbmi r12, [r0] +.Lneon_memmove_done: + pop {r0, pc} +.Lneon_b2f_smallcopy_loop: + // 4 bytes or less + add r1, r1, r2 + add r0, r0, r2 + movs r12, r2, lsl #29 + itttt mi + submi r1, r1, #4 + submi r0, r0, #4 + ldrmi r3, [r1] + strmi r3, [r0] + movs r2, r2, lsl #31 + itttt cs + subcs r1, r1, #2 + subcs r0, r0, #2 + ldrhcs r3, [r1] + strhcs r3, [r0] + itttt mi + submi r1, r1, #1 + submi r0, r0, #1 + ldrbmi r12, [r1] + strbmi 
r12, [r0] + bx lr +// .cfi_endproc +END(memmove) + diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S index a4fbe17..ae05965 100644 --- a/libc/arch-arm/krait/bionic/memset.S +++ b/libc/arch-arm/krait/bionic/memset.S @@ -69,10 +69,7 @@ END(bzero) /* memset() returns its first argument. */ ENTRY(memset) - stmfd sp!, {r0} - .cfi_def_cfa_offset 4 - .cfi_rel_offset r0, 0 - + mov r3, r0 vdup.8 q0, r1 /* make sure we have at least 32 bytes to write */ @@ -82,7 +79,7 @@ ENTRY(memset) 1: /* The main loop writes 32 bytes at a time */ subs r2, r2, #32 - vst1.8 {d0 - d3}, [r0]! + vst1.8 {d0 - d3}, [r3]! bhs 1b 2: /* less than 32 left */ @@ -91,18 +88,17 @@ ENTRY(memset) beq 3f // writes 16 bytes, 128-bits aligned - vst1.8 {d0, d1}, [r0]! + vst1.8 {d0, d1}, [r3]! 3: /* write up to 15-bytes (count in r2) */ movs ip, r2, lsl #29 bcc 1f - vst1.8 {d0}, [r0]! + vst1.8 {d0}, [r3]! 1: bge 2f - vst1.32 {d0[0]}, [r0]! + vst1.32 {d0[0]}, [r3]! 2: movs ip, r2, lsl #31 - strbmi r1, [r0], #1 - strbcs r1, [r0], #1 - strbcs r1, [r0], #1 - ldmfd sp!, {r0} + strbmi r1, [r3], #1 + strbcs r1, [r3], #1 + strbcs r1, [r3], #1 bx lr END(memset) diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk index 88b4d66..5f5b414 100644 --- a/libc/arch-arm/krait/krait.mk +++ b/libc/arch-arm/krait/krait.mk @@ -1,9 +1,19 @@ libc_bionic_src_files_arm += \ - arch-arm/krait/bionic/memcpy.S \ arch-arm/krait/bionic/memset.S \ arch-arm/krait/bionic/strcmp.S \ arch-arm/krait/bionic/__strcat_chk.S \ arch-arm/krait/bionic/__strcpy_chk.S \ + arch-arm/krait/bionic/memmove.S + +#For some targets we don't need this optimization. +#Corresponding flag is defined in device specific folder. 
+ifeq ($(TARGET_CPU_MEMCPY_BASE_OPT_DISABLE),true) +libc_bionic_src_files_arm += \ + arch-arm/cortex-a15/bionic/memcpy.S +else +libc_bionic_src_files_arm += \ + arch-arm/krait/bionic/memcpy.S +endif # Use cortex-a15 versions of strcat/strcpy/strlen and standard memmove libc_bionic_src_files_arm += \ @@ -13,7 +23,7 @@ libc_bionic_src_files_arm += \ arch-arm/cortex-a15/bionic/strlen.S \ libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ arch-arm/generic/bionic/memcmp.S \ -libc_bionic_src_files_arm += \ - arch-arm/denver/bionic/memmove.S \ + diff --git a/libc/arch-arm/scorpion/scorpion.mk b/libc/arch-arm/scorpion/scorpion.mk new file mode 100644 index 0000000..ce18a7e --- /dev/null +++ b/libc/arch-arm/scorpion/scorpion.mk @@ -0,0 +1,18 @@ +# Use krait versions of memset/strcmp/memmove +libc_bionic_src_files_arm += \ + arch-arm/krait/bionic/memset.S \ + arch-arm/krait/bionic/strcmp.S \ + arch-arm/krait/bionic/memmove.S + +libc_bionic_src_files_arm += \ + arch-arm/cortex-a15/bionic/memcpy.S \ + arch-arm/cortex-a15/bionic/stpcpy.S \ + arch-arm/cortex-a15/bionic/strcat.S \ + arch-arm/cortex-a15/bionic/__strcat_chk.S \ + arch-arm/cortex-a15/bionic/strcpy.S \ + arch-arm/cortex-a15/bionic/__strcpy_chk.S \ + arch-arm/cortex-a15/bionic/strlen.S + +libc_bionic_src_files_arm += \ + arch-arm/generic/bionic/memchr.S \ + arch-arm/generic/bionic/memcmp.S diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk index 470a038..1b8d534 100644 --- a/libc/arch-arm64/arm64.mk +++ b/libc/arch-arm64/arm64.mk @@ -8,7 +8,6 @@ libc_bionic_src_files_arm64 += \ bionic/__memset_chk.cpp \ bionic/__strcpy_chk.cpp \ bionic/__strcat_chk.cpp \ - bionic/strrchr.cpp \ libc_freebsd_src_files_arm64 += \ upstream-freebsd/lib/libc/string/wcscat.c \ diff --git a/libc/arch-arm64/denver64/denver64.mk b/libc/arch-arm64/denver64/denver64.mk index d619c11..3c453bb 100644 --- a/libc/arch-arm64/denver64/denver64.mk +++ b/libc/arch-arm64/denver64/denver64.mk @@ -11,4 +11,5 @@ 
libc_bionic_src_files_arm64 += \ arch-arm64/generic/bionic/strlen.S \ arch-arm64/generic/bionic/strncmp.S \ arch-arm64/generic/bionic/strnlen.S \ + arch-arm64/generic/bionic/strrchr.S \ arch-arm64/generic/bionic/wmemmove.S diff --git a/libc/arch-arm64/generic/bionic/memcpy_base.S b/libc/arch-arm64/generic/bionic/memcpy_base.S index c5d42ce..f850624 100644 --- a/libc/arch-arm64/generic/bionic/memcpy_base.S +++ b/libc/arch-arm64/generic/bionic/memcpy_base.S @@ -1,4 +1,4 @@ -/* Copyright (c) 2012, Linaro Limited +/* Copyright (c) 2012-2013, Linaro Limited All rights reserved. Redistribution and use in source and binary forms, with or without @@ -22,158 +22,196 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +/* + * Copyright (c) 2015 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses. * */ +#include <private/bionic_asm.h> + #define dstin x0 #define src x1 #define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 - - mov dst, dstin - cmp count, #64 - b.ge .Lcpy_not_short - cmp count, #15 - b.le .Ltail15tiny - - /* Deal with small copies quickly by dropping straight into the - * exit block. */ -.Ltail63: - /* Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. 
*/ - ands tmp1, count, #0x30 - b.eq .Ltail15 - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] - -.Ltail15: - ands count, count, #15 - beq 1f - add src, src, count - ldp A_l, A_h, [src, #-16] - add dst, dst, count - stp A_l, A_h, [dst, #-16] +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define A_hw w7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l src +#define E_h count +#define F_l srcend +#define F_h dst +#define tmp1 x9 + +#define L(l) .L ## l + +/* Copies are split into 3 main cases: small copies of up to 16 bytes, + medium copies of 17..96 bytes which are fully unrolled. Large copies + of more than 96 bytes align the destination and use an unrolled loop + processing 64 bytes per iteration. + Small and medium copies read all data before writing, allowing any + kind of overlap, and memmove tailcalls memcpy for these cases as + well as non-overlapping copies. +*/ + + prfm PLDL1KEEP, [src] + add srcend, src, count + add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) + cmp count, 96 + b.hi L(copy_long) + + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] 1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] ret -.Ltail15tiny: - /* Copy up to 15 bytes of data. Does not assume additional data - being copied. 
*/ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] + .p2align 4 + + /* Small copies: 0..16 bytes. */ +L(copy16): + cmp count, 8 + b.lo 1f + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + .p2align 4 1: + tbz count, 2, 1f + ldr A_lw, [src] + ldr A_hw, [srcend, -4] + str A_lw, [dstin] + str A_hw, [dstend, -4] ret -.Lcpy_not_short: - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Copy more data than needed; it's faster than jumping - * around copying sub-Quadword quantities. We know that - * it can't overrun. */ - ldp A_l, A_h, [src] - add src, src, tmp2 - stp A_l, A_h, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le .Ltail63 -2: - subs count, count, #128 - b.ge .Lcpy_body_large - /* Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne .Ltail63 + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ +1: + cbz count, 2f + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] +2: ret + + .p2align 4 + /* Copy 64..96 bytes. 
Copy 64 bytes from the start and + 32 bytes from the end. */ +L(copy96): + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [src, 32] + ldp D_l, D_h, [src, 48] + ldp E_l, E_h, [srcend, -32] + ldp F_l, F_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin, 32] + stp D_l, D_h, [dstin, 48] + stp E_l, E_h, [dstend, -32] + stp F_l, F_h, [dstend, -16] ret - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -.Lcpy_body_large: - /* There are at least 128 bytes to copy. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ + /* Align DST to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. */ + + .p2align 4 +L(copy_long): + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls 2f 1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! 
- subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne .Ltail63 + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the end even if + there is just 1 byte left. */ +2: + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] ret diff --git a/libc/arch-arm64/generic/bionic/memmove.S b/libc/arch-arm64/generic/bionic/memmove.S index 8b366a3..c50112d 100644 --- a/libc/arch-arm64/generic/bionic/memmove.S +++ b/libc/arch-arm64/generic/bionic/memmove.S @@ -1,4 +1,4 @@ -/* Copyright (c) 2014, Linaro Limited +/* Copyright (c) 2013, Linaro Limited All rights reserved. Redistribution and use in source and binary forms, with or without @@ -22,319 +22,131 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +/* + * Copyright (c) 2015 ARM Ltd + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses - * wchar_t is 4 bytes + * ARMv8-a, AArch64, unaligned accesses, wchar_t is 4 bytes */ #include <private/bionic_asm.h> /* Parameters and result. 
*/ -#ifdef BCOPY -#define origdstin x1 -#define origsrc x0 -#endif #define dstin x0 #define src x1 #define count x2 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define tmp3 x5 -#define tmp3w w5 -#define dst x6 - -#define A_l x7 -#define A_h x8 -#define B_l x9 -#define B_h x10 -#define C_l x11 -#define C_h x12 -#define D_l x13 -#define D_h x14 +#define srcend x3 +#define dstend x4 +#define tmp1 x5 +#define A_l x6 +#define A_h x7 +#define B_l x8 +#define B_h x9 +#define C_l x10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l count +#define E_h tmp1 + +/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps. + Larger backwards copies are also handled by memcpy. The only remaining + case is forward large copies. The destination is aligned, and an + unrolled loop processes 64 bytes per iteration. +*/ -#ifdef BCOPY -ENTRY(bcopy) - /* Swap src and dst so that a branch to memcpy doesn't cause issues. */ - mov tmp1, origsrc - mov origsrc, origdstin - mov origdstin, tmp1 -#elif defined(WMEMMOVE) +#if defined(WMEMMOVE) ENTRY(wmemmove) lsl count, count, #2 #else ENTRY(memmove) #endif - cmp dstin, src - b.lo .Ldownwards - add tmp1, src, count - cmp dstin, tmp1 - b.hs memcpy /* No overlap. */ - - /* Upwards move with potential overlap. - * Need to move from the tail backwards. SRC and DST point one - * byte beyond the remaining data to move. */ - add dst, dstin, count - add src, src, count - cmp count, #64 - b.ge .Lmov_not_short_up - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -.Ltail63up: - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. 
*/ - ands tmp1, count, #0x30 - b.eq .Ltail15up - sub dst, dst, tmp1 - sub src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #32] - stp A_l, A_h, [dst, #32] -1: - ldp A_l, A_h, [src, #16] - stp A_l, A_h, [dst, #16] -2: - ldp A_l, A_h, [src] - stp A_l, A_h, [dst] -.Ltail15up: - /* Move up to 15 bytes of data. Does not assume additional data - * being moved. */ - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz count, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz count, #0, 1f - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] -1: - ret - -.Lmov_not_short_up: - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. */ - tbz tmp2, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le .Ltail63up + sub tmp1, dstin, src + cmp count, 96 + ccmp tmp1, count, 2, hi + b.hs memcpy + + cbz tmp1, 3f + add dstend, dstin, count + add srcend, src, count + + /* Align dstend to 16 byte alignment so that we don't cross cache line + boundaries on both loads and stores. There are at least 96 bytes + to copy, so copy 16 bytes unaligned and then align. The loop + copies 64 bytes per iteration and prefetches one iteration ahead. 
*/ + + and tmp1, dstend, 15 + ldp D_l, D_h, [srcend, -16] + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls 2f + nop +1: + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi 1b + + /* Write the last full set of 64 bytes. The remainder is at most 64 + bytes, so it is safe to always copy 64 bytes from the start even if + there is just 1 byte left. */ 2: - subs count, count, #128 - b.ge .Lmov_body_large_up - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src, #-64]! - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst, #-64]! - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - b.ne .Ltail63up - ret - - /* Critical loop. Start at a new Icache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -.Lmov_body_large_up: - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! 
- tst count, #0x3f - b.ne .Ltail63up - ret - - -.Ldownwards: - /* For a downwards move we can safely use memcpy provided that - * DST is more than 16 bytes away from SRC. */ - sub tmp1, src, #16 - cmp dstin, tmp1 - b.ls memcpy /* May overlap, but not critically. */ - - mov dst, dstin /* Preserve DSTIN for return value. */ - cmp count, #64 - b.ge .Lmov_not_short_down - - /* Deal with small moves quickly by dropping straight into the - * exit block. */ -.Ltail63down: - /* Move up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. */ - ands tmp1, count, #0x30 - b.eq .Ltail15down - add dst, dst, tmp1 - add src, src, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-48] - stp A_l, A_h, [dst, #-48] -1: - ldp A_l, A_h, [src, #-32] - stp A_l, A_h, [dst, #-32] -2: - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] -.Ltail15down: - /* Move up to 15 bytes of data. Does not assume additional data - being moved. */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz count, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz count, #0, 1f - ldrb tmp1w, [src] - strb tmp1w, [dst] -1: - ret - -.Lmov_not_short_down: - /* We don't much care about the alignment of DST, but we want SRC - * to be 128-bit (16 byte) aligned so that we don't cross cache line - * boundaries on both loads and stores. */ - neg tmp2, src - ands tmp2, tmp2, #15 /* Bytes to reach alignment. */ - b.eq 2f - sub count, count, tmp2 - /* Move enough data to reach alignment; unlike memcpy, we have to - * be aware of the overlap, which means we can't move data twice. 
*/ - tbz tmp2, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz tmp2, #2, 1f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -1: - tbz tmp2, #1, 1f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -1: - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le .Ltail63down -2: - subs count, count, #128 - b.ge .Lmov_body_large_down - /* Less than 128 bytes to move, so handle 64 here and then jump - * to the tail. */ - ldp A_l, A_h, [src] - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48] - stp A_l, A_h, [dst] - stp B_l, B_h, [dst, #16] - stp C_l, C_h, [dst, #32] - stp D_l, D_h, [dst, #48] - tst count, #0x3f - add src, src, #64 - add dst, dst, #64 - b.ne .Ltail63down - ret - - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. */ - .p2align 6 -.Lmov_body_large_down: - /* There are at least 128 bytes to move. */ - ldp A_l, A_h, [src, #0] - sub dst, dst, #16 /* Pre-bias. */ - ldp B_l, B_h, [src, #16] - ldp C_l, C_h, [src, #32] - ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */ -1: - stp A_l, A_h, [dst, #16] - ldp A_l, A_h, [src, #16] - stp B_l, B_h, [dst, #32] - ldp B_l, B_h, [src, #32] - stp C_l, C_h, [dst, #48] - ldp C_l, C_h, [src, #48] - stp D_l, D_h, [dst, #64]! - ldp D_l, D_h, [src, #64]! 
- subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #16] - stp B_l, B_h, [dst, #32] - stp C_l, C_h, [dst, #48] - stp D_l, D_h, [dst, #64] - add src, src, #16 - add dst, dst, #64 + 16 - tst count, #0x3f - b.ne .Ltail63down - ret -#ifdef BCOPY -END(bcopy) -#elif defined(WMEMMOVE) + ldp E_l, E_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp E_l, E_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] +3: ret + +#if defined(WMEMMOVE) END(wmemmove) #else END(memmove) diff --git a/libc/arch-arm64/generic/bionic/memset.S b/libc/arch-arm64/generic/bionic/memset.S index 7c204b4..4b3b17b 100644 --- a/libc/arch-arm64/generic/bionic/memset.S +++ b/libc/arch-arm64/generic/bionic/memset.S @@ -1,4 +1,4 @@ -/* Copyright (c) 2012, Linaro Limited +/* Copyright (c) 2012-2013, Linaro Limited All rights reserved. Redistribution and use in source and binary forms, with or without @@ -22,226 +22,207 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +/* + * Copyright (c) 2015 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ /* Assumptions: * - * ARMv8-a, AArch64 - * Unaligned accesses + * ARMv8-a, AArch64, unaligned accesses * */ #include <private/bionic_asm.h> -/* By default we assume that the DC instruction can be used to zero - data blocks more efficiently. In some circumstances this might be - unsafe, for example in an asymmetric multiprocessor environment with - different DC clear lengths (neither the upper nor lower lengths are - safe to use). - - If code may be run in a virtualized environment, then define - MAYBE_VIRT. This will cause the code to cache the system register - values rather than re-reading them each call. 
*/ - -#define dstin x0 -#ifdef BZERO -#define count x1 -#else -#define count x2 -#endif -#define val w1 -#define tmp1 x3 -#define tmp1w w3 -#define tmp2 x4 -#define tmp2w w4 -#define zva_len_x x5 -#define zva_len w5 -#define zva_bits_x x6 - -#define A_l x7 -#define A_lw w7 -#define dst x8 -#define tmp3w w9 - -#ifdef BZERO -ENTRY(bzero) -#else +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define tmp1 x5 +#define tmp1w w5 +#define tmp2 x6 +#define tmp2w w6 +#define zva_len x7 +#define zva_lenw w7 + +#define L(l) .L ## l + ENTRY(memset) -#endif - - mov dst, dstin /* Preserve return value. */ -#ifdef BZERO - b .Lzero_mem -#endif - ands A_lw, val, #255 - b.eq .Lzero_mem - orr A_lw, A_lw, A_lw, lsl #8 - orr A_lw, A_lw, A_lw, lsl #16 - orr A_l, A_l, A_l, lsl #32 -.Ltail_maybe_long: - cmp count, #64 - b.ge .Lnot_short -.Ltail_maybe_tiny: - cmp count, #15 - b.le .Ltail15tiny -.Ltail63: - ands tmp1, count, #0x30 - b.eq .Ltail15 - add dst, dst, tmp1 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - stp A_l, A_l, [dst, #-48] -1: - stp A_l, A_l, [dst, #-32] -2: - stp A_l, A_l, [dst, #-16] - -.Ltail15: - and count, count, #15 - add dst, dst, count - stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ - ret -.Ltail15tiny: - /* Set up to 15 bytes. Does not assume earlier memory - being set. */ - tbz count, #3, 1f - str A_l, [dst], #8 -1: - tbz count, #2, 1f - str A_lw, [dst], #4 -1: - tbz count, #1, 1f - strh A_lw, [dst], #2 -1: - tbz count, #0, 1f - strb A_lw, [dst] -1: + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. */ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + nop +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. 
*/ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] ret - /* Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line, this ensures the entire loop is in one line. */ - .p2align 6 -.Lnot_short: - neg tmp2, dst - ands tmp2, tmp2, #15 - b.eq 2f - /* Bring DST to 128-bit (16-byte) alignment. We know that there's - * more than that to set, so we simply store 16 bytes and advance by - * the amount required to reach alignment. */ - sub count, count, tmp2 - stp A_l, A_l, [dst] - add dst, dst, tmp2 - /* There may be less than 63 bytes to go now. */ - cmp count, #63 - b.le .Ltail63 -2: - sub dst, dst, #16 /* Pre-bias. */ - sub count, count, #64 -1: - stp A_l, A_l, [dst, #16] - stp A_l, A_l, [dst, #32] - stp A_l, A_l, [dst, #48] - stp A_l, A_l, [dst, #64]! - subs count, count, #64 - b.ge 1b - tst count, #0x3f - add dst, dst, #16 - b.ne .Ltail63 + .p2align 3 + nop +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 256 + ccmp valw, 0, 0, cs + b.eq L(try_zva) +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + add dst, dst, 16 + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] +L(tail64): + subs count, count, 64 + b.hi 1b +2: stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] ret - /* For zeroing memory, check to see if we can use the ZVA feature to - * zero entire 'cache' lines. 
*/ -.Lzero_mem: - mov A_l, #0 - cmp count, #63 - b.le .Ltail_maybe_tiny - neg tmp2, dst - ands tmp2, tmp2, #15 - b.eq 1f - sub count, count, tmp2 - stp A_l, A_l, [dst] - add dst, dst, tmp2 - cmp count, #63 - b.le .Ltail63 -1: - /* For zeroing small amounts of memory, it's not worth setting up - * the line-clear code. */ - cmp count, #128 - b.lt .Lnot_short -#ifdef MAYBE_VIRT - /* For efficiency when virtualized, we cache the ZVA capability. */ - adrp tmp2, .Lcache_clear - ldr zva_len, [tmp2, #:lo12:.Lcache_clear] - tbnz zva_len, #31, .Lnot_short - cbnz zva_len, .Lzero_by_line + .p2align 3 +L(try_zva): mrs tmp1, dczid_el0 - tbz tmp1, #4, 1f - /* ZVA not available. Remember this for next time. */ - mov zva_len, #~0 - str zva_len, [tmp2, #:lo12:.Lcache_clear] - b .Lnot_short -1: - mov tmp3w, #4 - and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ - lsl zva_len, tmp3w, zva_len - str zva_len, [tmp2, #:lo12:.Lcache_clear] -#else - mrs tmp1, dczid_el0 - tbnz tmp1, #4, .Lnot_short - mov tmp3w, #4 - and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ - lsl zva_len, tmp3w, zva_len -#endif - -.Lzero_by_line: - /* Compute how far we need to go to become suitably aligned. We're - * already at quad-word alignment. */ - cmp count, zva_len_x - b.lt .Lnot_short /* Not enough to reach alignment. */ - sub zva_bits_x, zva_len_x, #1 - neg tmp2, dst - ands tmp2, tmp2, zva_bits_x - b.eq 1f /* Already aligned. */ - /* Not aligned, check that there's enough to copy after alignment. */ - sub tmp1, count, tmp2 - cmp tmp1, #64 - ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ - b.lt .Lnot_short - /* We know that there's at least 64 bytes to zero and that it's safe - * to overrun by 64 bytes. */ - mov count, tmp1 -2: - stp A_l, A_l, [dst] - stp A_l, A_l, [dst, #16] - stp A_l, A_l, [dst, #32] - subs tmp2, tmp2, #64 - stp A_l, A_l, [dst, #48] - add dst, dst, #64 - b.ge 2b - /* We've overrun a bit, so adjust dst downwards. 
*/ - add dst, dst, tmp2 -1: - sub count, count, zva_len_x -3: - dc zva, dst - add dst, dst, zva_len_x - subs count, count, zva_len_x - b.ge 3b - ands count, count, zva_bits_x - b.ne .Ltail_maybe_long + tbnz tmp1w, 4, L(no_zva) + and tmp1w, tmp1w, 15 + cmp tmp1w, 4 /* ZVA size is 64 bytes. */ + b.ne L(zva_128) + + /* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. + */ +L(zva_64): + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 + nop +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] ret -#ifdef BZERO -END(bzero) -#else + + .p2align 3 +L(zva_128): + cmp tmp1w, 5 /* ZVA size is 128 bytes. */ + b.ne L(zva_other) + + str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+128 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 128 + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] + stp q0, q0, [dstend, -96] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(zva_other): + mov tmp2w, 4 + lsl zva_lenw, tmp2w, tmp1w + add tmp1, zva_len, 64 /* Max alignment bytes written. */ + cmp count, tmp1 + blo L(no_zva) + + sub tmp2, zva_len, 1 + add tmp1, dst, zva_len + add dst, dst, 16 + subs count, tmp1, dst /* Actual alignment bytes to write. */ + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ + beq 2f +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b +2: mov dst, tmp1 + sub count, dstend, tmp1 /* Remaining bytes to write. 
*/ + subs count, count, zva_len + b.lo 4f +3: dc zva, dst + add dst, dst, zva_len + subs count, count, zva_len + b.hs 3b +4: add count, count, zva_len + b L(tail64) + END(memset) -#endif - -#ifdef MAYBE_VIRT - .bss - .p2align 2 -.Lcache_clear: - .space 4 -#endif diff --git a/libc/arch-arm64/generic/bionic/strlen.S b/libc/arch-arm64/generic/bionic/strlen.S index 3bd9809..6e540fc 100644 --- a/libc/arch-arm64/generic/bionic/strlen.S +++ b/libc/arch-arm64/generic/bionic/strlen.S @@ -1,16 +1,16 @@ -/* Copyright (c) 2014, Linaro Limited +/* Copyright (c) 2013-2015, Linaro Limited All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. + notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. * Neither the name of the Linaro nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT @@ -22,16 +22,19 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ #include <private/bionic_asm.h> +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ + /* Arguments and results. */ #define srcin x0 #define len x0 @@ -40,87 +43,185 @@ #define src x1 #define data1 x2 #define data2 x3 -#define data2a x4 -#define has_nul1 x5 -#define has_nul2 x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define tmp4 x10 -#define zeroones x11 -#define pos x12 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define L(l) .L ## l + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. A faster check + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives + false hits for characters 129..255. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 - /* Start of critial section -- keep to one 64Byte cache line. */ +#ifdef TEST_PAGE_CROSS +# define MIN_PAGE_SIZE 15 +#else +# define MIN_PAGE_SIZE 4096 +#endif + + /* Since strings are short on average, we check the first 16 bytes + of the string for a NUL character. 
In order to do an unaligned ldp + safely we have to do a page cross check first. If there is a NUL + byte we calculate the length from the 2 8-byte words using + conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 16 bytes, we align src so don't need + further page cross checks, and process 32 bytes per iteration + using the fast NUL check. If we encounter non-ASCII characters, + fallback to a second loop using the full NUL check. + + If the page cross check fails, we read 16 bytes from an aligned + address, remove any characters before the string, and continue + in the main loop using aligned loads. Since strings crossing a + page in the first 16 bytes are rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. + + AArch64 systems have a minimum page size of 4k. We don't bother + checking for larger page sizes - the cost of setting up the correct + page size is just not worth the extra gain from a small reduction in + the cases taking the slow path. Note that we only care about + whether the first fetch, which may be misaligned, crosses a page + boundary. */ + ENTRY(strlen) - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. 
*/ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: + and tmp1, srcin, MIN_PAGE_SIZE - 1 + mov zeroones, REP8_01 + cmp tmp1, MIN_PAGE_SIZE - 16 + b.gt L(page_cross) + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. + Since we expect strings to be small and early-exit, + byte-swap the data now so has_null1/2 will be correct. */ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f + orr tmp2, data1, REP8_7f sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lloop - /* End of critical section -- keep to one 64Byte cache line. */ + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(main_loop_entry) - sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -#ifdef __AARCH64EB__ - mov data2, data1 -#endif - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: + /* Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + clz tmp1, has_nul1 + csel len, xzr, len, cc + add len, len, tmp1, lsr 3 + ret + + /* The inner loop processes 32 bytes per iteration and uses the fast + NUL check. If we encounter non-ASCII characters, use a second + loop with the accurate NUL check. */ + .p2align 4 +L(main_loop_entry): + bic src, srcin, 15 + sub src, src, 16 +L(main_loop): + ldp data1, data2, [src, 32]! +.Lpage_cross_entry: + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + bne 1f + ldp data1, data2, [src, 16] + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + beq L(main_loop) + add src, src, 16 +1: + /* The fast check failed, so do the slower, accurate NUL check. 
*/ + orr tmp2, data1, REP8_7f + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + + /* Enter with C = has_nul1 == 0. */ +L(tail): #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The + string is 0x01) means we cannot use has_nul1/2 directly. The easiest way to get the correct byte is to byte-swap the data and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 + csel data1, data1, data2, cc + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, cc #endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ + sub len, src, srcin + rev has_nul1, has_nul1 + add tmp2, len, 8 + clz tmp1, has_nul1 + csel len, len, tmp2, cc + add len, len, tmp1, lsr 3 ret -.Lmisaligned: - cmp tmp1, #8 - neg tmp1, tmp1 - ldp data1, data2, [src], #16 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - mov tmp2, #~0 +L(nonascii_loop): + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + bne L(tail) + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + b L(tail) + + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede + srcin to 0x7f, so we ignore any NUL bytes before the string. + Then continue in the aligned loop. 
*/ +L(page_cross): + bic src, srcin, 15 + ldp data1, data2, [src] + lsl tmp1, srcin, 3 + mov tmp4, -1 #ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + /* Big-endian. Early bytes are at MSB. */ + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ #else /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned + orr tmp1, tmp1, REP8_80 + orn data1, data1, tmp1 + orn tmp2, data2, tmp1 + tst srcin, 8 + csel data1, data1, tmp4, eq + csel data2, data2, tmp2, eq + b L(page_cross_entry) END(strlen) diff --git a/libc/arch-arm64/generic/bionic/strrchr.S b/libc/arch-arm64/generic/bionic/strrchr.S new file mode 100644 index 0000000..46b5031 --- /dev/null +++ b/libc/arch-arm64/generic/bionic/strrchr.S @@ -0,0 +1,171 @@ +/* + strrchr - find last instance of a character in a string + + Copyright (c) 2014, ARM Limited + All rights Reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the company nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include <private/bionic_asm.h> + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 +#define src_match x6 +#define src_offset x7 +#define const_m1 x8 +#define tmp4 x9 +#define nul_match x10 +#define chr_match x11 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). 
Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +/* Locals and temporaries. */ + +ENTRY(strrchr) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the requested byte. Magic constant 0x80200802 used + similarly for NUL termination. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + mov src_offset, #0 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq .Laligned + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vhas_nul1.2d[0] + lsl tmp1, tmp1, #1 + mov const_m1, #~0 + mov chr_match, vhas_chr1.2d[0] + lsr tmp3, const_m1, tmp1 + + bic nul_match, nul_match, tmp3 // Mask padding bits. + bic chr_match, chr_match, tmp3 // Mask padding bits. 
+ cbnz nul_match, .Ltail + +.Lloop: + cmp chr_match, #0 + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne +.Laligned: + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.2d[0] + mov chr_match, vhas_chr1.2d[0] + cbz nul_match, .Lloop + + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b + mov nul_match, vhas_nul1.2d[0] + +.Ltail: + /* Work out exactly where the string ends. */ + sub tmp4, nul_match, #1 + eor tmp4, tmp4, nul_match + ands chr_match, chr_match, tmp4 + /* And pick the values corresponding to the last match. */ + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne + + /* Count down from the top of the syndrome to find the last match. */ + clz tmp3, src_offset + /* Src_match points beyond the word containing the match, so we can + simply subtract half the bit-offset into the syndrome. Because + we are counting down, we need to go back one more character. */ + add tmp3, tmp3, #2 + sub result, src_match, tmp3, lsr #1 + /* But if the syndrome shows no match was found, then return NULL. 
*/ + cmp src_offset, #0 + csel result, result, xzr, ne + + ret + +END(strrchr) diff --git a/libc/arch-arm64/generic/generic.mk b/libc/arch-arm64/generic/generic.mk index 1b595aa..4512dc5 100644 --- a/libc/arch-arm64/generic/generic.mk +++ b/libc/arch-arm64/generic/generic.mk @@ -11,4 +11,5 @@ libc_bionic_src_files_arm64 += \ arch-arm64/generic/bionic/strlen.S \ arch-arm64/generic/bionic/strncmp.S \ arch-arm64/generic/bionic/strnlen.S \ + arch-arm64/generic/bionic/strrchr.S \ arch-arm64/generic/bionic/wmemmove.S diff --git a/libc/arch-arm64/kryo/bionic/memcpy.S b/libc/arch-arm64/kryo/bionic/memcpy.S new file mode 100644 index 0000000..87e1b3b --- /dev/null +++ b/libc/arch-arm64/kryo/bionic/memcpy.S @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+// Prototype: void *memcpy (void *dst, const void *src, size_t count).
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+ENTRY(__memcpy_chk)  // Checked entry: x0-x2 as for memcpy; x3 is presumably the destination buffer size (TODO confirm against the fortify caller).
+  cmp x2, x3
+  b.hi __memcpy_chk_fail  // count (x2) > x3 (unsigned): report the overflow instead of copying
+
+  // Tail-call memcpy; an explicit branch is used because memcpy is re-aligned (.align 6) below.
+  b memcpy
+END(__memcpy_chk)
+
+  .align 6
+ENTRY(memcpy)  // The copy routine itself is textually included from memcpy_base.S.
+  #include "memcpy_base.S"
+END(memcpy)
+
+ENTRY_PRIVATE(__memcpy_chk_fail)  // Calls __fortify_chk_fail(error_string, error_code); no code follows the call.
+  // Preserve for accurate backtrace.
+  stp x29, x30, [sp, -16]!
+  .cfi_def_cfa_offset 16
+  .cfi_rel_offset x29, 0
+  .cfi_rel_offset x30, 8
+
+  adrp x0, error_string
+  add x0, x0, :lo12:error_string
+  ldr w1, error_code  // 32-bit literal load: error_code is a single .word, so a 64-bit load would read 4 bytes past it
+  bl __fortify_chk_fail
+error_code:
+  .word BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+END(__memcpy_chk_fail)
+
+  .data
+  .align 2
+error_string:
+  .string "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm64/kryo/bionic/memcpy_base.S b/libc/arch-arm64/kryo/bionic/memcpy_base.S
new file mode 100644
index 0000000..0096bb7
--- /dev/null
+++ b/libc/arch-arm64/kryo/bionic/memcpy_base.S
@@ -0,0 +1,244 @@
+/* Copyright (c) 2015 The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of The Linux Foundation nor the names of its contributors may + * be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifdef PLDOFFS
+#undef PLDOFFS
+#endif
+#define PLDOFFS (16)
+
+#ifdef PLDTHRESH
+#undef PLDTHRESH
+#endif
+#define PLDTHRESH (PLDOFFS)
+
+#ifdef BBTHRESH
+#undef BBTHRESH
+#endif
+#define BBTHRESH (2048/128)
+
+#if (PLDOFFS < 1)
+#error Routine does not support offsets less than 1
+#endif
+#if (PLDTHRESH < PLDOFFS)
+#error PLD threshold must be greater than or equal to the PLD offset
+#endif
+
+#ifdef PLDSIZE
+#undef PLDSIZE
+#endif
+#define PLDSIZE (128)
+
+kryo_bb_memcpy:  // In: x0 = dst, x1 = src, x2 = byte count. Out: x0 = original dst. x0/x1 are post-incremented as bytes are copied.
+    mov x11, x0  // keep the original dst; it is moved back into x0 at kryo_bb_exit
+    cmp x2, #4
+    blo kryo_bb_lt4  // small sizes jump straight into the bit-tested tail at the matching step
+    cmp x2, #16
+    blo kryo_bb_lt16
+    cmp x2, #32
+    blo kryo_bb_16
+    cmp x2, #64
+    blo kryo_bb_copy_32_a
+    cmp x2, #128
+    blo kryo_bb_copy_64_a
+
+    // we have at least 127 bytes to achieve 128-byte alignment
+    neg x3, x1 // calculate count to get SOURCE aligned
+    ands x3, x3, #0x7F  // x3 = bytes needed to reach the next 128-byte boundary of src (0..127)
+    b.eq kryo_bb_source_aligned // already aligned
+    // alignment fixup, small to large (favorable alignment)
+    tbz x3, #0, 1f  // each set bit k of x3 copies one 2^k-byte chunk
+    ldrb w5, [x1], #1
+    strb w5, [x0], #1
+1:  tbz x3, #1, 2f
+    ldrh w6, [x1], #2
+    strh w6, [x0], #2
+2:  tbz x3, #2, 3f
+    ldr w8, [x1], #4
+    str w8, [x0], #4
+3:  tbz x3, #3, 4f
+    ldr x9, [x1], #8
+    str x9, [x0], #8
+4:  tbz x3, #4, 5f
+    ldr q7, [x1], #16
+    str q7, [x0], #16
+5:  tbz x3, #5, 55f
+    ldp q0, q1, [x1], #32
+    stp q0, q1, [x0], #32
+55: tbz x3, #6, 6f
+    ldp q0, q1, [x1], #32
+    ldp q2, q3, [x1], #32
+    stp q0, q1, [x0], #32
+    stp q2, q3, [x0], #32
+6:  subs x2, x2, x3 // fixup count after alignment
+    b.eq kryo_bb_exit
+    cmp x2, #128
+    blo kryo_bb_copy_64_a  // fewer than 128 bytes left after the fixup: go straight to the tail
+kryo_bb_source_aligned:
+    lsr x12, x2, #7  // x12 = number of whole 128-byte blocks
+    cmp x12, #PLDTHRESH
+    bls kryo_bb_copy_128_loop_nopld  // at most PLDTHRESH blocks: copy them without any prefetching
+
+    cmp x12, #BBTHRESH
+    bls kryo_bb_prime_pump
+
+    add x14, x0, #0x400  // x14 = ((dst + 0x400 - (src + PLDOFFS*PLDSIZE)) & 0x7FF) + PLDOFFS*PLDSIZE, built over the next five instructions
+    add x9, x1, #(PLDOFFS*PLDSIZE)
+    sub x14, x14, x9
+    lsl x14, x14, #(21+32)  // shift left then right by 53: keeps only the low 11 bits
+    lsr x14, x14, #(21+32)
+    add x14, x14, #(PLDOFFS*PLDSIZE)
+    cmp x12, x14, lsr #7
+    bls kryo_bb_prime_pump  // block count does not exceed x14/128: use the fixed-distance setup instead
+
+    mov x9, #(PLDOFFS)
+    lsr x13, x14, #7
+    subs x9, x13, x9  // x9 = x14/128 - PLDOFFS
+    bls kryo_bb_prime_pump
+
+    add x10, x1, x14  // x10 = look-ahead pointer, x14 bytes ahead of src
+    bic x10, x10, #0x7F // Round to multiple of PLDSIZE
+
+    sub x12, x12, x14, lsr #7  // hold back x14/128 blocks; kryo_bb_pop_before_nopld reloads that count for the final loop
+    cmp x9, x12
+    sub x13, x12, x9
+    csel x12, x13, x12, LS  // x9 <= x12: x12 -= x9
+    csel x9, x12, x9, HI  // x9 >  x12: x9 = x12 ...
+    csel x12, xzr, x12, HI  // ... and x12 = 0
+
+    prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE)]
+    prfm PLDL1STRM, [x1, #((PLDOFFS-1)*PLDSIZE+64)]
+kryo_bb_copy_128_loop_outer_doublepld:  // x9 iterations; each copies 128 bytes and prefetches at both src+PLDOFFS*PLDSIZE and x10
+    prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)]
+    prfm PLDL1STRM, [x1, #((PLDOFFS)*PLDSIZE)+64]
+    subs x9, x9, #1  // sets the flags consumed by the bne at the end of the loop body
+    ldp q0, q1, [x1], #32
+    ldp q2, q3, [x1], #32
+    ldp q4, q5, [x1], #32
+    ldp q6, q7, [x1], #32
+    prfm PLDL1KEEP, [x10]
+    prfm PLDL1KEEP, [x10, #64]
+    add x10, x10, #128
+    stp q0, q1, [x0], #32
+    stp q2, q3, [x0], #32
+    stp q4, q5, [x0], #32
+    stp q6, q7, [x0], #32
+    bne kryo_bb_copy_128_loop_outer_doublepld
+    cmp x12, #0
+    beq kryo_bb_pop_before_nopld
+    cmp x12, #(448*1024/128)
+    bls kryo_bb_copy_128_loop_outer  // at most 448 KiB of blocks remaining: prfm-based loop; otherwise fall into the load-based loop
+
+kryo_bb_copy_128_loop_ddr:  // x12 iterations of 128 bytes
+    subs x12, x12, #1
+    ldr x3, [x10], #128  // real load from the look-ahead pointer; the value is never read (x3 is overwritten before any use) -- presumably a stronger prefetch, TODO confirm
+    ldp q0, q1, [x1], #32
+    ldp q2, q3, [x1], #32
+    ldp q4, q5, [x1], #32
+    ldp q6, q7, [x1], #32
+    stp q0, q1, [x0], #32
+    stp q2, q3, [x0], #32
+    stp q4, q5, [x0], #32
+    stp q6, q7, [x0], #32
+    bne kryo_bb_copy_128_loop_ddr
+    b kryo_bb_pop_before_nopld
+
+kryo_bb_prime_pump:  // fixed look-ahead of PLDOFFS*PLDSIZE bytes
+    mov x14, #(PLDOFFS*PLDSIZE)
+    add x10, x1, #(PLDOFFS*PLDSIZE)
+    bic x10, x10, #0x7F
+    sub x12, x12, #PLDOFFS  // hold back PLDOFFS blocks for the final no-prefetch loop
+    prfm PLDL1KEEP, [x10, #(-1*PLDSIZE)]
+    prfm PLDL1KEEP, [x10, #(-1*PLDSIZE+64)]
+    cmp x12, #(448*1024/128)
+    bhi kryo_bb_copy_128_loop_ddr
+
+kryo_bb_copy_128_loop_outer:  // x12 iterations; each copies 128 bytes and prefetches 128 bytes at x10
+    subs x12, x12, #1
+    prfm PLDL1KEEP, [x10]
+    prfm PLDL1KEEP, [x10, #64]
+    ldp q0, q1, [x1], #32
+    ldp q2, q3, [x1], #32
+    ldp q4, q5, [x1], #32
+    ldp q6, q7, [x1], #32
+    add x10, x10, #128
+    stp q0, q1, [x0], #32
+    stp q2, q3, [x0], #32
+    stp q4, q5, [x0], #32
+    stp q6, q7, [x0], #32
+    bne kryo_bb_copy_128_loop_outer
+
+kryo_bb_pop_before_nopld:
+    lsr x12, x14, #7  // the x14/128 blocks that were held back above
+kryo_bb_copy_128_loop_nopld:  // x12 iterations of 128 bytes, no prefetch instructions
+    ldp q0, q1, [x1], #32
+    ldp q2, q3, [x1], #32
+    ldp q4, q5, [x1], #32
+    ldp q6, q7, [x1], #32
+    subs x12, x12, #1
+    stp q0, q1, [x0], #32
+    stp q2, q3, [x0], #32
+    stp q4, q5, [x0], #32
+    stp q6, q7, [x0], #32
+    bne kryo_bb_copy_128_loop_nopld
+    ands x2, x2, #0x7f  // bytes left over after the whole 128-byte blocks
+    beq kryo_bb_exit
+
+kryo_bb_copy_64_a:  // tail: each step tests one bit of x2 and copies that many bytes if it is set
+    tbz x2, #6, kryo_bb_copy_32_a
+    ldp q0, q1, [x1], #32
+    ldp q2, q3, [x1], #32
+    stp q0, q1, [x0], #32
+    stp q2, q3, [x0], #32
+kryo_bb_copy_32_a:
+    tbz x2, #5, kryo_bb_16
+    ldp q0, q1, [x1], #32
+    stp q0, q1, [x0], #32
+kryo_bb_16:
+    tbz x2, #4, kryo_bb_lt16
+    ldr q7, [x1], #16
+    str q7, [x0], #16
+    ands x2, x2, #0x0f  // early exit when nothing below 16 bytes remains
+    beq kryo_bb_exit
+kryo_bb_lt16:
+    tbz x2, #3, kryo_bb_lt8
+    ldr x3, [x1], #8
+    str x3, [x0], #8
+kryo_bb_lt8:
+    tbz x2, #2, kryo_bb_lt4
+    ldr w3, [x1], #4
+    str w3, [x0], #4
+kryo_bb_lt4:
+    tbz x2, #1, kryo_bb_lt2
+    ldrh w3, [x1], #2
+    strh w3, [x0], #2
+kryo_bb_lt2:
+    tbz x2, #0, kryo_bb_exit
+    ldrb w3, [x1], #1
+    strb w3, [x0], #1
+kryo_bb_exit:
+    mov x0, x11  // return the original dst saved at entry
+    ret
+
diff --git a/libc/arch-arm64/kryo/kryo.mk b/libc/arch-arm64/kryo/kryo.mk
new file mode 100644
index 0000000..1d901d0
--- /dev/null
+++ b/libc/arch-arm64/kryo/kryo.mk
@@ -0,0 +1,15 @@
+libc_bionic_src_files_arm64 += \
+    arch-arm64/generic/bionic/memchr.S \
+    arch-arm64/generic/bionic/memcmp.S \
+    arch-arm64/kryo/bionic/memcpy.S \
+    arch-arm64/generic/bionic/memmove.S \
+    arch-arm64/generic/bionic/memset.S \
+    arch-arm64/generic/bionic/stpcpy.S \
+    arch-arm64/generic/bionic/strchr.S \
+    arch-arm64/generic/bionic/strcmp.S \
+    arch-arm64/generic/bionic/strcpy.S \
+    arch-arm64/generic/bionic/strlen.S \
+    arch-arm64/generic/bionic/strncmp.S \
+    arch-arm64/generic/bionic/strnlen.S \
+    arch-arm64/generic/bionic/strrchr.S \
+    arch-arm64/generic/bionic/wmemmove.S
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
index 0dbffad..6a5afd6 100644
--- a/libc/arch-x86_64/string/sse2-memmove-slm.S
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -91,9 +91,6 @@ name: \
     .section .text.sse2,"ax",@progbits
 ENTRY (MEMMOVE)
     ENTRANCE
-#ifdef USE_AS_BCOPY
-    xchg %rsi, %rdi
-#endif
     mov %rdi, %rax
/* Check whether we should copy backward or forward. */
diff --git a/libc/bionic/malloc_debug_check.cpp b/libc/bionic/malloc_debug_check.cpp
index dee03fa..ad0e613 100644
--- a/libc/bionic/malloc_debug_check.cpp
+++ b/libc/bionic/malloc_debug_check.cpp
@@ -45,6 +45,7 @@
 #include <time.h>
 #include <unistd.h>
 #include <unwind.h>
+#include <signal.h>
 #include "debug_mapinfo.h"
 #include "debug_stacktrace.h"
@@ -55,6 +56,14 @@
 #include "private/libc_logging.h"
 #include "private/ScopedPthreadMutexLocker.h"
+static unsigned int malloc_sig_enabled = 0;  // non-zero when libc.debug.malloc == 40 and the handler is installed
+static unsigned int min_allocation_report_limit;  // bytes; smaller allocations are left out of the dump
+static unsigned int max_allocation_limit;  // bytes; total_count at or above this triggers one automatic dump
+static const char* process_name;  // getprogname() captured at initialization
+static size_t total_count = 0;  // running sum of hdr->size over nodes tagged ALLOCATION_TAG
+static bool isDumped = false;  // the automatic dump has already been raised
+static bool sigHandled = false;  // false: next signal takes the baseline snapshot; true: next signal dumps
+
 #define MAX_BACKTRACE_DEPTH 16
 #define ALLOCATION_TAG 0x1ee7d00d
 #define BACKLOG_TAG 0xbabecafe
@@ -63,6 +72,11 @@
 #define FRONT_GUARD_LEN (1<<5)
 #define REAR_GUARD 0xbb
 #define REAR_GUARD_LEN (1<<5)
+#define FRONT_GUARD_SS 0xab  // front-guard fill for nodes already snapshotted or reported
+#define DEBUG_SIGNAL SIGWINCH  // signal used to request a snapshot/dump
+
+static void malloc_sigaction(int signum, siginfo_t * sg, void * cxt);
+static struct sigaction default_sa;  // previous DEBUG_SIGNAL disposition, restored in malloc_debug_finalize
 static void log_message(const char* format, ...) {
     va_list args;
@@ -135,9 +149,14 @@ static inline void init_front_guard(hdr_t* hdr) {
     memset(hdr->front_guard, FRONT_GUARD, FRONT_GUARD_LEN);
 }
+static inline void set_snapshot(hdr_t* hdr) {  // Re-fills the front guard with FRONT_GUARD_SS to mark the node as seen.
+    memset(hdr->front_guard, FRONT_GUARD_SS, FRONT_GUARD_LEN);
+}
+
 static inline bool is_front_guard_valid(hdr_t* hdr) {
     for (size_t i = 0; i < FRONT_GUARD_LEN; i++) {
-        if (hdr->front_guard[i] != FRONT_GUARD) {
+        if (!((hdr->front_guard[i] == FRONT_GUARD) ||  // either fill value is accepted for each byte
+              (hdr->front_guard[i] == FRONT_GUARD_SS))) {
             return false;
         }
     }
@@ -171,6 +190,9 @@ static inline bool is_rear_guard_valid(hdr_t* hdr) {
 }
 static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+    if (hdr->tag == ALLOCATION_TAG) {  // only nodes tagged as live allocations count towards total_count
+        total_count += hdr->size;
+    }
     hdr->prev = NULL;
     hdr->next = *head;
     if (*head)
@@ -181,6 +203,9 @@ static inline void add_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
 }
 static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
+    if (hdr->tag == ALLOCATION_TAG) {  // mirror of the accounting in add_locked
+        total_count -= hdr->size;
+    }
     if (hdr->prev) {
         hdr->prev->next = hdr->next;
     } else {
@@ -194,6 +219,25 @@ static inline int del_locked(hdr_t* hdr, hdr_t** tail, hdr_t** head) {
     return 0;
 }
+static void snapshot_report_leaked_nodes() {  // Logs each unmarked node of at least min_allocation_report_limit bytes with its backtrace, then marks it.
+    log_message("%s: %s\n", __FILE__, __FUNCTION__);
+    hdr_t * iterator = head;
+    size_t total_size = 0;
+    while (iterator) {  // tolerate an empty list instead of dereferencing a NULL head
+        if (iterator->front_guard[0] == FRONT_GUARD &&
+            iterator->size >= min_allocation_report_limit) {
+            log_message("obj %p, size %d", iterator, iterator->size);
+            total_size += iterator->size;
+            log_backtrace(iterator->bt, iterator->bt_depth);
+            log_message("------------------------------"); // as an end marker
+            // Marking the node as we do not want to print it again.
+            set_snapshot(iterator);
+        }
+        iterator = iterator->next;
+    }
+    log_message("Total Pending allocations after last snapshot: %zu", total_size);  // total_size is a size_t
+}
+
 static inline void add(hdr_t* hdr, size_t size) {
     ScopedPthreadMutexLocker locker(&lock);
     hdr->tag = ALLOCATION_TAG;
@@ -202,6 +246,11 @@ static inline void add(hdr_t* hdr, size_t size) {
     init_rear_guard(hdr);
     ++g_allocated_block_count;
     add_locked(hdr, &tail, &head);
+    if ((total_count >= max_allocation_limit) && !isDumped && malloc_sig_enabled) {  // one-shot automatic dump
+        isDumped = true;
+        sigHandled = true; // Need to bypass the snapshot
+        kill(getpid(), DEBUG_SIGNAL);  // NOTE(review): raised while `lock` is still held by this scope
+    }
 }
 static inline int del(hdr_t* hdr) {
@@ -233,7 +282,8 @@ static bool was_used_after_free(hdr_t* hdr) {
 static inline int check_guards(hdr_t* hdr, int* safe) {
     *safe = 1;
     if (!is_front_guard_valid(hdr)) {
-        if (hdr->front_guard[0] == FRONT_GUARD) {
+        if ((hdr->front_guard[0] == FRONT_GUARD) ||  // first byte still a known fill value: treat as a corrupted guard
+            ((hdr->front_guard[0] == FRONT_GUARD_SS))) {
             log_message("+++ ALLOCATION %p SIZE %d HAS A CORRUPTED FRONT GUARD\n",
                         user(hdr), hdr->size);
         } else {
@@ -656,6 +706,42 @@ extern "C" bool malloc_debug_initialize(HashTable* hash_table, const MallocDebug
         __libc_format_log(ANDROID_LOG_INFO, "libc", "not gathering backtrace information\n");
     }
+    if (__system_property_get("libc.debug.malloc", env)) {
+        if(atoi(env) == 40) malloc_sig_enabled = 1;  // debug level 40 selects the signal-driven dump mode
+    }
+
+    if (malloc_sig_enabled) {
+        char debug_proc_size[PROP_VALUE_MAX];
+        if (__system_property_get("libc.debug.malloc.maxprocsize", debug_proc_size))
+            max_allocation_limit = atoi(debug_proc_size);
+        else
+            max_allocation_limit = 30 * 1024 * 1024; // In Bytes [Default is 30 MB]
+        if (__system_property_get("libc.debug.malloc.minalloclim", debug_proc_size))
+            min_allocation_report_limit = atoi(debug_proc_size);
+        else
+            min_allocation_report_limit = 10 * 1024; // In Bytes [Default is 10 KB]
+        process_name = getprogname();
+    }
+
+    /* Install malloc_sigaction for DEBUG_SIGNAL, keeping the previous
+     * disposition in default_sa so that malloc_debug_finalize can restore it.
+     */
+    if (malloc_sig_enabled) {
+        struct sigaction sa; //local or static?
+        sa.sa_handler = NULL;
+        sa.sa_sigaction = malloc_sigaction;
+        sigemptyset(&sa.sa_mask);
+        sigaddset(&sa.sa_mask, DEBUG_SIGNAL);
+        sa.sa_flags = SA_SIGINFO;
+        sa.sa_restorer = NULL;  // NOTE(review): sa_restorer is not a POSIX field; confirm every target ABI declares it
+        if (sigaction(DEBUG_SIGNAL, &sa, &default_sa) < 0) {
+            log_message("Failed to register signal handler w/ errno %s", strerror(errno));
+            malloc_sig_enabled = 0;  // fall back to running without the signal feature
+        } else {
+            log_message("Registered signal handler");
+            sigHandled = false;
+        }
+    }
     if (g_backtrace_enabled) {
         backtrace_startup();
     }
@@ -668,9 +754,66 @@ extern "C" void malloc_debug_finalize(int malloc_debug_level) {
     if (malloc_debug_level == 10) {
         ReportMemoryLeaks();
     }
+    if (malloc_sig_enabled) {
+        log_message("Deregister %d signal handler", DEBUG_SIGNAL);
+        sigaction(DEBUG_SIGNAL, &default_sa, NULL);  // put back the disposition saved at initialization
+        malloc_sig_enabled = 0;
+        sigHandled = false;
+    }
     if (g_backtrace_enabled) {
         backtrace_shutdown();
     }
     pthread_setspecific(g_debug_calls_disabled, NULL);
 }
+
+static void snapshot_nodes_locked() {  // Marks every unmarked node as baseline. NOTE(review): despite the name, no lock is taken here or in the handler that calls it.
+    log_message("%s: %s\n", __FILE__, __FUNCTION__);
+    hdr_t * iterator = head;
+    while (iterator) {  // tolerate an empty list instead of dereferencing a NULL head
+        if (iterator->front_guard[0] == FRONT_GUARD) {
+            set_snapshot(iterator);
+        }
+        iterator = iterator->next;
+    }
+}
+
+static void malloc_sigaction(int signum, siginfo_t * info, void * context)  // DEBUG_SIGNAL handler: first delivery snapshots, later deliveries dump.
+{
+    log_message("%s: %s\n", __FILE__, __FUNCTION__);  // NOTE(review): logging from a signal handler is not guaranteed async-signal-safe
+    log_message("%s got %d signal from PID: %d (context:%p)\n",
+                __func__, signum, info->si_pid, context);  // context is a pointer, so it is printed with %p
+
+    if (signum != DEBUG_SIGNAL) {
+        log_message("RECEIVED %d instead of %d\n", signum, DEBUG_SIGNAL);
+        return;
+    }
+
+    log_message("Process under observation:%s", process_name);
+    log_message("Maximum process size limit:%d Bytes", max_allocation_limit);
+    log_message("Won't print allocation below %d Bytes", min_allocation_report_limit);
+    log_message("Total count: %zu\n", total_count);  // total_count is a size_t
+
+    if (!head) {
+        log_message("No allocations?");
+        return;
+    }
+    // If sigHandled is false, meaning it's being handled first time
+    if (!sigHandled) {
+        sigHandled = true;
+        // Marking the nodes assuming that they should not be leaked nodes.
+        snapshot_nodes_locked();
+    } else {
+        // We need to print new allocations now
+        log_message("Start dumping allocations of the process %s", process_name);
+        log_message("+++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ ***\n");
+
+        // Print allocations of the process
+        if (g_backtrace_enabled)
+            snapshot_report_leaked_nodes();
+
+        log_message("*** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++ *** +++\n");
+        log_message("Completed dumping allocations of the process %s", process_name);
+    }
+    return;
+}
diff --git a/libc/bionic/malloc_debug_common.cpp b/libc/bionic/malloc_debug_common.cpp
index ee796c6..12fc6dd 100644
--- a/libc/bionic/malloc_debug_common.cpp
+++ b/libc/bionic/malloc_debug_common.cpp
@@ -396,6 +396,9 @@ static void malloc_init_impl() {
             }
             so_name = "libc_malloc_debug_qemu.so";
             break;
+        case 40:  // signal-driven dump mode is served by the leak-checking library
+            so_name = "libc_malloc_debug_leak.so";
+            break;
         default:
             error_log("%s: Debug level %d is unknown\n", getprogname(), g_malloc_debug_level);
             return;
@@ -456,6 +459,9 @@ static void malloc_init_impl() {
         case 20:
             InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "qemu_instrumented");
             break;
+        case 40:  // uses the "chk" dispatch prefix
+            InitMalloc(malloc_impl_handle, &malloc_dispatch_table, "chk");
+            break;
         default:
             break;
     }
diff --git a/libc/bionic/mmap.cpp b/libc/bionic/mmap.cpp
index 8f25a89..53e8b46 100644
--- a/libc/bionic/mmap.cpp
+++ b/libc/bionic/mmap.cpp
@@ -36,6 +36,11 @@ extern "C" void* __mmap2(void*, size_t, int, int, int, size_t);
 #define MMAP2_SHIFT 12 // 2**12 == 4096
+#ifdef LEGACY_MMAP
+#define TO_64(a) ((a) & 0x00000000ffffffff)
+#else
+#define TO_64(a) (a)
+#endif
 static bool kernel_has_MADV_MERGEABLE = true;
@@ -60,5 +65,5 @@ void* mmap64(void* addr, size_t size, int prot, int flags, int fd, off64_t offse
 }
 void* mmap(void* addr, size_t size, int prot, int flags, int fd, off_t
offset) { - return mmap64(addr, size, prot, flags, fd, static_cast<off64_t>(offset)); + return mmap64(addr, size, prot, flags, fd, TO_64(static_cast<off64_t>(offset))); } diff --git a/libc/include/paths.h b/libc/include/paths.h index 82c2804..7700cdd 100644 --- a/libc/include/paths.h +++ b/libc/include/paths.h @@ -33,6 +33,7 @@ #define _PATHS_H_ #define _PATH_BSHELL "/system/bin/sh" +#define _PATH_BSHELL2 "/sbin/sh" #define _PATH_CONSOLE "/dev/console" #define _PATH_DEFPATH "/sbin:/vendor/bin:/system/sbin:/system/bin:/system/xbin" #define _PATH_DEV "/dev/" diff --git a/libc/include/regex.h b/libc/include/regex.h index aec38e3..b06a515 100644 --- a/libc/include/regex.h +++ b/libc/include/regex.h @@ -42,8 +42,9 @@ #include <sys/cdefs.h> #include <sys/types.h> -/* types */ -typedef off_t regoff_t; +/* POSIX says regoff_t is at least as large as the larger of ptrdiff_t and + * ssize_t. BSD uses off_t, but that interacts badly with _FILE_OFFSET_BITS. */ +typedef ssize_t regoff_t; typedef struct { int re_magic; diff --git a/libc/kernel/uapi/linux/android_alarm.h b/libc/kernel/uapi/linux/android_alarm.h index 801a01e..9f2de28 100644 --- a/libc/kernel/uapi/linux/android_alarm.h +++ b/libc/kernel/uapi/linux/android_alarm.h @@ -28,28 +28,31 @@ enum android_alarm_type { /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ ANDROID_ALARM_ELAPSED_REALTIME, ANDROID_ALARM_SYSTEMTIME, + ANDROID_ALARM_RTC_POWEROFF_WAKEUP, ANDROID_ALARM_TYPE_COUNT, -}; /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ +}; enum android_alarm_return_flags { ANDROID_ALARM_RTC_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_WAKEUP, ANDROID_ALARM_RTC_MASK = 1U << ANDROID_ALARM_RTC, - ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP, /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ + ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP, 
ANDROID_ALARM_ELAPSED_REALTIME_MASK = 1U << ANDROID_ALARM_ELAPSED_REALTIME, ANDROID_ALARM_SYSTEMTIME_MASK = 1U << ANDROID_ALARM_SYSTEMTIME, + ANDROID_ALARM_RTC_POWEROFF_WAKEUP_MASK = 1U << ANDROID_ALARM_RTC_POWEROFF_WAKEUP, +/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ ANDROID_ALARM_TIME_CHANGE_MASK = 1U << 16 }; -/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define ANDROID_ALARM_CLEAR(type) _IO('a', 0 | ((type) << 4)) #define ANDROID_ALARM_WAIT _IO('a', 1) +/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define ALARM_IOW(c,type,size) _IOW('a', (c) | ((type) << 4), size) #define ANDROID_ALARM_SET(type) ALARM_IOW(2, type, struct timespec) -/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define ANDROID_ALARM_SET_AND_WAIT(type) ALARM_IOW(3, type, struct timespec) #define ANDROID_ALARM_GET_TIME(type) ALARM_IOW(4, type, struct timespec) +/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define ANDROID_ALARM_SET_RTC _IOW('a', 5, struct timespec) #define ANDROID_ALARM_BASE_CMD(cmd) (cmd & ~(_IOC(0, 0, 0xf0, 0))) -/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define ANDROID_ALARM_IOCTL_TO_TYPE(cmd) (_IOC_NR(cmd) >> 4) #endif +/* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ diff --git a/libc/kernel/uapi/linux/time.h b/libc/kernel/uapi/linux/time.h index bf245fc..5690d27 100644 --- a/libc/kernel/uapi/linux/time.h +++ b/libc/kernel/uapi/linux/time.h @@ -67,9 +67,10 @@ struct itimerval { #define CLOCK_SGI_CYCLE 10 /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ #define CLOCK_TAI 11 +#define CLOCK_POWEROFF_ALARM 12 #define MAX_CLOCKS 16 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) -#define CLOCKS_MONO CLOCK_MONOTONIC /* WARNING: DO NOT EDIT, AUTO-GENERATED CODE - SEE TOP FOR INSTRUCTIONS */ +#define CLOCKS_MONO CLOCK_MONOTONIC #define 
TIMER_ABSTIME 0x01 #endif diff --git a/libc/upstream-netbsd/lib/libc/gen/popen.c b/libc/upstream-netbsd/lib/libc/gen/popen.c index 593e346..b6ce47c 100644 --- a/libc/upstream-netbsd/lib/libc/gen/popen.c +++ b/libc/upstream-netbsd/lib/libc/gen/popen.c @@ -152,6 +152,8 @@ popen(const char *command, const char *type) } execl(_PATH_BSHELL, "sh", "-c", command, NULL); + if (errno == ENOENT) + execl(_PATH_BSHELL2, "sh", "-c", command, NULL); _exit(127); /* NOTREACHED */ } |