author     Christopher Ferris <cferris@google.com>       2013-03-15 16:01:17 -0700
committer  Gerrit Code Review <gerrit@cyanogenmod.org>   2013-08-13 04:24:11 +0000
commit     392d2cd535185b4013ae7279587aaae3972737d1
tree       83b70ad533b00a15c46249e00e454500a6bc8f3a
parent     8f157dedc846f9f143540d88075222767e3d174a
libc: Update to latest cortexa15 memcpy code.
This uses the new code originally submitted as memcpy.a15.S as the
base. However, the old code handled unaligned src/dst better, so that
part was spliced in. I optimized the original unaligned code by
removing a few unnecessary instructions, optimized the a15 code by
rewriting the pre- and post-loop code, and modified the main loop to add
a pld so that larger copies would not stall waiting for memory.
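
For readers who want the gist of that loop structure without reading the
assembly, here is a rough C sketch of the same idea (illustration only, not
code from this patch; the helper name copy64_with_prefetch and the
PREFETCH_DISTANCE value are invented for the example, whereas the real
prefetch distance in the assembly is expressed in cache lines):

    #include <stddef.h>
    #include <stdint.h>

    #define PREFETCH_DISTANCE 256   /* hypothetical tuning value, not from this commit */

    static void copy64_with_prefetch(uint8_t *dst, const uint8_t *src, size_t n) {
        while (n >= 64) {
            /* Ask for data well ahead of the current read position so the
             * next iterations do not stall waiting for memory. */
            __builtin_prefetch(src + PREFETCH_DISTANCE, 0 /* read */, 0);
            for (int i = 0; i < 64; i++)    /* stands in for the NEON/LDRD block */
                dst[i] = src[i];
            dst += 64;
            src += 64;
            n -= 64;
        }
        while (n--)                         /* tail handled a byte at a time here */
            *dst++ = *src++;
    }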
Test cases for the new memcpy:
- Copy all sizes from 0 to 1024 bytes, using whatever alignment
is returned by malloc.
For each alignment case described below, the test copied from 0 to 128
bytes.
- Src and dst pointers are both aligned to the same value, starting
at one and going through every power of two up to and including 128.
- Src aligned to double word boundary, dst aligned to word boundary.
- Src aligned to word boundary, dst aligned to double word boundary.
- Src aligned to 16 bit boundary, dst aligned to word boundary.
- Src aligned to word boundary, dst aligned to 16 byte boundary.
- Src aligned to word boundary, dst aligned to 1 byte from a word
boundary.
- Src aligned to word boundary, dst aligned to 2 bytes from a word
boundary.
- Src aligned to word boundary, dst aligned to 3 bytes from a word
boundary.
- Src aligned to 1 byte from a word boundary, dst aligned to a word
boundary.
- Src aligned to 2 bytes from a word boundary, dst aligned to a word
boundary.
- Src aligned to 3 bytes from a word boundary, dst aligned to a word
boundary.
Cases to verify the unaligned source code properly aligns to a 16 bit
boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to
4 + 128 bit boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to
8 + 128 bit boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to
12 + 128 bit boundary.
- Src aligned to 1 byte from a 128 bit boundary, dst aligned to
16 + 128 bit boundary.
In all cases, a two byte fencepost was placed at the end of the
destination to verify that only the requested number of bytes was copied
(a sketch of this style of check appears below).
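
A sketch of that fencepost-style check, for illustration only (this is not
the actual test harness; the helper name check_memcpy_fencepost and the
marker bytes are invented here, and the alignment handling is simplified to
an offset from the malloc pointer):

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    static void check_memcpy_fencepost(size_t n, size_t offset) {
        unsigned char *src_buf = malloc(n + offset + 2);
        unsigned char *dst_buf = malloc(n + offset + 2);
        unsigned char *src = src_buf + offset;
        unsigned char *dst = dst_buf + offset;

        for (size_t i = 0; i < n; i++)
            src[i] = (unsigned char)i;
        dst[n] = 0xde;                      /* two byte fencepost just past the copy */
        dst[n + 1] = 0xad;

        memcpy(dst, src, n);

        assert(memcmp(dst, src, n) == 0);              /* the requested bytes arrived */
        assert(dst[n] == 0xde && dst[n + 1] == 0xad);  /* nothing past n was written  */

        free(src_buf);
        free(dst_buf);
    }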
Bug: 8005082
Merge from internal master.
(cherry-picked from commit 21ede92d794969f22cacbdb9f557818f1c5712b5)
Change-Id: Ief70c9e6dc8c6473ae245b6570b2c266fed9618c
Add missing branch in memcpy.S dst aligned case.
Merge from internal master.
(cherry-picked from commit 6ffaa931c362602a2b606a610c92326a425a876e)
Change-Id: Ifdcf01fd122866cf0d4c5b5f7a997803561d7889
Rewrite memset for cortexa15 to use strd.
Merge from internal master.
(cherry-picked from commit 7ffad9c120054eedebd5f56f8bed01144e93eafa)
Change-Id: Ia67f2a545399f4fa37b63d5634a3565e4f5482f9
-rw-r--r--   libc/arch-arm/cortex-a15/bionic/memcpy.S | 312
-rw-r--r--   libc/arch-arm/cortex-a15/bionic/memset.S | 160
2 files changed, 354 insertions, 118 deletions
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S
index 16187b5..d297064 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -25,80 +25,109 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
-/* Assumes neon instructions and a cache line size of 64 bytes. */
+        /* Prototype: void *memcpy (void *dst, const void *src, size_t count).  */
+
+        // This version is tuned for the Cortex-A15 processor.
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
 
-/*
- * This code assumes it is running on a processor that supports all arm v7
- * instructions, that supports neon instructions, and that has a 64 byte
- * cache line.
- */
-
         .text
+        .syntax unified
         .fpu    neon
 
-#define CACHE_LINE_SIZE 64
+#define CACHE_LINE_SIZE  64
 
 ENTRY(memcpy)
-        .save       {r0, lr}
-        /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        stmfd       sp!, {r0, lr}
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+        // Assumes that n >= 0, and dst, src are valid pointers.
+        // For any sizes less than 832 use the neon code that doesn't
+        // care about the src alignment. This avoids any checks
+        // for src alignment, and offers the best improvement since
+        // smaller sized copies are dominated by the overhead of
+        // the pre and post main loop.
+        // For larger copies, if src and dst cannot both be aligned to
+        // word boundaries, use the neon code.
+        // For all other copies, align dst to a double word boundary
+        // and copy using LDRD/STRD instructions.
+
+        // Save registers (r0 holds the return value):
+        // optimized push {r0, lr}.
+        .save       {r0, lr}
+        pld         [r1, #(CACHE_LINE_SIZE*16)]
+        push        {r0, lr}
 
-        /* do we have at least 16-bytes to copy (needed for alignment below) */
-        cmp         r2, #16
-        blo         5f
+        cmp         r2, #16
+        blo         copy_less_than_16_unknown_align
 
-        /* align destination to cache-line for the write-buffer */
+        cmp         r2, #832
+        bge         check_alignment
+
+copy_unknown_alignment:
+        // Unknown alignment of src and dst.
+        // Assumes that the first few bytes have already been prefetched.
+
+        // Align destination to 128 bits. The mainloop store instructions
+        // require this alignment or they will throw an exception.
         rsb         r3, r0, #0
         ands        r3, r3, #0xF
-        beq         0f
+        beq         2f
 
-        /* copy up to 15-bytes (count in r3) */
+        // Copy up to 15 bytes (count in r3).
         sub         r2, r2, r3
         movs        ip, r3, lsl #31
-        ldrmib      lr, [r1], #1
-        strmib      lr, [r0], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
+
+        itt         mi
+        ldrbmi      lr, [r1], #1
+        strbmi      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1], #1
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0], #1
+
         movs        ip, r3, lsl #29
         bge         1f
-        // copies 4 bytes, destination 32-bits aligned
+        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
 1:      bcc         2f
-        // copies 8 bytes, destination 64-bits aligned
+        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
-2:
-
-0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*1)]
-        /* make sure we have at least 64 bytes to copy */
+2:      // Make sure we have at least 64 bytes to copy.
         subs        r2, r2, #64
         blo         2f
 
-        /* Preload all the cache lines we need.
-         * NOTE: The number of pld below depends on CACHE_LINE_SIZE,
-         * ideally we would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         * NOTE: The value CACHE_LINE_SIZE * 4 was chosen through
-         * experimentation.
-         */
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-        pld         [r1, #(CACHE_LINE_SIZE*3)]
-        pld         [r1, #(CACHE_LINE_SIZE*4)]
-
-1:      /* The main loop copies 64 bytes at a time */
+1:      // The main loop copies 64 bytes at a time.
         vld1.8      {d0 - d3}, [r1]!
         vld1.8      {d4 - d7}, [r1]!
         pld         [r1, #(CACHE_LINE_SIZE*4)]
@@ -107,25 +136,24 @@ ENTRY(memcpy)
         vst1.8      {d4 - d7}, [r0, :128]!
         bhs         1b
 
-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add         r2, r2, #64
-        subs        r2, r2, #32
-        blo         4f
+2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds        r2, r2, #32
+        blo         3f
 
-3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        // 32 bytes. These cache lines were already preloaded.
         vld1.8      {d0 - d3}, [r1]!
-        subs        r2, r2, #32
+        sub         r2, r2, #32
         vst1.8      {d0 - d3}, [r0, :128]!
-        bhs         3b
 
-4:      /* less than 32 left */
+3:      // Less than 32 left.
         add         r2, r2, #32
         tst         r2, #0x10
-        beq         5f
-        // copies 16 bytes, 128-bits aligned
+        beq         copy_less_than_16_unknown_align
+        // Copies 16 bytes, destination 128 bits aligned.
         vld1.8      {d0, d1}, [r1]!
         vst1.8      {d0, d1}, [r0, :128]!
 
-5:      /* copy up to 15-bytes (count in r2) */
+copy_less_than_16_unknown_align:
+        // Copy up to 15 bytes (count in r2).
         movs        ip, r2, lsl #29
         bcc         1f
         vld1.8      {d0}, [r1]!
@@ -133,14 +161,164 @@ ENTRY(memcpy)
 1:      bge         2f
         vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
         vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
-2:      movs        ip, r2, lsl #31
-        ldrmib      r3, [r1], #1
-        ldrcsb      ip, [r1], #1
-        ldrcsb      lr, [r1], #1
-        strmib      r3, [r0], #1
-        strcsb      ip, [r0], #1
-        strcsb      lr, [r0], #1
-
-        ldmfd       sp!, {r0, lr}
-        bx          lr
+
+2:      // Copy 0 to 4 bytes.
+        lsls        r2, r2, #31
+        itt         ne
+        ldrbne      lr, [r1], #1
+        strbne      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1]
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0]
+
+        pop         {r0, pc}
+
+check_alignment:
+        // If src and dst cannot both be aligned to a word boundary,
+        // use the unaligned copy version.
+        eor         r3, r0, r1
+        ands        r3, r3, #0x3
+        bne         copy_unknown_alignment
+
+        // To try and improve performance, stack layout changed,
+        // i.e., not keeping the stack looking like users expect
+        // (highest numbered register at highest address).
+        // TODO: Add debug frame directives.
+        // We don't need exception unwind directives, because the code below
+        // does not throw any exceptions and does not call any other functions.
+        // Generally, newlib functions like this lack debug information for
+        // assembler source.
+        .save       {r4, r5}
+        strd        r4, r5, [sp, #-8]!
+        .save       {r6, r7}
+        strd        r6, r7, [sp, #-8]!
+        .save       {r8, r9}
+        strd        r8, r9, [sp, #-8]!
+
+        // Optimized for already aligned dst code.
+        ands        ip, r0, #3
+        bne         dst_not_word_aligned
+
+word_aligned:
+        // Align the destination buffer to 8 bytes, to make sure double
+        // loads and stores don't cross a cache line boundary,
+        // as they are then more expensive even if the data is in the cache
+        // (require two load/store issue cycles instead of one).
+        // If only one of the buffers is not 8 bytes aligned,
+        // then it's more important to align dst than src,
+        // because there is more penalty for stores
+        // than loads that cross a cacheline boundary.
+        // This check and realignment are only done if there is >= 832
+        // bytes to copy.
+
+        // Dst is word aligned, but check if it is already double word aligned.
+        ands        r3, r0, #4
+        beq         1f
+        ldr         r3, [r1], #4
+        str         r3, [r0], #4
+        sub         r2, #4
+
+1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
+        sub         r2, #64
+
+2:      // Every loop iteration copies 64 bytes.
+        .irp        offset, #0, #8, #16, #24, #32
+        ldrd        r4, r5, [r1, \offset]
+        strd        r4, r5, [r0, \offset]
+        .endr
+
+        ldrd        r4, r5, [r1, #40]
+        ldrd        r6, r7, [r1, #48]
+        ldrd        r8, r9, [r1, #56]
+
+        // Keep the pld as far from the next load as possible.
+        // The amount to prefetch was determined experimentally using
+        // large sizes, and verifying the prefetch size does not affect
+        // the smaller copies too much.
+        // WARNING: If the ldrd and strd instructions get too far away
+        //          from each other, performance suffers. Three loads
+        //          in a row is the best tradeoff.
+        pld         [r1, #(CACHE_LINE_SIZE*16)]
+        strd        r4, r5, [r0, #40]
+        strd        r6, r7, [r0, #48]
+        strd        r8, r9, [r0, #56]
+
+        add         r0, r0, #64
+        add         r1, r1, #64
+        subs        r2, r2, #64
+        bge         2b
+
+        // Fix-up the remaining count and make sure we have >= 32 bytes left.
+        adds        r2, r2, #32
+        blo         4f
+
+        // Copy 32 bytes. These cache lines were already preloaded.
+        .irp        offset, #0, #8, #16, #24
+        ldrd        r4, r5, [r1, \offset]
+        strd        r4, r5, [r0, \offset]
+        .endr
+        add         r1, r1, #32
+        add         r0, r0, #32
+        sub         r2, r2, #32
+4:      // Less than 32 left.
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         5f
+        // Copy 16 bytes.
+        .irp        offset, #0, #8
+        ldrd        r4, r5, [r1, \offset]
+        strd        r4, r5, [r0, \offset]
+        .endr
+        add         r1, r1, #16
+        add         r0, r0, #16
+
+5:      // Copy up to 15 bytes (count in r2).
+        movs        ip, r2, lsl #29
+        bcc         1f
+        // Copy 8 bytes.
+        ldrd        r4, r5, [r1], #8
+        strd        r4, r5, [r0], #8
+1:      bge         2f
+        // Copy 4 bytes.
+        ldr         r4, [r1], #4
+        str         r4, [r0], #4
+2:      // Copy 0 to 4 bytes.
+        lsls        r2, r2, #31
+        itt         ne
+        ldrbne      lr, [r1], #1
+        strbne      lr, [r0], #1
+        itttt       cs
+        ldrbcs      ip, [r1], #1
+        ldrbcs      lr, [r1]
+        strbcs      ip, [r0], #1
+        strbcs      lr, [r0]
+
+        // Restore registers: optimized pop {r0, pc}
+        ldrd        r8, r9, [sp], #8
+        ldrd        r6, r7, [sp], #8
+        ldrd        r4, r5, [sp], #8
+        pop         {r0, pc}
+
+dst_not_word_aligned:
+        // Align dst to word.
+        rsb         ip, ip, #4
+        cmp         ip, #2
+
+        itt         gt
+        ldrbgt      lr, [r1], #1
+        strbgt      lr, [r0], #1
+
+        itt         ge
+        ldrbge      lr, [r1], #1
+        strbge      lr, [r0], #1
+
+        ldrb        lr, [r1], #1
+        strb        lr, [r0], #1
+
+        sub         r2, r2, ip
+
+        // Src is guaranteed to be at least word aligned by this point.
+        b           word_aligned
 END(memcpy)
diff --git a/libc/arch-arm/cortex-a15/bionic/memset.S b/libc/arch-arm/cortex-a15/bionic/memset.S
index 7bb3297..2e1ad54 100644
--- a/libc/arch-arm/cortex-a15/bionic/memset.S
+++ b/libc/arch-arm/cortex-a15/bionic/memset.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2013 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,11 +35,12 @@
  * memset() returns its first argument.
  */
 
-        .fpu    neon
+        .fpu        neon
+        .syntax     unified
 
 ENTRY(bzero)
-        mov     r2, r1
-        mov     r1, #0
+        mov         r2, r1
+        mov         r1, #0
 
         // Fall through to memset...
 END(bzero)
@@ -47,60 +48,117 @@ ENTRY(memset)
         .save       {r0}
         stmfd       sp!, {r0}
 
-        vdup.8      q0, r1
-
-        /* do we have at least 16-bytes to write (needed for alignment below) */
+        // The new algorithm is slower for copies < 16 so use the old
+        // neon code in that case.
         cmp         r2, #16
-        blo         3f
-
-        /* align destination to 16 bytes for the write-buffer */
-        rsb         r3, r0, #0
-        ands        r3, r3, #0xF
-        beq         2f
-
-        /* write up to 15-bytes (count in r3) */
-        sub         r2, r2, r3
-        movs        ip, r3, lsl #31
-        strmib      r1, [r0], #1
-        strcsb      r1, [r0], #1
-        strcsb      r1, [r0], #1
-        movs        ip, r3, lsl #29
-        bge         1f
-
-        // writes 4 bytes, 32-bits aligned
-        vst1.32     {d0[0]}, [r0, :32]!
-1:      bcc         2f
-
-        // writes 8 bytes, 64-bits aligned
-        vst1.8      {d0}, [r0, :64]!
-2:
-        /* make sure we have at least 32 bytes to write */
-        subs        r2, r2, #32
-        blo         2f
-        vmov        q1, q0
-
-1:      /* The main loop writes 32 bytes at a time */
-        subs        r2, r2, #32
-        vst1.8      {d0 - d3}, [r0, :128]!
-        bhs         1b
-
-2:      /* less than 32 left */
-        add         r2, r2, #32
-        tst         r2, #0x10
-        beq         3f
-
-        // writes 16 bytes, 128-bits aligned
-        vst1.8      {d0, d1}, [r0, :128]!
-3:      /* write up to 15-bytes (count in r2) */
+        blo         set_less_than_16_unknown_align
+
+        // Use strd which requires an even and odd register so move the
+        // values so that:
+        //   r0 and r1 contain the memset value
+        //   r2 is the number of bytes to set
+        //   r3 is the destination pointer
+        mov         r3, r0
+
+        // Copy the byte value in every byte of r1.
+        mov         r1, r1, lsl #24
+        orr         r1, r1, r1, lsr #8
+        orr         r1, r1, r1, lsr #16
+
+check_alignment:
+        // Align destination to a double word to avoid the strd crossing
+        // a cache line boundary.
+        ands        ip, r3, #7
+        bne         do_double_word_align
+
+double_word_aligned:
+        mov         r0, r1
+
+        subs        r2, #64
+        blo         set_less_than_64
+
+1:      // Main loop sets 64 bytes at a time.
+        .irp        offset, #0, #8, #16, #24, #32, #40, #48, #56
+        strd        r0, r1, [r3, \offset]
+        .endr
+
+        add         r3, #64
+        subs        r2, #64
+        bge         1b
+
+set_less_than_64:
+        // Restore r2 to the count of bytes left to set.
+        add         r2, #64
+        lsls        ip, r2, #27
+        bcc         set_less_than_32
+        // Set 32 bytes.
+        .irp        offset, #0, #8, #16, #24
+        strd        r0, r1, [r3, \offset]
+        .endr
+        add         r3, #32
+
+set_less_than_32:
+        bpl         set_less_than_16
+        // Set 16 bytes.
+        .irp        offset, #0, #8
+        strd        r0, r1, [r3, \offset]
+        .endr
+        add         r3, #16
+
+set_less_than_16:
+        // Less than 16 bytes to set.
+        lsls        ip, r2, #29
+        bcc         set_less_than_8
+
+        // Set 8 bytes.
+        strd        r0, r1, [r3], #8
+
+set_less_than_8:
+        bpl         set_less_than_4
+        // Set 4 bytes
+        str         r1, [r3], #4
+
+set_less_than_4:
+        lsls        ip, r2, #31
+        it          ne
+        strbne      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3]
+
+        ldmfd       sp!, {r0}
+        bx          lr
+
+do_double_word_align:
+        rsb         ip, ip, #8
+        sub         r2, r2, ip
+        movs        r0, ip, lsl #31
+        it          mi
+        strbmi      r1, [r3], #1
+        itt         cs
+        strbcs      r1, [r3], #1
+        strbcs      r1, [r3], #1
+
+        // Dst is at least word aligned by this point.
+        cmp         ip, #4
+        blo         double_word_aligned
+        str         r1, [r3], #4
+        b           double_word_aligned
+
+set_less_than_16_unknown_align:
+        // Set up to 15 bytes.
+        vdup.8      d0, r1
         movs        ip, r2, lsl #29
         bcc         1f
         vst1.8      {d0}, [r0]!
1:      bge         2f
         vst1.32     {d0[0]}, [r0]!
 2:      movs        ip, r2, lsl #31
-        strmib      r1, [r0], #1
-        strcsb      r1, [r0], #1
-        strcsb      r1, [r0], #1
+        it          mi
+        strbmi      r1, [r0], #1
+        itt         cs
+        strbcs      r1, [r0], #1
+        strbcs      r1, [r0], #1
         ldmfd       sp!, {r0}
         bx          lr
 END(memset)
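
One detail of the new strd-based memset worth calling out: before entering the
store loop it splats the fill byte into every byte of a 32-bit register with
one shift and two ORs (the mov/orr sequence under "Copy the byte value in
every byte of r1" above). A C rendering of just that step, for illustration
only (the function name splat_byte is invented here):

    #include <stdint.h>

    static uint32_t splat_byte(uint8_t c) {
        uint32_t v = (uint32_t)c << 24;     /* mov r1, r1, lsl #24      */
        v |= v >> 8;                        /* orr r1, r1, r1, lsr #8   */
        v |= v >> 16;                       /* orr r1, r1, r1, lsr #16  */
        return v;                           /* e.g. 0xab -> 0xabababab  */
    }

The assembly then mirrors this word into a second register so that each strd
writes eight identical bytes per instruction.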