author | Mathias Agopian <mathias@google.com> | 2009-10-28 02:54:37 -0700
---|---|---
committer | Mathias Agopian <mathias@google.com> | 2009-10-28 03:17:02 -0700
commit | 199f9d923804d74e021dd80e48ec75c0a96dba77 (patch) |
tree | 3a00bb9e267cf952d7d1140ff9a39ca07ee6c994 |
parent | 763ac28357f604e0e4196e0a7ad5b0f5cdcf274a (diff) |
Improve memcpy performance from 290 MiB/s to 340 MiB/s (17% improvement)
Use 64-byte cache lines, reduce the main loop to 64 bytes instead of
128 bytes, and adjust the prefetch distance to the optimal value.
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 41
1 file changed, 16 insertions, 25 deletions
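The commit message quotes throughput in MiB/s but does not say how those figures were obtained. As a point of reference only, a minimal harness along the following lines measures bulk `memcpy` throughput; the buffer size, iteration count, and use of `CLOCK_MONOTONIC` are assumptions of this sketch, not details from the commit.

```c
/* Hypothetical benchmark sketch -- not the harness used for this commit. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE   (1024 * 1024)   /* 1 MiB per copy (assumed) */
#define ITERATIONS 512

int main(void) {
    char *src = malloc(BUF_SIZE);
    char *dst = malloc(BUF_SIZE);
    if (src == NULL || dst == NULL) return 1;

    /* Touch both buffers so page faults don't skew the timed region. */
    memset(src, 0xa5, BUF_SIZE);
    memset(dst, 0x5a, BUF_SIZE);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < ITERATIONS; i++)
        memcpy(dst, src, BUF_SIZE);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (double)(t1.tv_sec - t0.tv_sec)
                + (double)(t1.tv_nsec - t0.tv_nsec) / 1e9;
    double mib  = (double)ITERATIONS * BUF_SIZE / (1024.0 * 1024.0);
    printf("memcpy: %.1f MiB/s\n", mib / secs);

    free(src);
    free(dst);
    return 0;
}
```

On a harness of this shape, 290 MiB/s to 340 MiB/s corresponds to the quoted gain: 340/290 is approximately 1.17.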
```diff
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index f5cc67b..024d885 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -37,8 +37,9 @@
         .type memcpy, %function
         .align 4

-/* a prefetch distance of 32*4 works best experimentally */
-#define PREFETCH_DISTANCE (32*4)
+/* a prefetch distance of 4 cache-lines works best experimentally */
+#define CACHE_LINE_SIZE     64
+#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

 memcpy:
         .fnstart
@@ -46,8 +47,8 @@ memcpy:
         stmfd       sp!, {r0, lr}

         /* start preloading as early as possible */
-        pld         [r1, #0]
-        pld         [r1, #32]
+        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE*1)]

         /* do we have at least 16-bytes to copy (needed for alignment below) */
         cmp         r2, #16
@@ -79,13 +80,11 @@ memcpy:

 2:
 0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(32*0)]
-        pld         [r1, #(32*1)]
-        pld         [r1, #(32*2)]
-        pld         [r1, #(32*3)]
+        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE*1)]

-        /* make sure we have at least 128 bytes to copy */
-        subs        r2, r2, #128
+        /* make sure we have at least 64 bytes to copy */
+        subs        r2, r2, #64
         blo         2f

         /* preload all the cache lines we need.
@@ -94,29 +93,21 @@ memcpy:
          * avoid the goofy code below.  In practice this doesn't seem to make
          * a big difference.
          */

-        pld         [r1, #(PREFETCH_DISTANCE + 32*0)]
-        pld         [r1, #(PREFETCH_DISTANCE + 32*1)]
-        pld         [r1, #(PREFETCH_DISTANCE + 32*2)]
-        pld         [r1, #(PREFETCH_DISTANCE + 32*3)]
+        pld         [r1, #(CACHE_LINE_SIZE*2)]
+        pld         [r1, #(CACHE_LINE_SIZE*3)]
+        pld         [r1, #(PREFETCH_DISTANCE)]

-1:      /* The main loop copies 128 bytes at a time */
+1:      /* The main loop copies 64 bytes at a time */
         vld1.8      {d0  - d3},   [r1]!
         vld1.8      {d4  - d7},   [r1]!
-        vld1.8      {d16 - d19},  [r1]!
-        vld1.8      {d20 - d23},  [r1]!
-        pld         [r1, #(PREFETCH_DISTANCE + 32*0)]
-        pld         [r1, #(PREFETCH_DISTANCE + 32*1)]
-        pld         [r1, #(PREFETCH_DISTANCE + 32*2)]
-        pld         [r1, #(PREFETCH_DISTANCE + 32*3)]
-        subs        r2, r2, #128
+        pld         [r1, #(PREFETCH_DISTANCE)]
+        subs        r2, r2, #64
         vst1.8      {d0  - d3},   [r0, :128]!
         vst1.8      {d4  - d7},   [r0, :128]!
-        vst1.8      {d16 - d19},  [r0, :128]!
-        vst1.8      {d20 - d23},  [r0, :128]!
         bhs         1b

 2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add         r2, r2, #128
+        add         r2, r2, #64
         subs        r2, r2, #32
         blo         4f
```
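For readers who don't follow ARM assembly, the shape of the new loop can be sketched in C. This is an illustrative analogue, not bionic code: `copy64` is a hypothetical name, GCC/Clang's `__builtin_prefetch` stands in for `pld`, and a plain byte loop stands in for the NEON `vld1`/`vst1` pairs.

```c
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE_SIZE   64
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE * 4)

/* Rough C analogue of the patched main loop: copy one 64-byte cache line
 * per iteration while prefetching the line PREFETCH_DISTANCE bytes ahead.
 * Assumes n is a multiple of 64; the real memcpy handles the remainder. */
void copy64(uint8_t *dst, const uint8_t *src, size_t n) {
    while (n >= 64) {
        /* Analogous to "pld [r1, #(PREFETCH_DISTANCE)]": request the
         * source line four cache lines ahead, before it is needed. */
        __builtin_prefetch(src + PREFETCH_DISTANCE, /* rw = read */ 0);
        for (int i = 0; i < 64; i++)   /* stand-in for the vld1/vst1 pairs */
            dst[i] = src[i];
        dst += 64;
        src += 64;
        n   -= 64;
    }
}
```

The design point the patch makes is that prefetch distance is best expressed in cache lines: with 64-byte lines, one `pld` per 64-byte iteration keeps exactly one new line in flight, whereas the old 128-byte loop issued four `pld`s per iteration against an assumed 32-byte line.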