diff options
author | Mathias Agopian <mathias@google.com> | 2009-10-29 19:53:39 -0700 |
---|---|---|
committer | Android Git Automerger <android-git-automerger@android.com> | 2009-10-29 19:53:39 -0700 |
commit | ecac54db21256952b4b4a9c6f17604ec31743183 (patch) | |
tree | 4b1142d9fa5d9c2e557e11d045381f3d60b4e8ba /libc/arch-arm | |
parent | 9e78de3e3c90ec3c2970431d8eae7378fdc0dac6 (diff) | |
parent | 3ba822cc3f0a252db73cf63cb8390e46fc0ceb0a (diff) | |
download | bionic-ecac54db21256952b4b4a9c6f17604ec31743183.zip bionic-ecac54db21256952b4b4a9c6f17604ec31743183.tar.gz bionic-ecac54db21256952b4b4a9c6f17604ec31743183.tar.bz2 |
am 3ba822cc: am 199f9d92: Improve memcpy performance from 290 MiB/s to 340 MiB/s (17% improvement)
Merge commit '3ba822cc3f0a252db73cf63cb8390e46fc0ceb0a' into eclair-mr2-plus-aosp
* commit '3ba822cc3f0a252db73cf63cb8390e46fc0ceb0a':
Improve memcpy performance from 290 MiB/s to 340 MiB/s (17% improvement)
Diffstat (limited to 'libc/arch-arm')
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 41 |
1 files changed, 16 insertions, 25 deletions
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S index f5cc67b..024d885 100644 --- a/libc/arch-arm/bionic/memcpy.S +++ b/libc/arch-arm/bionic/memcpy.S @@ -37,8 +37,9 @@ .type memcpy, %function .align 4 -/* a prefetch distance of 32*4 works best experimentally */ -#define PREFETCH_DISTANCE (32*4) +/* a prefetch distance of 4 cache-lines works best experimentally */ +#define CACHE_LINE_SIZE 64 +#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4) memcpy: .fnstart @@ -46,8 +47,8 @@ memcpy: stmfd sp!, {r0, lr} /* start preloading as early as possible */ - pld [r1, #0] - pld [r1, #32] + pld [r1, #(CACHE_LINE_SIZE*0)] + pld [r1, #(CACHE_LINE_SIZE*1)] /* do we have at least 16-bytes to copy (needed for alignment below) */ cmp r2, #16 @@ -79,13 +80,11 @@ memcpy: 2: 0: /* preload immediately the next cache line, which we may need */ - pld [r1, #(32*0)] - pld [r1, #(32*1)] - pld [r1, #(32*2)] - pld [r1, #(32*3)] + pld [r1, #(CACHE_LINE_SIZE*0)] + pld [r1, #(CACHE_LINE_SIZE*1)] - /* make sure we have at least 128 bytes to copy */ - subs r2, r2, #128 + /* make sure we have at least 64 bytes to copy */ + subs r2, r2, #64 blo 2f /* preload all the cache lines we need. @@ -94,29 +93,21 @@ memcpy: * avoid the goofy code below. In practice this doesn't seem to make * a big difference. */ - pld [r1, #(PREFETCH_DISTANCE + 32*0)] - pld [r1, #(PREFETCH_DISTANCE + 32*1)] - pld [r1, #(PREFETCH_DISTANCE + 32*2)] - pld [r1, #(PREFETCH_DISTANCE + 32*3)] + pld [r1, #(CACHE_LINE_SIZE*2)] + pld [r1, #(CACHE_LINE_SIZE*3)] + pld [r1, #(PREFETCH_DISTANCE)] -1: /* The main loop copies 128 bytes at a time */ +1: /* The main loop copies 64 bytes at a time */ vld1.8 {d0 - d3}, [r1]! vld1.8 {d4 - d7}, [r1]! - vld1.8 {d16 - d19}, [r1]! - vld1.8 {d20 - d23}, [r1]! 
- pld [r1, #(PREFETCH_DISTANCE + 32*0)] - pld [r1, #(PREFETCH_DISTANCE + 32*1)] - pld [r1, #(PREFETCH_DISTANCE + 32*2)] - pld [r1, #(PREFETCH_DISTANCE + 32*3)] - subs r2, r2, #128 + pld [r1, #(PREFETCH_DISTANCE)] + subs r2, r2, #64 vst1.8 {d0 - d3}, [r0, :128]! vst1.8 {d4 - d7}, [r0, :128]! - vst1.8 {d16 - d19}, [r0, :128]! - vst1.8 {d20 - d23}, [r0, :128]! bhs 1b 2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ - add r2, r2, #128 + add r2, r2, #64 subs r2, r2, #32 blo 4f |