From 9895f9429cb489ba271c06767531083ae4c4bcbe Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 21 Nov 2007 22:46:14 +0900 Subject: sh: clear/copy_page renames in lib and lib64. Signed-off-by: Paul Mundt --- arch/sh/lib64/Makefile | 2 +- arch/sh/lib64/clear_page.S | 54 ++++++++++++++++++++++++++++ arch/sh/lib64/copy_page.S | 89 ++++++++++++++++++++++++++++++++++++++++++++++ arch/sh/lib64/page_clear.S | 54 ---------------------------- arch/sh/lib64/page_copy.S | 89 ---------------------------------------------- 5 files changed, 144 insertions(+), 144 deletions(-) create mode 100644 arch/sh/lib64/clear_page.S create mode 100644 arch/sh/lib64/copy_page.S delete mode 100644 arch/sh/lib64/page_clear.S delete mode 100644 arch/sh/lib64/page_copy.S (limited to 'arch/sh/lib64') diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile index 2f4086a..9950966 100644 --- a/arch/sh/lib64/Makefile +++ b/arch/sh/lib64/Makefile @@ -11,5 +11,5 @@ # Panic should really be compiled as PIC lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o copy_user_memcpy.o \ - page_copy.o page_clear.o + copy_page.o clear_page.o diff --git a/arch/sh/lib64/clear_page.S b/arch/sh/lib64/clear_page.S new file mode 100644 index 0000000..007ab48 --- /dev/null +++ b/arch/sh/lib64/clear_page.S @@ -0,0 +1,54 @@ +/* + Copyright 2003 Richard Curnow, SuperH (UK) Ltd. + + This file is subject to the terms and conditions of the GNU General Public + License. See the file "COPYING" in the main directory of this archive + for more details. + + Tight version of memset for the case of just clearing a page. It turns out + that having the alloco's spaced out slightly due to the increment/branch + pair causes them to contend less for access to the cache. Similarly, + keeping the stores apart from the allocos causes less contention. => Do two + separate loops. Do multiple stores per loop to amortise the + increment/branch cost a little. + + Parameters: + r2 : source effective address (start of page) + + Always clears 4096 bytes. + + Note : alloco guarded by synco to avoid TAKum03020 erratum + +*/ + + .section .text..SHmedia32,"ax" + .little + + .balign 8 + .global clear_page +clear_page: + pta/l 1f, tr1 + pta/l 2f, tr2 + ptabs/l r18, tr0 + + movi 4096, r7 + add r2, r7, r7 + add r2, r63, r6 +1: + alloco r6, 0 + synco ! TAKum03020 + addi r6, 32, r6 + bgt/l r7, r6, tr1 + + add r2, r63, r6 +2: + st.q r6, 0, r63 + st.q r6, 8, r63 + st.q r6, 16, r63 + st.q r6, 24, r63 + addi r6, 32, r6 + bgt/l r7, r6, tr2 + + blink tr0, r63 + + diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S new file mode 100644 index 0000000..0ec6fca --- /dev/null +++ b/arch/sh/lib64/copy_page.S @@ -0,0 +1,89 @@ +/* + Copyright 2003 Richard Curnow, SuperH (UK) Ltd. + + This file is subject to the terms and conditions of the GNU General Public + License. See the file "COPYING" in the main directory of this archive + for more details. + + Tight version of mempy for the case of just copying a page. + Prefetch strategy empirically optimised against RTL simulations + of SH5-101 cut2 eval chip with Cayman board DDR memory. + + Parameters: + r2 : destination effective address (start of page) + r3 : source effective address (start of page) + + Always copies 4096 bytes. + + Points to review. + * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. + It seems like the prefetch needs to be at at least 4 lines ahead to get + the data into the cache in time, and the allocos contend with outstanding + prefetches for the same cache set, so it's better to have the numbers + different. + */ + + .section .text..SHmedia32,"ax" + .little + + .balign 8 + .global copy_page +copy_page: + + /* Copy 4096 bytes worth of data from r3 to r2. + Do prefetches 4 lines ahead. + Do alloco 2 lines ahead */ + + pta 1f, tr1 + pta 2f, tr2 + pta 3f, tr3 + ptabs r18, tr0 + +#if 0 + /* TAKum03020 */ + ld.q r3, 0x00, r63 + ld.q r3, 0x20, r63 + ld.q r3, 0x40, r63 + ld.q r3, 0x60, r63 +#endif + alloco r2, 0x00 + synco ! TAKum03020 + alloco r2, 0x20 + synco ! TAKum03020 + + movi 3968, r6 + add r2, r6, r6 + addi r6, 64, r7 + addi r7, 64, r8 + sub r3, r2, r60 + addi r60, 8, r61 + addi r61, 8, r62 + addi r62, 8, r23 + addi r60, 0x80, r22 + +/* Minimal code size. The extra branches inside the loop don't cost much + because they overlap with the time spent waiting for prefetches to + complete. */ +1: +#if 0 + /* TAKum03020 */ + bge/u r2, r6, tr2 ! skip prefetch for last 4 lines + ldx.q r2, r22, r63 ! prefetch 4 lines hence +#endif +2: + bge/u r2, r7, tr3 ! skip alloco for last 2 lines + alloco r2, 0x40 ! alloc destination line 2 lines ahead + synco ! TAKum03020 +3: + ldx.q r2, r60, r36 + ldx.q r2, r61, r37 + ldx.q r2, r62, r38 + ldx.q r2, r23, r39 + st.q r2, 0, r36 + st.q r2, 8, r37 + st.q r2, 16, r38 + st.q r2, 24, r39 + addi r2, 32, r2 + bgt/l r8, r2, tr1 + + blink tr0, r63 ! return diff --git a/arch/sh/lib64/page_clear.S b/arch/sh/lib64/page_clear.S deleted file mode 100644 index 007ab48..0000000 --- a/arch/sh/lib64/page_clear.S +++ /dev/null @@ -1,54 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of memset for the case of just clearing a page. It turns out - that having the alloco's spaced out slightly due to the increment/branch - pair causes them to contend less for access to the cache. Similarly, - keeping the stores apart from the allocos causes less contention. => Do two - separate loops. Do multiple stores per loop to amortise the - increment/branch cost a little. - - Parameters: - r2 : source effective address (start of page) - - Always clears 4096 bytes. - - Note : alloco guarded by synco to avoid TAKum03020 erratum - -*/ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global clear_page -clear_page: - pta/l 1f, tr1 - pta/l 2f, tr2 - ptabs/l r18, tr0 - - movi 4096, r7 - add r2, r7, r7 - add r2, r63, r6 -1: - alloco r6, 0 - synco ! TAKum03020 - addi r6, 32, r6 - bgt/l r7, r6, tr1 - - add r2, r63, r6 -2: - st.q r6, 0, r63 - st.q r6, 8, r63 - st.q r6, 16, r63 - st.q r6, 24, r63 - addi r6, 32, r6 - bgt/l r7, r6, tr2 - - blink tr0, r63 - - diff --git a/arch/sh/lib64/page_copy.S b/arch/sh/lib64/page_copy.S deleted file mode 100644 index 0ec6fca..0000000 --- a/arch/sh/lib64/page_copy.S +++ /dev/null @@ -1,89 +0,0 @@ -/* - Copyright 2003 Richard Curnow, SuperH (UK) Ltd. - - This file is subject to the terms and conditions of the GNU General Public - License. See the file "COPYING" in the main directory of this archive - for more details. - - Tight version of mempy for the case of just copying a page. - Prefetch strategy empirically optimised against RTL simulations - of SH5-101 cut2 eval chip with Cayman board DDR memory. - - Parameters: - r2 : destination effective address (start of page) - r3 : source effective address (start of page) - - Always copies 4096 bytes. - - Points to review. - * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. - It seems like the prefetch needs to be at at least 4 lines ahead to get - the data into the cache in time, and the allocos contend with outstanding - prefetches for the same cache set, so it's better to have the numbers - different. - */ - - .section .text..SHmedia32,"ax" - .little - - .balign 8 - .global copy_page -copy_page: - - /* Copy 4096 bytes worth of data from r3 to r2. - Do prefetches 4 lines ahead. - Do alloco 2 lines ahead */ - - pta 1f, tr1 - pta 2f, tr2 - pta 3f, tr3 - ptabs r18, tr0 - -#if 0 - /* TAKum03020 */ - ld.q r3, 0x00, r63 - ld.q r3, 0x20, r63 - ld.q r3, 0x40, r63 - ld.q r3, 0x60, r63 -#endif - alloco r2, 0x00 - synco ! TAKum03020 - alloco r2, 0x20 - synco ! TAKum03020 - - movi 3968, r6 - add r2, r6, r6 - addi r6, 64, r7 - addi r7, 64, r8 - sub r3, r2, r60 - addi r60, 8, r61 - addi r61, 8, r62 - addi r62, 8, r23 - addi r60, 0x80, r22 - -/* Minimal code size. The extra branches inside the loop don't cost much - because they overlap with the time spent waiting for prefetches to - complete. */ -1: -#if 0 - /* TAKum03020 */ - bge/u r2, r6, tr2 ! skip prefetch for last 4 lines - ldx.q r2, r22, r63 ! prefetch 4 lines hence -#endif -2: - bge/u r2, r7, tr3 ! skip alloco for last 2 lines - alloco r2, 0x40 ! alloc destination line 2 lines ahead - synco ! TAKum03020 -3: - ldx.q r2, r60, r36 - ldx.q r2, r61, r37 - ldx.q r2, r62, r38 - ldx.q r2, r23, r39 - st.q r2, 0, r36 - st.q r2, 8, r37 - st.q r2, 16, r38 - st.q r2, 24, r39 - addi r2, 32, r2 - bgt/l r8, r2, tr1 - - blink tr0, r63 ! return -- cgit v1.1