diff options
Diffstat (limited to 'libc/arch-mips/string')
-rw-r--r-- | libc/arch-mips/string/memcpy.S | 423 | ||||
-rw-r--r-- | libc/arch-mips/string/memset.S | 323 | ||||
-rw-r--r-- | libc/arch-mips/string/mips-string-ops.h | 148 | ||||
-rw-r--r-- | libc/arch-mips/string/mips_strlen.c | 223 |
4 files changed, 1117 insertions, 0 deletions
diff --git a/libc/arch-mips/string/memcpy.S b/libc/arch-mips/string/memcpy.S
new file mode 100644
index 0000000..aabdfcf
--- /dev/null
+++ b/libc/arch-mips/string/memcpy.S
@@ -0,0 +1,423 @@
/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memcpy.S
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include "machine/asm.h"


/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * LWHI/SWHI and LWLO/SWLO name the partial-word load/store instructions by
 * which end of the word they handle, so the code below is endian-neutral.
 * An "LWHI r,0(p)" + "LWLO r,3(p)" pair assembles one word from an address
 * that need not be word-aligned.
 */
#if defined(__MIPSEB__)
#  define LWHI	lwl		/* high part is left in big-endian */
#  define SWHI	swl		/* high part is left in big-endian */
#  define LWLO	lwr		/* low part is right in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#endif

#if defined(__MIPSEL__)
#  define LWHI	lwr		/* high part is right in little-endian */
#  define SWHI	swr		/* high part is right in little-endian */
#  define LWLO	lwl		/* low part is left in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif

/*
 * memcpy: a0 = dst, a1 = src, a2 = byte count; returns the original dst
 * in v0.
 *
 * Outline, as implemented below:
 *   - If |dst - src| < count the buffers overlap and the call is handed
 *     over to memmove through t9.
 *   - Counts below 8 go straight to the byte loop (.Llast8).
 *   - If dst and src share the same offset within a word, dst is aligned
 *     with one partial-word load/store and the copy proceeds with lw/sw in
 *     64-byte, 32-byte and 4-byte steps.
 *   - Otherwise (.Lunaligned) dst is aligned and every source word is
 *     assembled with an LWHI/LWLO pair before being stored with sw.
 *   - The remaining tail is copied one byte at a time.
 *
 * The body runs under ".set noreorder": the instruction written after each
 * branch or jump is its delay slot and is executed as part of that branch.
 * Do not reorder instructions without accounting for this.
 */
LEAF(memcpy,0)

	.set	noreorder
	.set	noat
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android like Skia
 * rely on such usage. We call memmove to handle such cases.
 */
	/* t0 = |a0 - a1| (sign mask in AT, then xor/subtract); AT = (t0 < a2) */
	subu	t0,a0,a1
	sra	AT,t0,31
	xor	t1,t0,AT
	subu	t0,t1,AT
	sltu	AT,t0,a2
	beq	AT,zero,.Lmemcpy
	/*
	 * NOTE(review): "la" is an assembler macro sitting in a branch delay
	 * slot; confirm it assembles to a single instruction in this build.
	 */
	la	t9,memmove
	jr	t9
	nop
.Lmemcpy:
	slti	AT,a2,8
	bne	AT,zero,.Llast8
	move	v0,a0	# memcpy returns the dst pointer

# Test if the src and dst are word-aligned, or can be made word-aligned
	xor	t8,a1,a0
	andi	t8,t8,0x3		# t8 is a0/a1 word-displacement

	bne	t8,zero,.Lunaligned
	negu	a3,a0

	andi	a3,a3,0x3	# we need to copy a3 bytes to make a0/a1 aligned
	beq	a3,zero,.Lchk16w	# when a3=0 then the dst (a0) is word-aligned
	subu	a2,a2,a3	# now a2 is the remaining bytes count

	LWHI	t8,0(a1)
	addu	a1,a1,a3
	SWHI	t8,0(a0)
	addu	a0,a0,a3

# Now the dst/src are mutually word-aligned with word-aligned addresses
.Lchk16w:
	andi	t8,a2,0x3f	# any whole 64-byte chunks?
				# t8 is the byte count after 64-byte chunks

	beq	a2,t8,.Lchk8w	# if a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk after it
	subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2	# t0 is the "past the end" address

# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
# the "t0-32" address
# This means: for x=128 the last "safe" a0 address is "t0-160"
# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)		# bring the first line of src, addr 0
	pref	0,32(a1)	# bring the second line of src, addr 32
	pref	0,64(a1)	# bring the third line of src, addr 64
	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all
	sgtu	v1,a0,t9
	bgtz	v1,.Lloop16w	# skip "pref 30,64(a0)" for too short arrays
	nop
# otherwise, start with using pref30
	pref	30,64(a0)
/*
 * Main aligned loop: 64 bytes per iteration as two 8-word load/store groups.
 * v1 != 0 means a0 is past t9, so the "pref 30" on the destination is skipped.
 */
.Lloop16w:
	pref	0,96(a1)
	lw	t0,0(a1)
	bgtz	v1,.Lskip_pref30_96	# skip "pref 30,96(a0)"
	lw	t1,4(a1)
	pref	30,96(a0)	# continue setting up the dest, addr 96
.Lskip_pref30_96:
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	pref	0,128(a1)	# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	lw	t0,32(a1)
	bgtz	v1,.Lskip_pref30_128	# skip "pref 30,128(a0)"
	lw	t1,36(a1)
	pref	30,128(a0)	# continue setting up the dest, addr 128
.Lskip_pref30_128:
	lw	t2,40(a1)
	lw	t3,44(a1)
	lw	t4,48(a1)
	lw	t5,52(a1)
	lw	t6,56(a1)
	lw	t7,60(a1)
	pref	0, 160(a1)	# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64	# adding 64 to dest
	sgtu	v1,a0,t9
	bne	a0,a3,.Lloop16w
	addiu	a1,a1,64	# adding 64 to src
	move	a2,t8

# Here we have src and dest word-aligned but less than 64-bytes to go

.Lchk8w:
	pref	0, 0x0(a1)
	andi	t8,a2,0x1f	# is there a 32-byte chunk?
				# the t8 is the remainder count past 32-bytes
	beq	a2,t8,.Lchk1w	# when a2=t8, no 32-byte chunk
	nop

	lw	t0,0(a1)
	lw	t1,4(a1)
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Llast8
	subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
.LwordCopy_loop:
	lw	t3,0(a1)	# the first t3 may be equal t0 ... optimize?
	addiu	a1,a1,4
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	sw	t3,-4(a0)

# For the last (<8) bytes
.Llast8:
	blez	a2,.Lleave
	addu	a3,a0,a2	# a3 is one past the last dst address
.Llast8loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Llast8loop
	sb	v1,-1(a0)	# delay slot: a0 was already advanced

.Lleave:
	j	ra
	nop

#
# UNALIGNED case
#

.Lunaligned:
	# got here with a3="negu a0"
	andi	a3,a3,0x3	# test if the a0 is word aligned
	beqz	a3,.Lua_chk16w
	subu	a2,a2,a3	# bytes left after initial a3 bytes

	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addu	a1,a1,a3	# a3 may be here 1, 2 or 3
	SWHI	v1,0(a0)
	addu	a0,a0,a3	# below the dst will be word aligned (NOTE1)

.Lua_chk16w:
	andi	t8,a2,0x3f	# any whole 64-byte chunks?
				# t8 is the byte count after 64-byte chunks
	beq	a2,t8,.Lua_chk8w	# if a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk after it
	subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2	# t0 is the "past the end" address

	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)		# bring the first line of src, addr 0
	pref	0,32(a1)	# bring the second line of src, addr 32
	pref	0,64(a1)	# bring the third line of src, addr 64
	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
# In case the a0 > t9 don't use "pref 30" at all
	sgtu	v1,a0,t9
	bgtz	v1,.Lua_loop16w	# skip "pref 30,64(a0)" for too short arrays
	nop
# otherwise, start with using pref30
	pref	30,64(a0)
/*
 * Same 64-byte loop as .Lloop16w, except that each source word is built
 * with an LWHI/LWLO pair; the stores are plain aligned sw.
 */
.Lua_loop16w:
	pref	0,96(a1)
	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	bgtz	v1,.Lua_skip_pref30_96
	LWLO	t1,7(a1)
	pref	30,96(a0)	# continue setting up the dest, addr 96
.Lua_skip_pref30_96:
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	pref	0,128(a1)	# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	LWHI	t0,32(a1)
	LWLO	t0,35(a1)
	LWHI	t1,36(a1)
	bgtz	v1,.Lua_skip_pref30_128
	LWLO	t1,39(a1)
	pref	30,128(a0)	# continue setting up the dest, addr 128
.Lua_skip_pref30_128:
	LWHI	t2,40(a1)
	LWLO	t2,43(a1)
	LWHI	t3,44(a1)
	LWLO	t3,47(a1)
	LWHI	t4,48(a1)
	LWLO	t4,51(a1)
	LWHI	t5,52(a1)
	LWLO	t5,55(a1)
	LWHI	t6,56(a1)
	LWLO	t6,59(a1)
	LWHI	t7,60(a1)
	LWLO	t7,63(a1)
	pref	0, 160(a1)	# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64	# adding 64 to dest
	sgtu	v1,a0,t9
	bne	a0,a3,.Lua_loop16w
	addiu	a1,a1,64	# adding 64 to src
	move	a2,t8

# Here dst is word-aligned (src is still read with LWHI/LWLO pairs) and
# less than 64 bytes are left to go

.Lua_chk8w:
	pref	0, 0x0(a1)
	andi	t8,a2,0x1f	# is there a 32-byte chunk?
				# the t8 is the remainder count
	beq	a2,t8,.Lua_chk1w	# when a2=t8, no 32-byte chunk
	nop

	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	LWLO	t1,7(a1)
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lua_chk1w:
	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Lua_smallCopy
	subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
.Lua_wordCopy_loop:
	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addiu	a1,a1,4
	addiu	a0,a0,4		# note: dst=a0 is word aligned here, see NOTE1
	bne	a0,a3,.Lua_wordCopy_loop
	sw	v1,-4(a0)

# Now less than 4 bytes (value in a2) left to copy
.Lua_smallCopy:
	beqz	a2,.Lleave
	addu	a3,a0,a2	# a3 is one past the last dst address
.Lua_smallCopy_loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Lua_smallCopy_loop
	sb	v1,-1(a0)	# delay slot: a0 was already advanced

	j	ra
	nop

	.set	at
	.set	reorder

END(memcpy)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/
diff --git a/libc/arch-mips/string/memset.S b/libc/arch-mips/string/memset.S
new file mode 100644
index 0000000..a1c5055
--- /dev/null
+++ b/libc/arch-mips/string/memset.S
@@ -0,0 +1,323 @@
/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC.
BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include "machine/asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * SWHI/SWLO name the partial-word store instructions by which end of the
 * word they write, so the code below is endian-neutral.
 */
#if defined(__MIPSEB__)
#  define SWHI	swl		/* high part is left in big-endian */
#  define SWLO	swr		/* low part is right in big-endian */
#endif

#if defined(__MIPSEL__)
#  define SWHI	swr		/* high part is right in little-endian */
#  define SWLO	swl		/* low part is left in little-endian */
#endif

#if !(defined(XGPROF) || defined(XPROF))
#undef SETUP_GP
#define SETUP_GP
#endif

/*
 * Lines prefixed with DBG are parameter checks.  With NDEBUG defined, DBG
 * expands to the assembler comment character, which disables them.
 */
#ifdef NDEBUG
#define DBG #
#else
#define DBG
#endif

/*
 * void _memset16(uint16_t* dst, uint16_t value, size_t size);
 *
 * a0 = dst (halfword aligned), a1 = 16-bit value, a2 = size in bytes (even).
 * The value is replicated into both halves of a1, one leading halfword is
 * stored if needed to word-align dst, and 4 or more remaining bytes are
 * handled by jumping into memset's .Laligned code with t0 = past-the-end.
 */

LEAF(_memset16,0)
	.set	noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,1		# a0 must be halfword aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,1		# a2 must be even
DBG	tne	t2,zero

#ifdef FIXARGS
	# ensure count is even
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,1
#else
	ori	a2,1
	xori	a2,1
#endif
#endif

#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1,a1,16,16
#else
	andi	a1,0xffff
	sll	t3,a1,16
	or	a1,t3
#endif

	beqz	a2,.Ldone
	andi	t1,a0,2
	beqz	t1,.Lalignok
	addu	t0,a0,a2	# t0 is the "past the end" address
	sh	a1,0(a0)	# store one halfword to get aligned
	addu	a0,2
	subu	a2,2
.Lalignok:
	slti	t1,a2,4		# .Laligned for 4 or more bytes
	beqz	t1,.Laligned
	/*
	 * NOTE(review): "sne" is an assembler macro placed in a delay slot;
	 * its result in t1 is only read on the fall-through path below.
	 */
	sne	t1,a2,2		# one more halfword?
	bnez	t1,.Ldone
	nop
	sh	a1,0(a0)
.Ldone:
	j	ra
	nop
	.set	reorder
END(_memset16)

/*
 * void _memset32(uint32_t* dst, uint32_t value, size_t size);
 *
 * a0 = dst (word aligned), a1 = 32-bit value, a2 = size in bytes (multiple
 * of 4).  Any non-zero size is handled by jumping into memset's .Laligned
 * code with t0 = past-the-end address.
 */

LEAF(_memset32,0)
	.set	noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,3		# a0 must be word aligned
DBG	tne	t0,zero
DBG	andi	t2,a2,3		# a2 must be a multiple of 4 bytes
DBG	tne	t2,zero

#ifdef FIXARGS
	# ensure count is a multiple of 4
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	$a2,$0,0,2
#else
	ori	a2,3
	xori	a2,3
#endif
#endif

	bnez	a2,.Laligned	# any work to do?
	addu	t0,a0,a2	# t0 is the "past the end" address

	j	ra
	nop
	.set	reorder
END(_memset32)

/*
 * memset: a0 = dst, a1 = fill value, a2 = byte count; returns the original
 * dst in v0.
 *
 * Counts below 4 are stored bytewise (.Llast4).  Otherwise a non-zero fill
 * byte is replicated into all four bytes of a1, dst is word-aligned with
 * one SWHI, and the bulk is stored in 64-byte, 32-byte and 4-byte steps.
 * The final 0-3 bytes are written by one SWLO ending at t0 - 1.
 *
 * The body runs under ".set noreorder": the instruction written after each
 * branch or jump is its delay slot.  .Laligned is also entered from
 * _memset16 and _memset32 above.
 */
LEAF(memset,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2	# t0 is the "past the end" address
	slti	AT,a2,4		# is a2 less than 4?
	bne	AT,zero,.Llast4	# if yes, go to last4
	move	v0,a0		# memset returns the dst pointer

	beq	a1,zero,.Lset0
	subu	v1,zero,a0

	# smear byte into 32 bit word
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1, a1, 8, 8	# Replicate fill byte into half-word.
	ins	a1, a1, 16, 16	# Replicate fill byte into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:
	andi	v1,v1,0x3	# word-unaligned address?
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	subu	a2,a2,v1
	SWHI	a1,0(a0)
	addu	a0,a0,v1

# Here we have the "word-aligned" a0 (until the "last4")
.Laligned:
	andi	t8,a2,0x3f	# any 64-byte chunks?
				# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk then
	subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

# Find out, if there are any 64-byte chunks after which will be still at least
# 96 bytes left. The value "96" is calculated as needed buffer for
# "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
# incrementing "a0" by 64.
# For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
#
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
	subu	t7,a2,96	# subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f	# t6 is past "64-byte safe chunks" remainder
	subu	t5,t7,t6	# subtract from t7 the remainder
				# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks

# Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line
#	pref	30,0(a0)
# Here we are in the region, where it is safe to use "pref 30,64(a0)"
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# continue setting up the dest, addr 64-0
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	nop

	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
	nop			# this "delayed slot" is useless ...

.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	sw	a1,-4(a0)

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f	# is there a 32-byte chunk?
				# the t7 is the remainder count past 32-bytes
	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
	move	a2,t7

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4aligned
	subu	a3,a2,t8	# a3 is the count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# storing in words (4-byte chunks)
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	sw	a1,-4(a0)

# store last 0-3 bytes
#   this will repeat the last store if the memset finishes on a word boundary
.Llast4aligned:
	j	ra
	SWLO	a1,-1(t0)

/*
 * Fewer than 4 bytes in total: byte loop from a0 up to t0.  The first beq
 * has no explicit delay-slot instruction, so the addiu at .Llast4l fills it.
 */
.Llast4:
	beq	a0,t0,.Llast4e
.Llast4l:
	addiu	a0,a0,1
	bne	a0,t0,.Llast4l
	sb	a1,-1(a0)
.Llast4e:
	j	ra
	nop

	.set	at
	.set	reorder

END(memset)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/

diff --git a/libc/arch-mips/string/mips-string-ops.h b/libc/arch-mips/string/mips-string-ops.h
new file mode 100644
index 0000000..50f7e3a
--- /dev/null
+++ b/libc/arch-mips/string/mips-string-ops.h
@@ -0,0 +1,148 @@
/*
 * Copyright (c) 2010 MIPS Technologies, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      * Redistributions of source code must retain the above copyright
 *        notice, this list of conditions and the following disclaimer.
 *      * Redistributions in binary form must reproduce the above copyright
 *        notice, this list of conditions and the following disclaimer
 *        in the documentation and/or other materials provided with
 *        the distribution.
 *      * Neither the name of MIPS Technologies Inc.
nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __MIPS_STRING_OPS_H +#define __MIPS_STRING_OPS_H + /* This definition of the byte bitfields uses the + assumption that the layout of the bitfields is + equivalent to the layout in memory. Generally, + for the MIPS ABIs, this is true. If you compile + the strcmp.c file with -DSMOKE_TEST_NEW_STRCMP, + this assumption will be tested. + + Also, regardless of char signedness, ANSI C dictates that + strcmp() treats each character as unsigned char. For + strlen and the like, signedness doesn't matter. + + Also, this code assumes that there are 8-bits per 'char'. */ + +#if __mips64 +typedef struct bits +{ + unsigned B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8; +} bits_t; +#else +typedef struct bits +{ + unsigned B0:8, B1:8, B2:8, B3:8; +} bits_t; +#endif + +#ifndef _ULW + /* for MIPS GCC, there is no unaligned builtins - so this code forces + the compiler to treat the pointer access as unaligned. 
*/ +struct ulw +{ + unsigned b; +} __attribute__ ((packed)); + +#define _ULW(__x) ((struct ulw *) ((char *)(&__x)))->b; +#endif + +/* This union assumes that small structures can be in registers. If + not, then memory accesses will be done - not optimal, but ok. */ +typedef union +{ + unsigned v; + bits_t b; +} bitfields_t; + +#ifndef detect_zero +/* __mips_dsp, __mips_dspr2, and __mips64 are predefined by + the compiler, based on command line options. */ +#if (__mips_dsp || __mips_dspr2) && !__mips64 +#define __mips_using_dsp 1 + +/* DSP 4-lane (8 unsigned bits per line) subtract and saturate + * Intrinsic operation. How this works: + * Given a 4-byte string of "ABC\0", subtract this as + * an unsigned integer from 0x01010101: + * 0x01010101 + * - 0x41424300 + * ----------- + ( 0xbfbebe01 <-- answer without saturation + * 0x00000001 <-- answer with saturation + * When this 4-lane vector is treated as an unsigned int value, + * a non-zero answer indicates the presence of a zero in the + * original 4-byte argument. */ + +typedef signed char v4i8 __attribute__ ((vector_size (4))); + +#define detect_zero(__x,__y,__01s,__80s)\ + ((unsigned) __builtin_mips_subu_s_qb((v4i8) __01s,(v4i8) __x)) + + /* sets all 4 lanes to requested byte. */ +#define set_byte_lanes(__x) ((unsigned) __builtin_mips_repl_qb(__x)) + + /* sets all 4 lanes to 0x01. */ +#define def_and_set_01(__x) unsigned __x = (unsigned) __builtin_mips_repl_qb(0x01) + + /* sets all 4 lanes to 0x80. Not needed when subu_s.qb used. */ +#define def_and_set_80(__x) /* do nothing */ + +#else + /* this version, originally published in the 80's, uses + a reverse-carry-set like determination of the zero byte. + The steps are, for __x = 0x31ff0001: + __x - _01s = 0x30fdff00 + ~__x = 0xce00fffe + ((__x - _01s) & ~__x) = 0x0000ff00 + x & _80s = 0x00008000 <- byte 3 was zero + Some implementaions naively assume that characters are + always 7-bit unsigned ASCII. With that assumption, the + "& ~x" is usually discarded. 
Since character strings + are 8-bit, the and is needed to catch the case of + a false positive when the byte is 0x80. */ + +#define detect_zero(__x,__y,_01s,_80s)\ + ((unsigned) (((__x) - _01s) & ~(__x)) & _80s) + +#if __mips64 +#define def_and_set_80(__x) unsigned __x = 0x8080808080808080ul +#define def_and_set_01(__x) unsigned __x = 0x0101010101010101ul +#else +#define def_and_set_80(__x) unsigned __x = 0x80808080ul +#define def_and_set_01(__x) unsigned __x = 0x01010101ul +#endif + +#endif +#endif + +/* dealing with 'void *' conversions without using extra variables. */ +#define get_byte(__x,__idx) (((unsigned char *) (__x))[__idx]) +#define set_byte(__x,__idx,__fill) ((unsigned char *) (__x))[__idx] = (__fill) +#define get_word(__x,__idx) (((unsigned *) (__x))[__idx]) +#define set_word(__x,__idx,__fill) ((unsigned *) (__x))[__idx] = (__fill) +#define inc_ptr_as(__type,__x,__inc) __x = (void *) (((__type) __x) + (__inc)) +#define cvt_ptr_to(__type,__x) ((__type) (__x)) + +#endif diff --git a/libc/arch-mips/string/mips_strlen.c b/libc/arch-mips/string/mips_strlen.c new file mode 100644 index 0000000..9fb7e6a --- /dev/null +++ b/libc/arch-mips/string/mips_strlen.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2010 MIPS Technologies, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with + * the distribution. + * * Neither the name of MIPS Technologies Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mips-string-ops.h" + +#define do_strlen_word(__av) {\ + if (detect_zero(x,x,_01s,_80s)) break;\ + x = __av;\ + cnt += sizeof (unsigned);\ + } + +#define do_strlen_byte(__x) {\ + if ((bx.b.B##__x) == 0) break;\ + ++cnt;\ + } + +#if SMOKE_TEST_MIPS_STRLEN +#define strlen my_strlen +#endif + +int +strlen (const void *_a) +{ + int cnt = 0; + unsigned x; + + /* align the string to word boundary so we can do word at a time. */ + if ((cvt_ptr_to (unsigned, _a) & (sizeof (unsigned) - 1)) != 0) + { + if ((cvt_ptr_to (unsigned, _a) & 1) != 0) + { + if (get_byte (_a, 0) == 0) + return cnt; + /* set bit 1 so 2-bytes are checked and incremented. */ + inc_ptr_as (char *, _a, 1); + ++cnt; + } + if ((cvt_ptr_to (unsigned, _a) & 2) != 0) + { + if (get_byte (_a, 0) == 0) + return cnt + 0; + if (get_byte (_a, 1) == 0) + return cnt + 1; + inc_ptr_as (char *, _a, 2); + cnt += 2; + } + } + +#if __mips64 +#error strlen: mips64 check for 4-byte alignment not implemented. 
+#endif + + if (1) + { + def_and_set_01 (_01s); + def_and_set_80 (_80s); + + /* as advantagous as it is to performance, this code cannot pre-load + the following word, nor can it prefetch the next line at the start + of the loop since the string can be at the end of a page with the + following page unmapped. There are tests in the suite to catch + any attempt to go beyond the current word. */ + x = get_word (_a, 0); + while (1) + { + /* doing 8 words should cover most strings. */ + do_strlen_word (get_word (_a, 1)); + do_strlen_word (get_word (_a, 2)); + do_strlen_word (get_word (_a, 3)); + do_strlen_word (get_word (_a, 4)); + do_strlen_word (get_word (_a, 5)); + do_strlen_word (get_word (_a, 6)); + do_strlen_word (get_word (_a, 7)); + do_strlen_word (get_word (_a, 8)); + inc_ptr_as (unsigned *, _a, 8); + } + } + while (1) + { + /* pull apart the last word processed and find the zero. */ + bitfields_t bx; + bx.v = x; +#if __mips64 + do_strlen_byte (0); + do_strlen_byte (1); + do_strlen_byte (2); + do_strlen_byte (3); + do_strlen_byte (4); + do_strlen_byte (5); + do_strlen_byte (6); +#else + do_strlen_byte (0); + do_strlen_byte (1); + do_strlen_byte (2); +#endif + /* last byte is zero */ + break; + } + return cnt; +} + +#undef do_strlen_byte +#undef do_strlen_word + +#if SMOKE_TEST_MIPS_STRLEN +#include <stdio.h> +char str1[] = "DHRYSTONE PROGRAM, 1'ST STRING"; +char str2[] = "DHRYSTONE PROGRAM, 2'ST STRING"; + +char str3[] = "another string"; +char str4[] = "another"; + +char str5[] = "somes tring"; +char str6[] = "somes_tring"; + +char str7[16], str8[16]; + +static char * +chk (unsigned mine, unsigned libs, int *errors) +{ + static char answer[1024]; + char *result = mine == libs ? 
"PASS" : "FAIL"; + sprintf (answer, "new_strlen=%d: lib_strlen=%d: %s!", mine, libs, result); + if (mine != libs) + (*errors)++; + return answer; +} + +int +main (int argc, char **argv) +{ + int errors = 0; + /* set -1 in one position */ + str6[5] = 0xff; + /* set zero in same position with junk in following 3 */ + str7[0] = str8[0] = 0; + str7[1] = 0xff; + str7[2] = 'a'; + str7[3] = 2; + str8[1] = 's'; + str8[2] = -2; + str8[3] = 0; + + fprintf (stderr, "========== mips_strlen%s test...\n", + argv[0] ? argv[0] : "unknown strlen"); +#define P(__x,__y) {\ + int a = my_strlen(__x + __y);\ + int b = (strlen)(__x + __y) /* library version */;\ + fprintf(stderr,"%s+%d: %s\n",#__x,__y,chk(a,b,&errors));\ + } + + P (str1, 0); + P (str1, 1); + P (str1, 2); + P (str1, 3); + + P (str2, 0); + P (str2, 1); + P (str2, 2); + P (str2, 3); + + P (str3, 0); + P (str3, 1); + P (str3, 2); + P (str3, 3); + + P (str4, 0); + P (str4, 1); + P (str4, 2); + P (str4, 3); + + P (str5, 0); + P (str5, 1); + P (str5, 2); + P (str5, 3); + + P (str6, 0); + P (str6, 1); + P (str6, 2); + P (str6, 3); + + P (str7, 0); + P (str7, 1); + P (str7, 2); + P (str7, 3); + + P (str8, 0); + P (str8, 1); + P (str8, 2); + P (str8, 3); + + return errors; +} +#endif |