diff options
| author | Elliott Hughes <enh@google.com> | 2012-05-10 10:14:43 -0700 |
|---|---|---|
| committer | android code review <noreply-gerritcodereview@google.com> | 2012-05-10 10:14:44 -0700 |
| commit | 4f05d1c758ba141c617f25251a661ecb66627e9e (patch) | |
| tree | f96095e394863c80a61ecb25ef6083a1122c3c0b /libc | |
| parent | edb7cad9b764f029c5faac2750f749d3d84bd86a (diff) | |
| parent | c47703a521abab120100673d5281f71bc8ba9a49 (diff) | |
| download | bionic-4f05d1c758ba141c617f25251a661ecb66627e9e.zip bionic-4f05d1c758ba141c617f25251a661ecb66627e9e.tar.gz bionic-4f05d1c758ba141c617f25251a661ecb66627e9e.tar.bz2 | |
Merge "bionic/x86: Optimization for memcpy"
Diffstat (limited to 'libc')
| -rw-r--r-- | libc/arch-x86/string/ssse3-memcpy5.S | 2541 |
1 file changed, 1963 insertions, 578 deletions
diff --git a/libc/arch-x86/string/ssse3-memcpy5.S b/libc/arch-x86/string/ssse3-memcpy5.S index 1bf6d22..b0612a6 100644 --- a/libc/arch-x86/string/ssse3-memcpy5.S +++ b/libc/arch-x86/string/ssse3-memcpy5.S @@ -29,23 +29,19 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef MEMCPY -# define MEMCPY ssse3_memcpy5 +# define MEMCPY ssse3_memcpy5 #endif #ifndef L # define L(label) .L##label #endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - #ifndef cfi_startproc -# define cfi_startproc .cfi_startproc +# define cfi_startproc .cfi_startproc #endif #ifndef cfi_endproc -# define cfi_endproc .cfi_endproc +# define cfi_endproc .cfi_endproc #endif #ifndef cfi_rel_offset @@ -53,33 +49,25 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef cfi_restore -# define cfi_restore(reg) .cfi_restore reg +# define cfi_restore(reg) .cfi_restore reg #endif #ifndef cfi_adjust_cfa_offset # define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off #endif -#ifndef cfi_remember_state -# define cfi_remember_state .cfi_remember_state -#endif - -#ifndef cfi_restore_state -# define cfi_restore_state .cfi_restore_state -#endif - #ifndef ENTRY -# define ENTRY(name) \ - .type name, @function; \ - .globl name; \ - .p2align 4; \ -name: \ +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ cfi_startproc #endif #ifndef END -# define END(name) \ - cfi_endproc; \ +# define END(name) \ + cfi_endproc; \ .size name, .-name #endif @@ -93,12 +81,12 @@ name: \ # define LEN SRC+4 #endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) #define PUSH(REG) pushl REG; CFI_PUSH (REG) @@ -110,38 +98,26 @@ name: \ # define RETURN_END POP (%ebx); ret # define RETURN RETURN_END; CFI_PUSH (%ebx) # define 
JMPTBL(I, B) I - B +# undef __i686 + +# define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ - addl $(TABLE - .), %ebx - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx #else + # define PARMS 4 # define ENTRANCE # define RETURN_END ret @@ -149,15 +125,11 @@ __i686.get_pc_thunk.bx: # define JMPTBL(I, B) I /* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) + absolute offsets. 
INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) #endif .section .text.ssse3,"ax",@progbits @@ -174,6 +146,8 @@ ENTRY (MEMCPY) cmp $32, %ecx jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) + + .p2align 4 L(memmove_bwd): add %ecx, %eax cmp %eax, %edx @@ -194,21 +168,25 @@ L(fwd_write_less32bytes): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) #ifndef USE_AS_MEMMOVE + .p2align 4 L(bk_write): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) #endif - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. */ + .p2align 4 L(48bytesormore): +#ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +#else movdqu (%eax), %xmm0 +#endif PUSH (%edi) movl %edx, %edi and $-16, %edx - PUSH (%esi) - cfi_remember_state add $16, %edx - movl %edi, %esi sub %edx, %edi add %edi, %ecx sub %edi, %eax @@ -217,7 +195,7 @@ L(48bytesormore): cmp $SHARED_CACHE_SIZE_HALF, %ecx #else # if (defined SHARED || defined __PIC__) - call __i686.get_pc_thunk.bx + SETUP_PIC_REG(bx) add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx # else @@ -229,19 +207,20 @@ L(48bytesormore): jae L(large_page) and $0xf, %edi jz L(shl_0) - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_0): - movdqu %xmm0, (%esi) +#ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +#endif xor %edi, %edi - POP (%esi) cmp $127, %ecx ja L(shl_0_gobble) lea -32(%ecx), %ecx + + .p2align 4 L(shl_0_loop): movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -273,32 +252,35 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi + 
L(shl_0_end): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx add %edi, %eax POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) CFI_PUSH (%edi) -L(shl_0_gobble): + .p2align 4 +L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %ecx #else # if (defined SHARED || defined __PIC__) - call __i686.get_pc_thunk.bx + SETUP_PIC_REG(bx) add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx # else cmp __x86_data_cache_size_half, %ecx # endif #endif - - POP (%edi) + POP (%edi) lea -128(%ecx), %ecx jae L(shl_0_gobble_mem_loop) + + .p2align 4 L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 movdqa 0x10(%eax), %xmm1 @@ -328,17 +310,15 @@ L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 sub $0x40, %ecx movdqa 0x10(%eax), %xmm1 - movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) - movdqa 0x20(%eax), %xmm0 movdqa 0x30(%eax), %xmm1 add $0x40, %eax - movdqa %xmm0, 0x20(%edx) movdqa %xmm1, 0x30(%edx) add $0x40, %edx + L(shl_0_cache_less_64bytes): cmp $0x20, %ecx jb L(shl_0_cache_less_32bytes) @@ -349,6 +329,7 @@ L(shl_0_cache_less_64bytes): movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) add $0x20, %edx + L(shl_0_cache_less_32bytes): cmp $0x10, %ecx jb L(shl_0_cache_less_16bytes) @@ -357,13 +338,13 @@ L(shl_0_cache_less_32bytes): add $0x10, %eax movdqa %xmm0, (%edx) add $0x10, %edx + L(shl_0_cache_less_16bytes): add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_loop): prefetcht0 0x1c0(%eax) prefetcht0 0x280(%eax) @@ -408,6 +389,7 @@ L(shl_0_gobble_mem_loop): movdqa %xmm0, 0x20(%edx) movdqa %xmm1, 0x30(%edx) add $0x40, %edx + L(shl_0_mem_less_64bytes): cmp $0x20, %ecx jb L(shl_0_mem_less_32bytes) @@ -418,6 +400,7 @@ L(shl_0_mem_less_64bytes): movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) add $0x20, %edx + L(shl_0_mem_less_32bytes): cmp $0x10, %ecx jb L(shl_0_mem_less_16bytes) @@ 
-426,24 +409,84 @@ L(shl_0_mem_less_32bytes): add $0x10, %eax movdqa %xmm0, (%edx) add $0x10, %edx + L(shl_0_mem_less_16bytes): add %ecx, %edx add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_1): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_1_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx lea -1(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_1_loop): + .p2align 4 +L(sh_1_no_prefetch_loop): movdqa 16(%eax, %edi), 
%xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -453,8 +496,7 @@ L(shl_1_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_1_end) + jb L(sh_1_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -465,30 +507,90 @@ L(shl_1_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) - jae L(shl_1_loop) - -L(shl_1_end): +L(sh_1_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_2): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_2_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps 
%xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx lea -2(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_2_loop): + .p2align 4 +L(sh_2_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -498,8 +600,7 @@ L(shl_2_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_2_end) + jb L(sh_2_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -510,30 +611,90 @@ L(shl_2_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) - jae L(shl_2_loop) - -L(shl_2_end): +L(sh_2_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_3): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -3(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, 
%xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + lea -32(%ecx), %ecx lea -3(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_3_loop): + .p2align 4 +L(sh_3_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -544,7 +705,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_3_end) + jb L(sh_3_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -556,29 +717,90 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_3_loop) + jae L(sh_3_no_prefetch_loop) -L(shl_3_end): +L(sh_3_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_4): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp 
__x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_4_no_prefetch): + lea -32(%ecx), %ecx lea -4(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_4_loop): + .p2align 4 +L(sh_4_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -589,7 +811,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_4_end) + jb L(sh_4_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -601,29 +823,90 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_4_loop) + jae L(sh_4_no_prefetch_loop) -L(shl_4_end): +L(sh_4_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN 
(4) + .p2align 4 L(shl_5): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_5_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx lea -5(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_5_loop): + .p2align 4 +L(sh_5_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -634,7 +917,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_5_end) + jb L(sh_5_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -646,29 +929,90 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, 
%edi) - jae L(shl_5_loop) + jae L(sh_5_no_prefetch_loop) -L(shl_5_end): +L(sh_5_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 5(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_6): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_6_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx lea -6(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_6_loop): + 
.p2align 4 +L(sh_6_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -679,7 +1023,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_6_end) + jb L(sh_6_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -691,29 +1035,90 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_6_loop) + jae L(sh_6_no_prefetch_loop) -L(shl_6_end): +L(sh_6_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 6(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_7): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_7_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, 
%xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx lea -7(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_7_loop): + .p2align 4 +L(sh_7_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -723,8 +1128,7 @@ L(shl_7_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_7_end) + jb L(sh_7_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -735,30 +1139,90 @@ L(shl_7_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) - jae L(shl_7_loop) - -L(shl_7_end): +L(sh_7_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 7(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_8): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_8_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + 
palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_8_no_prefetch): + lea -32(%ecx), %ecx lea -8(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_8_loop): + .p2align 4 +L(sh_8_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -768,8 +1232,7 @@ L(shl_8_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_8_end) + jb L(sh_8_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -780,30 +1243,91 @@ L(shl_8_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) - jae L(shl_8_loop) - -L(shl_8_end): +L(sh_8_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 8(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_9): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add 
$_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_9_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $9, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx lea -9(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_9_loop): + .p2align 4 +L(sh_9_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -813,8 +1337,7 @@ L(shl_9_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_9_end) + jb L(sh_9_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -825,30 +1348,91 @@ L(shl_9_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) - jae L(shl_9_loop) - -L(shl_9_end): +L(sh_9_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 9(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), 
%ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_10): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_10_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx lea -10(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_10_loop): + .p2align 4 +L(sh_10_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -858,8 +1442,7 @@ L(shl_10_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_10_end) + jb L(sh_10_end_no_prefetch_loop) movdqa 
16(%eax, %edi), %xmm2 sub $32, %ecx @@ -870,30 +1453,91 @@ L(shl_10_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) - jae L(shl_10_loop) - -L(shl_10_end): +L(sh_10_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 10(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_11): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_11_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 + movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 
+L(sh_11_no_prefetch): + lea -32(%ecx), %ecx lea -11(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_11_loop): + .p2align 4 +L(sh_11_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -903,8 +1547,7 @@ L(shl_11_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_11_end) + jb L(sh_11_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -915,30 +1558,91 @@ L(shl_11_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) - jae L(shl_11_loop) - -L(shl_11_end): +L(sh_11_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 11(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_12): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_12_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, 
%xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx lea -12(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_12_loop): + .p2align 4 +L(sh_12_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -948,8 +1652,7 @@ L(shl_12_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_12_end) + jb L(sh_12_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -960,30 +1663,91 @@ L(shl_12_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) - jae L(shl_12_loop) - -L(shl_12_end): +L(sh_12_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 12(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_13): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -13(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb 
L(sh_13_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx lea -13(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_13_loop): + .p2align 4 +L(sh_13_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -993,8 +1757,7 @@ L(shl_13_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_13_end) + jb L(sh_13_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1005,30 +1768,91 @@ L(shl_13_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) - jae L(shl_13_loop) - -L(shl_13_end): +L(sh_13_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 13(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_14): - 
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_14_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx lea -14(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_14_loop): + .p2align 4 +L(sh_14_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -1038,8 +1862,7 @@ L(shl_14_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_14_end) + jb L(sh_14_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1050,30 +1873,91 @@ L(shl_14_loop): lea 32(%edi), %edi movdqa %xmm2, 
-32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) - jae L(shl_14_loop) - -L(shl_14_end): +L(sh_14_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 14(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_15): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +#ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +#else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +#endif +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# if (defined SHARED || defined __PIC__) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + jb L(sh_15_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx lea -15(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), 
%ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_15_loop): + .p2align 4 +L(sh_15_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -1083,8 +1967,7 @@ L(shl_15_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_15_end) + jb L(sh_15_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1095,52 +1978,70 @@ L(shl_15_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) - jae L(shl_15_loop) - -L(shl_15_end): +L(sh_15_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 15(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + CFI_PUSH (%edi) - ALIGN (4) + .p2align 4 +L(shl_end_0): + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 L(fwd_write_44bytes): - movl -44(%eax), %ecx - movl %ecx, -44(%edx) -L(fwd_write_40bytes): - movl -40(%eax), %ecx - movl %ecx, -40(%edx) + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) L(fwd_write_36bytes): - movl -36(%eax), %ecx - movl %ecx, -36(%edx) -L(fwd_write_32bytes): - movl -32(%eax), %ecx - movl %ecx, -32(%edx) + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) L(fwd_write_28bytes): - movl -28(%eax), %ecx - movl %ecx, -28(%edx) -L(fwd_write_24bytes): - movl -24(%eax), %ecx - movl %ecx, -24(%edx) + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) L(fwd_write_20bytes): - movl -20(%eax), %ecx - movl %ecx, -20(%edx) -L(fwd_write_16bytes): - movl -16(%eax), %ecx - movl %ecx, -16(%edx) + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) L(fwd_write_12bytes): - movl -12(%eax), %ecx - movl %ecx, -12(%edx) -L(fwd_write_8bytes): - movl -8(%eax), %ecx - movl %ecx, -8(%edx) + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) L(fwd_write_4bytes): movl -4(%eax), 
%ecx movl %ecx, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_40bytes): + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) L(fwd_write_0bytes): #ifndef USE_AS_BCOPY # ifdef USE_AS_MEMPCPY @@ -1151,7 +2052,7 @@ L(fwd_write_0bytes): #endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_5bytes): movl -5(%eax), %ecx movl -4(%eax), %eax @@ -1166,39 +2067,51 @@ L(fwd_write_5bytes): #endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_45bytes): - movl -45(%eax), %ecx - movl %ecx, -45(%edx) -L(fwd_write_41bytes): - movl -41(%eax), %ecx - movl %ecx, -41(%edx) + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) L(fwd_write_37bytes): - movl -37(%eax), %ecx - movl %ecx, -37(%edx) -L(fwd_write_33bytes): - movl -33(%eax), %ecx - movl %ecx, -33(%edx) + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) L(fwd_write_29bytes): - movl -29(%eax), %ecx - movl %ecx, -29(%edx) -L(fwd_write_25bytes): - movl -25(%eax), %ecx - movl %ecx, -25(%edx) + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) L(fwd_write_21bytes): - movl -21(%eax), %ecx - movl %ecx, -21(%edx) -L(fwd_write_17bytes): - movl -17(%eax), %ecx - movl %ecx, -17(%edx) + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) L(fwd_write_13bytes): - movl -13(%eax), %ecx - movl %ecx, -13(%edx) -L(fwd_write_9bytes): - movl -9(%eax), %ecx - movl %ecx, -9(%edx) + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) movl -5(%eax), %ecx movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_41bytes): + movq -41(%eax), %xmm0 + 
movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) L(fwd_write_1bytes): movzbl -1(%eax), %ecx movb %cl, -1(%edx) @@ -1211,40 +2124,52 @@ L(fwd_write_1bytes): #endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_46bytes): - movl -46(%eax), %ecx - movl %ecx, -46(%edx) -L(fwd_write_42bytes): - movl -42(%eax), %ecx - movl %ecx, -42(%edx) + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) L(fwd_write_38bytes): - movl -38(%eax), %ecx - movl %ecx, -38(%edx) -L(fwd_write_34bytes): - movl -34(%eax), %ecx - movl %ecx, -34(%edx) + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) L(fwd_write_30bytes): - movl -30(%eax), %ecx - movl %ecx, -30(%edx) -L(fwd_write_26bytes): - movl -26(%eax), %ecx - movl %ecx, -26(%edx) + movq -30(%eax), %xmm0 + movq %xmm0, -30(%edx) L(fwd_write_22bytes): - movl -22(%eax), %ecx - movl %ecx, -22(%edx) -L(fwd_write_18bytes): - movl -18(%eax), %ecx - movl %ecx, -18(%edx) + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) L(fwd_write_14bytes): - movl -14(%eax), %ecx - movl %ecx, -14(%edx) -L(fwd_write_10bytes): - movl -10(%eax), %ecx - movl %ecx, -10(%edx) + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) L(fwd_write_6bytes): movl -6(%eax), %ecx movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) L(fwd_write_2bytes): movzwl 
-2(%eax), %ecx movw %cx, -2(%edx) @@ -1257,40 +2182,54 @@ L(fwd_write_2bytes): #endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_47bytes): - movl -47(%eax), %ecx - movl %ecx, -47(%edx) -L(fwd_write_43bytes): - movl -43(%eax), %ecx - movl %ecx, -43(%edx) + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) L(fwd_write_39bytes): - movl -39(%eax), %ecx - movl %ecx, -39(%edx) -L(fwd_write_35bytes): - movl -35(%eax), %ecx - movl %ecx, -35(%edx) + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) L(fwd_write_31bytes): - movl -31(%eax), %ecx - movl %ecx, -31(%edx) -L(fwd_write_27bytes): - movl -27(%eax), %ecx - movl %ecx, -27(%edx) + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) L(fwd_write_23bytes): - movl -23(%eax), %ecx - movl %ecx, -23(%edx) -L(fwd_write_19bytes): - movl -19(%eax), %ecx - movl %ecx, -19(%edx) + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) L(fwd_write_15bytes): - movl -15(%eax), %ecx - movl %ecx, -15(%edx) -L(fwd_write_11bytes): - movl -11(%eax), %ecx - movl %ecx, -11(%edx) + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) L(fwd_write_7bytes): movl -7(%eax), %ecx movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) L(fwd_write_3bytes): movzwl -3(%eax), %ecx movzbl -1(%eax), %eax @@ -1303,20 +2242,374 @@ L(fwd_write_3bytes): movl DEST(%esp), %eax # endif #endif + RETURN + + .p2align 4 +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + 
movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_32bytes_align): + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_5bytes_align): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_45bytes_align): + movdqa -45(%eax), %xmm0 + movdqa %xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_37bytes_align): + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_41bytes_align): + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY 
+# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) 
+#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + movdqa %xmm0, -39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 
+L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) +L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + .p2align 4 +L(fwd_write_36bytes_align): + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif RETURN_END - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(large_page): movdqu (%eax), %xmm1 +#ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +#endif lea 16(%eax), %eax - movdqu %xmm0, (%esi) movntdq %xmm1, (%edx) lea 16(%edx), %edx - POP (%esi) lea -0x90(%ecx), %ecx POP (%edi) + + .p2align 4 L(large_page_loop): movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 @@ -1371,38 +2664,22 @@ L(large_page_less_32bytes): sfence BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) + .p2align 4 L(bk_write_44bytes): - movl 40(%eax), %ecx - movl %ecx, 40(%edx) -L(bk_write_40bytes): - movl 36(%eax), %ecx - movl %ecx, 36(%edx) + movq 36(%eax), %xmm0 + movq %xmm0, 36(%edx) L(bk_write_36bytes): - movl 32(%eax), %ecx - movl %ecx, 32(%edx) -L(bk_write_32bytes): - movl 28(%eax), %ecx - movl %ecx, 28(%edx) + movq 28(%eax), %xmm0 + movq %xmm0, 28(%edx) L(bk_write_28bytes): - movl 24(%eax), %ecx - movl %ecx, 24(%edx) -L(bk_write_24bytes): - movl 20(%eax), %ecx - movl %ecx, 20(%edx) + movq 20(%eax), %xmm0 + movq %xmm0, 20(%edx) L(bk_write_20bytes): - movl 16(%eax), %ecx - movl %ecx, 16(%edx) -L(bk_write_16bytes): - movl 12(%eax), %ecx - movl %ecx, 12(%edx) + movq 
12(%eax), %xmm0 + movq %xmm0, 12(%edx) L(bk_write_12bytes): - movl 8(%eax), %ecx - movl %ecx, 8(%edx) -L(bk_write_8bytes): - movl 4(%eax), %ecx - movl %ecx, 4(%edx) + movq 4(%eax), %xmm0 + movq %xmm0, 4(%edx) L(bk_write_4bytes): movl (%eax), %ecx movl %ecx, (%edx) @@ -1416,37 +2693,47 @@ L(bk_write_0bytes): #endif RETURN - ALIGN (4) + .p2align 4 +L(bk_write_40bytes): + movq 32(%eax), %xmm0 + movq %xmm0, 32(%edx) +L(bk_write_32bytes): + movq 24(%eax), %xmm0 + movq %xmm0, 24(%edx) +L(bk_write_24bytes): + movq 16(%eax), %xmm0 + movq %xmm0, 16(%edx) +L(bk_write_16bytes): + movq 8(%eax), %xmm0 + movq %xmm0, 8(%edx) +L(bk_write_8bytes): + movq (%eax), %xmm0 + movq %xmm0, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + .p2align 4 L(bk_write_45bytes): - movl 41(%eax), %ecx - movl %ecx, 41(%edx) -L(bk_write_41bytes): - movl 37(%eax), %ecx - movl %ecx, 37(%edx) + movq 37(%eax), %xmm0 + movq %xmm0, 37(%edx) L(bk_write_37bytes): - movl 33(%eax), %ecx - movl %ecx, 33(%edx) -L(bk_write_33bytes): - movl 29(%eax), %ecx - movl %ecx, 29(%edx) + movq 29(%eax), %xmm0 + movq %xmm0, 29(%edx) L(bk_write_29bytes): - movl 25(%eax), %ecx - movl %ecx, 25(%edx) -L(bk_write_25bytes): - movl 21(%eax), %ecx - movl %ecx, 21(%edx) + movq 21(%eax), %xmm0 + movq %xmm0, 21(%edx) L(bk_write_21bytes): - movl 17(%eax), %ecx - movl %ecx, 17(%edx) -L(bk_write_17bytes): - movl 13(%eax), %ecx - movl %ecx, 13(%edx) + movq 13(%eax), %xmm0 + movq %xmm0, 13(%edx) L(bk_write_13bytes): - movl 9(%eax), %ecx - movl %ecx, 9(%edx) -L(bk_write_9bytes): - movl 5(%eax), %ecx - movl %ecx, 5(%edx) + movq 5(%eax), %xmm0 + movq %xmm0, 5(%edx) L(bk_write_5bytes): movl 1(%eax), %ecx movl %ecx, 1(%edx) @@ -1462,40 +2749,79 @@ L(bk_write_1bytes): #endif RETURN - ALIGN (4) + .p2align 4 +L(bk_write_41bytes): + movq 33(%eax), %xmm0 + movq %xmm0, 33(%edx) +L(bk_write_33bytes): + movq 25(%eax), %xmm0 + movq %xmm0, 25(%edx) 
+L(bk_write_25bytes): + movq 17(%eax), %xmm0 + movq %xmm0, 17(%edx) +L(bk_write_17bytes): + movq 9(%eax), %xmm0 + movq %xmm0, 9(%edx) +L(bk_write_9bytes): + movq 1(%eax), %xmm0 + movq %xmm0, 1(%edx) + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + .p2align 4 L(bk_write_46bytes): - movl 42(%eax), %ecx - movl %ecx, 42(%edx) -L(bk_write_42bytes): - movl 38(%eax), %ecx - movl %ecx, 38(%edx) + movq 38(%eax), %xmm0 + movq %xmm0, 38(%edx) L(bk_write_38bytes): - movl 34(%eax), %ecx - movl %ecx, 34(%edx) -L(bk_write_34bytes): - movl 30(%eax), %ecx - movl %ecx, 30(%edx) + movq 30(%eax), %xmm0 + movq %xmm0, 30(%edx) L(bk_write_30bytes): - movl 26(%eax), %ecx - movl %ecx, 26(%edx) -L(bk_write_26bytes): - movl 22(%eax), %ecx - movl %ecx, 22(%edx) + movq 22(%eax), %xmm0 + movq %xmm0, 22(%edx) L(bk_write_22bytes): - movl 18(%eax), %ecx - movl %ecx, 18(%edx) -L(bk_write_18bytes): - movl 14(%eax), %ecx - movl %ecx, 14(%edx) + movq 14(%eax), %xmm0 + movq %xmm0, 14(%edx) L(bk_write_14bytes): - movl 10(%eax), %ecx - movl %ecx, 10(%edx) -L(bk_write_10bytes): - movl 6(%eax), %ecx - movl %ecx, 6(%edx) + movq 6(%eax), %xmm0 + movq %xmm0, 6(%edx) L(bk_write_6bytes): movl 2(%eax), %ecx movl %ecx, 2(%edx) + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + .p2align 4 +L(bk_write_42bytes): + movq 34(%eax), %xmm0 + movq %xmm0, 34(%edx) +L(bk_write_34bytes): + movq 26(%eax), %xmm0 + movq %xmm0, 26(%edx) +L(bk_write_26bytes): + movq 18(%eax), %xmm0 + movq %xmm0, 18(%edx) +L(bk_write_18bytes): + movq 10(%eax), %xmm0 + movq %xmm0, 10(%edx) +L(bk_write_10bytes): + movq 2(%eax), %xmm0 + movq %xmm0, 2(%edx) L(bk_write_2bytes): movzwl (%eax), %ecx movw %cx, (%edx) @@ -1508,40 +2834,54 @@ L(bk_write_2bytes): #endif RETURN - ALIGN (4) + .p2align 4 
L(bk_write_47bytes): - movl 43(%eax), %ecx - movl %ecx, 43(%edx) -L(bk_write_43bytes): - movl 39(%eax), %ecx - movl %ecx, 39(%edx) + movq 39(%eax), %xmm0 + movq %xmm0, 39(%edx) L(bk_write_39bytes): - movl 35(%eax), %ecx - movl %ecx, 35(%edx) -L(bk_write_35bytes): - movl 31(%eax), %ecx - movl %ecx, 31(%edx) + movq 31(%eax), %xmm0 + movq %xmm0, 31(%edx) L(bk_write_31bytes): - movl 27(%eax), %ecx - movl %ecx, 27(%edx) -L(bk_write_27bytes): - movl 23(%eax), %ecx - movl %ecx, 23(%edx) + movq 23(%eax), %xmm0 + movq %xmm0, 23(%edx) L(bk_write_23bytes): - movl 19(%eax), %ecx - movl %ecx, 19(%edx) -L(bk_write_19bytes): - movl 15(%eax), %ecx - movl %ecx, 15(%edx) + movq 15(%eax), %xmm0 + movq %xmm0, 15(%edx) L(bk_write_15bytes): - movl 11(%eax), %ecx - movl %ecx, 11(%edx) -L(bk_write_11bytes): - movl 7(%eax), %ecx - movl %ecx, 7(%edx) + movq 7(%eax), %xmm0 + movq %xmm0, 7(%edx) L(bk_write_7bytes): movl 3(%eax), %ecx movl %ecx, 3(%edx) + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + .p2align 4 +L(bk_write_43bytes): + movq 35(%eax), %xmm0 + movq %xmm0, 35(%edx) +L(bk_write_35bytes): + movq 27(%eax), %xmm0 + movq %xmm0, 27(%edx) +L(bk_write_27bytes): + movq 19(%eax), %xmm0 + movq %xmm0, 19(%edx) +L(bk_write_19bytes): + movq 11(%eax), %xmm0 + movq %xmm0, 11(%edx) +L(bk_write_11bytes): + movq 3(%eax), %xmm0 + movq %xmm0, 3(%edx) L(bk_write_3bytes): movzwl 1(%eax), %ecx movw %cx, 1(%edx) @@ -1558,7 +2898,7 @@ L(bk_write_3bytes): .pushsection .rodata.ssse3,"a",@progbits - ALIGN (2) + .p2align 2 L(table_48bytes_fwd): .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) @@ -1609,7 +2949,58 @@ L(table_48bytes_fwd): .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) - ALIGN (2) + .p2align 2 
+L(table_48bytes_fwd_align): + .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_27bytes_align), 
L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) + + .p2align 2 L(shl_table): .int JMPTBL (L(shl_0), L(shl_table)) .int JMPTBL (L(shl_1), L(shl_table)) @@ -1628,7 +3019,7 @@ L(shl_table): .int JMPTBL (L(shl_14), L(shl_table)) .int JMPTBL (L(shl_15), L(shl_table)) - ALIGN (2) + .p2align 2 L(table_48_bytes_bwd): .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) @@ -1682,12 +3073,12 @@ L(table_48_bytes_bwd): .popsection #ifdef USE_AS_MEMMOVE - ALIGN (4) + .p2align 4 L(copy_backward): - PUSH (%esi) - movl %eax, %esi + PUSH (%edi) + movl 
%eax, %edi lea (%ecx,%edx,1),%edx - lea (%ecx,%esi,1),%esi + lea (%ecx,%edi,1),%edi testl $0x3, %edx jnz L(bk_align) @@ -1702,60 +3093,53 @@ L(bk_write_64bytesless): L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ sub $32, %ecx - movl -4(%esi), %eax - movl %eax, -4(%edx) - movl -8(%esi), %eax - movl %eax, -8(%edx) - movl -12(%esi), %eax - movl %eax, -12(%edx) - movl -16(%esi), %eax - movl %eax, -16(%edx) - movl -20(%esi), %eax - movl %eax, -20(%edx) - movl -24(%esi), %eax - movl %eax, -24(%edx) - movl -28(%esi), %eax - movl %eax, -28(%edx) - movl -32(%esi), %eax - movl %eax, -32(%edx) + movq -8(%edi), %xmm0 + movq %xmm0, -8(%edx) + movq -16(%edi), %xmm0 + movq %xmm0, -16(%edx) + movq -24(%edi), %xmm0 + movq %xmm0, -24(%edx) + movq -32(%edi), %xmm0 + movq %xmm0, -32(%edx) sub $32, %edx - sub $32, %esi + sub $32, %edi L(bk_write_less32bytes): - movl %esi, %eax + movl %edi, %eax sub %ecx, %edx sub %ecx, %eax - POP (%esi) + POP (%edi) L(bk_write_less32bytes_2): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) - CFI_PUSH (%esi) - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(bk_align): cmp $8, %ecx jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, - then (EDX & 2) must be != 0. */ + then (EDX & 2) must be != 0. */ jz L(bk_got2) - sub $1, %esi + sub $1, %edi sub $1, %ecx sub $1, %edx - movzbl (%esi), %eax + movzbl (%edi), %eax movb %al, (%edx) testl $2, %edx jz L(bk_aligned_4) L(bk_got2): - sub $2, %esi + sub $2, %edi sub $2, %ecx sub $2, %edx - movzwl (%esi), %eax + movzwl (%edi), %eax movw %ax, (%edx) jmp L(bk_aligned_4) - ALIGN (4) + .p2align 4 L(bk_write_more64bytes): /* Check alignment of last byte. */ testl $15, %edx @@ -1763,45 +3147,46 @@ L(bk_write_more64bytes): /* EDX is aligned 4 bytes, but not 16 bytes. 
*/ L(bk_ssse3_align): - sub $4, %esi + sub $4, %edi sub $4, %ecx sub $4, %edx - movl (%esi), %eax + movl (%edi), %eax movl %eax, (%edx) testl $15, %edx jz L(bk_ssse3_cpy_pre) - sub $4, %esi + sub $4, %edi sub $4, %ecx sub $4, %edx - movl (%esi), %eax + movl (%edi), %eax movl %eax, (%edx) testl $15, %edx jz L(bk_ssse3_cpy_pre) - sub $4, %esi + sub $4, %edi sub $4, %ecx sub $4, %edx - movl (%esi), %eax + movl (%edi), %eax movl %eax, (%edx) L(bk_ssse3_cpy_pre): cmp $64, %ecx jb L(bk_write_more32bytes) + .p2align 4 L(bk_ssse3_cpy): - sub $64, %esi + sub $64, %edi sub $64, %ecx sub $64, %edx - movdqu 0x30(%esi), %xmm3 + movdqu 0x30(%edi), %xmm3 movdqa %xmm3, 0x30(%edx) - movdqu 0x20(%esi), %xmm2 + movdqu 0x20(%edi), %xmm2 movdqa %xmm2, 0x20(%edx) - movdqu 0x10(%esi), %xmm1 + movdqu 0x10(%edi), %xmm1 movdqa %xmm1, 0x10(%edx) - movdqu (%esi), %xmm0 + movdqu (%edi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx jae L(bk_ssse3_cpy) |
