diff options
author | Alexander Ivchenko <alexander.ivchenko@intel.com> | 2014-07-14 18:54:34 +0400 |
---|---|---|
committer | Alexander Ivchenko <alexander.ivchenko@intel.com> | 2014-07-17 17:29:26 +0400 |
commit | 907194a395ffa5bc27cad81adbed241f9fd772f0 (patch) | |
tree | e2b284ebaa9de33dbc15adf4d5c19dbfa3bd6753 /runtime | |
parent | b2a59010b787bd9d5d9bf36d32682faa5ad8da24 (diff) | |
download | art-907194a395ffa5bc27cad81adbed241f9fd772f0.zip art-907194a395ffa5bc27cad81adbed241f9fd772f0.tar.gz art-907194a395ffa5bc27cad81adbed241f9fd772f0.tar.bz2 |
Add optimized assembler implementation of __memcmp16 for x86.
Change-Id: Ia442dd749f8113244ea580f97d574ebaa73e4989
Signed-off-by: Alexander Ivchenko <alexander.ivchenko@intel.com>
Diffstat (limited to 'runtime')
-rw-r--r-- | runtime/Android.mk | 1 | ||||
-rw-r--r-- | runtime/arch/memcmp16.h | 2 | ||||
-rw-r--r-- | runtime/arch/x86/asm_support_x86.S | 4 | ||||
-rw-r--r-- | runtime/arch/x86/memcmp16_x86.S | 1038 |
4 files changed, 1044 insertions, 1 deletion
diff --git a/runtime/Android.mk b/runtime/Android.mk index d2fc229..d82e842 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -238,6 +238,7 @@ LIBART_SRC_FILES_x86 := \ arch/x86/context_x86.cc \ arch/x86/entrypoints_init_x86.cc \ arch/x86/jni_entrypoints_x86.S \ + arch/x86/memcmp16_x86.S \ arch/x86/portable_entrypoints_x86.S \ arch/x86/quick_entrypoints_x86.S \ arch/x86/thread_x86.cc \ diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h index 1144c8c..65d2f92 100644 --- a/runtime/arch/memcmp16.h +++ b/runtime/arch/memcmp16.h @@ -30,7 +30,7 @@ // // In both cases, MemCmp16 is declared. -#if defined(__aarch64__) || defined(__arm__) || defined(__mips) +#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count); #define MemCmp16 __memcmp16 diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S index ae39be1..e468c2a 100644 --- a/runtime/arch/x86/asm_support_x86.S +++ b/runtime/arch/x86/asm_support_x86.S @@ -81,6 +81,8 @@ #define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg #define CFI_RESTORE(reg) .cfi_restore reg #define CFI_REL_OFFSET(reg,size) .cfi_rel_offset reg,size + #define CFI_RESTORE_STATE .cfi_restore_state + #define CFI_REMEMBER_STATE .cfi_remember_state #else // Mac OS' doesn't like cfi_* directives. #define CFI_STARTPROC @@ -90,6 +92,8 @@ #define CFI_DEF_CFA_REGISTER(reg) #define CFI_RESTORE(reg) #define CFI_REL_OFFSET(reg,size) + #define CFI_RESTORE_STATE + #define CFI_REMEMBER_STATE #endif // Symbols. diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S new file mode 100644 index 0000000..17662fa --- /dev/null +++ b/runtime/arch/x86/memcmp16_x86.S @@ -0,0 +1,1038 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "asm_support_x86.S" + +#define MEMCMP __memcmp16 + +/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */ + +#ifndef L +# define L(label) .L##label +#endif + +#define CFI_PUSH(REG) \ + CFI_ADJUST_CFA_OFFSET(4); \ + CFI_REL_OFFSET(REG, 0) + +#define CFI_POP(REG) \ + CFI_ADJUST_CFA_OFFSET(-4); \ + CFI_RESTORE(REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define PARMS 4 +#define BLK1 PARMS +#define BLK2 BLK1+4 +#define LEN BLK2+4 +#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE + +DEFINE_FUNCTION MEMCMP + movl LEN(%esp), %ecx + + shl $1, %ecx + jz L(zero) + + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) + + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) + + CFI_POP (%ebx) + + .p2align 4 +L(zero): + xor %eax, %eax + ret + + .p2align 4 +L(48bytesormore): + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + CFI_REMEMBER_STATE + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + + cmp $0, %edx + je L(shr_0) + cmp $2, %edx + je L(shr_2) + cmp $4, %edx + je L(shr_4) + cmp $6, %edx + je L(shr_6) + cmp $8, %edx + je L(shr_8) + 
cmp $10, %edx + je L(shr_10) + cmp $12, %edx + je L(shr_12) + jmp L(shr_14) + + .p2align 4 +L(shr_0): + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 
+L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi 
+ pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + 
CFI_REMEMBER_STATE + .p2align 4 +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, 
%esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + 
palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 
32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx + +L(first16bytes): + add %eax, %esi +L(less16bytes): + test %dl, %dl + jz L(next_four_words) + test $15, %dl + jz L(second_two_words) + test $3, %dl + jz L(second_word) + movzwl -16(%edi), %eax + movzwl -16(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(second_word): + movzwl -14(%edi), %eax + movzwl -14(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(second_two_words): + test $63, %dl + jz L(fourth_word) + movzwl -12(%edi), %eax + movzwl -12(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(fourth_word): + movzwl -10(%edi), %eax + movzwl -10(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(next_four_words): + test $15, %dh + jz L(fourth_two_words) + test $3, %dh + jz L(sixth_word) + movzwl -8(%edi), %eax + movzwl -8(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(sixth_word): + movzwl -6(%edi), %eax + movzwl -6(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(fourth_two_words): + test $63, %dh + jz L(eighth_word) + movzwl -4(%edi), %eax + movzwl -4(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 +L(eighth_word): + movzwl -2(%edi), %eax + movzwl -2(%esi), %ebx + subl %ebx, %eax + RETURN + + + CFI_PUSH (%ebx) + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) + cmp $10, %ecx + je L(10bytes) + cmp $12, %ecx + je L(12bytes) + jmp L(14bytes) + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) + cmp $18, %ecx + je L(18bytes) + cmp $20, %ecx + je L(20bytes) + jmp L(22bytes) + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) + cmp 
$26, %ecx + je L(26bytes) + cmp $28, %ecx + je L(28bytes) + jmp L(30bytes) + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) + cmp $34, %ecx + je L(34bytes) + cmp $36, %ecx + je L(36bytes) + jmp L(38bytes) + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $2, %ecx + je L(2bytes) + cmp $4, %ecx + je L(4bytes) + jmp L(6bytes) + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) + cmp $42, %ecx + je L(42bytes) + cmp $44, %ecx + je L(44bytes) + jmp L(46bytes) + + .p2align 4 +L(46bytes): + movzwl -46(%eax), %ecx + movzwl -46(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(44bytes): + movzwl -44(%eax), %ecx + movzwl -44(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(42bytes): + movzwl -42(%eax), %ecx + movzwl -42(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(40bytes): + movzwl -40(%eax), %ecx + movzwl -40(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(38bytes): + movzwl -38(%eax), %ecx + movzwl -38(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(36bytes): + movzwl -36(%eax), %ecx + movzwl -36(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(34bytes): + movzwl -34(%eax), %ecx + movzwl -34(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(32bytes): + movzwl -32(%eax), %ecx + movzwl -32(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(30bytes): + movzwl -30(%eax), %ecx + movzwl -30(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(28bytes): + movzwl -28(%eax), %ecx + movzwl -28(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(26bytes): + movzwl -26(%eax), %ecx + movzwl -26(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(24bytes): + movzwl -24(%eax), %ecx + movzwl -24(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(22bytes): + movzwl -22(%eax), %ecx + movzwl -22(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(20bytes): + movzwl -20(%eax), %ecx + movzwl -20(%edx), %ebx + subl %ebx, %ecx + jne 
L(memcmp16_exit) +L(18bytes): + movzwl -18(%eax), %ecx + movzwl -18(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(16bytes): + movzwl -16(%eax), %ecx + movzwl -16(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(14bytes): + movzwl -14(%eax), %ecx + movzwl -14(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(12bytes): + movzwl -12(%eax), %ecx + movzwl -12(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(10bytes): + movzwl -10(%eax), %ecx + movzwl -10(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(8bytes): + movzwl -8(%eax), %ecx + movzwl -8(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(6bytes): + movzwl -6(%eax), %ecx + movzwl -6(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(4bytes): + movzwl -4(%eax), %ecx + movzwl -4(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) +L(2bytes): + movzwl -2(%eax), %eax + movzwl -2(%edx), %ebx + subl %ebx, %eax + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(memcmp16_exit): + POP (%ebx) + mov %ecx, %eax + ret +END_FUNCTION MEMCMP |