summaryrefslogtreecommitdiffstats
path: root/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
diff options
context:
space:
mode:
Diffstat (limited to 'libc/arch-x86/silvermont/string/sse4-memcmp-slm.S')
-rwxr-xr-xlibc/arch-x86/silvermont/string/sse4-memcmp-slm.S1278
1 files changed, 1278 insertions, 0 deletions
diff --git a/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
new file mode 100755
index 0000000..e5028ff
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
@@ -0,0 +1,1278 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state .cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state .cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 4
+#define BLK1 PARMS
+#define BLK2 BLK1 + 4
+#define LEN BLK2 + 4
+#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ call __x86.get_pc_thunk.bx; \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjuested EDX/ESI. Go. */ \
+ jmp *%ebx
+#else
+# define JMPTBL(I, B) I
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+#ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+#else
+ cmp $1, %ecx
+ jbe L(less1bytes)
+#endif
+
+ pxor %xmm0, %xmm0
+ cmp $64, %ecx
+ ja L(64bytesormore)
+ cmp $8, %ecx
+
+#ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+#else
+ jb L(less8bytes)
+ PUSH (%ebx)
+#endif
+
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %bl
+ cmpb (%edx), %bl
+ jne L(nonzero)
+
+ mov 1(%eax), %bl
+ cmpb 1(%edx), %bl
+ jne L(nonzero)
+
+ cmp $2, %ecx
+ jz L(0bytes)
+
+ mov 2(%eax), %bl
+ cmpb 2(%edx), %bl
+ jne L(nonzero)
+
+ cmp $3, %ecx
+ jz L(0bytes)
+
+ mov 3(%eax), %bl
+ cmpb 3(%edx), %bl
+ jne L(nonzero)
+
+ cmp $4, %ecx
+ jz L(0bytes)
+
+ mov 4(%eax), %bl
+ cmpb 4(%edx), %bl
+ jne L(nonzero)
+
+ cmp $5, %ecx
+ jz L(0bytes)
+
+ mov 5(%eax), %bl
+ cmpb 5(%edx), %bl
+ jne L(nonzero)
+
+ cmp $6, %ecx
+ jz L(0bytes)
+
+ mov 6(%eax), %bl
+ cmpb 6(%edx), %bl
+ je L(0bytes)
+
+L(nonzero):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(above)
+ neg %eax
+L(above):
+ ret
+ CFI_PUSH (%ebx)
+#endif
+
+ .p2align 4
+L(0bytes):
+ POP (%ebx)
+ xor %eax, %eax
+ ret
+
+#ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+#endif
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less1bytes):
+ jb L(0bytesend)
+ movzbl (%eax), %eax
+ movzbl (%edx), %edx
+ sub %edx, %eax
+ ret
+
+ .p2align 4
+L(0bytesend):
+ xor %eax, %eax
+ ret
+#endif
+ .p2align 4
+L(64bytesormore):
+ PUSH (%ebx)
+ mov %ecx, %ebx
+ mov $64, %ecx
+ sub $64, %ebx
+L(64bytesormore_loop):
+ movdqu (%eax), %xmm1
+ movdqu (%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_16diff)
+
+ movdqu 16(%eax), %xmm1
+ movdqu 16(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_32diff)
+
+ movdqu 32(%eax), %xmm1
+ movdqu 32(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_48diff)
+
+ movdqu 48(%eax), %xmm1
+ movdqu 48(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_64diff)
+ add %ecx, %eax
+ add %ecx, %edx
+ sub %ecx, %ebx
+ jae L(64bytesormore_loop)
+ add %ebx, %ecx
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifdef USE_AS_WMEMCMP
+
+/* Label needs only for table_64bytes filling */
+L(unreal_case):
+/* no code here */
+
+#endif
+ .p2align 4
+L(find_16diff):
+ sub $16, %ecx
+L(find_32diff):
+ sub $16, %ecx
+L(find_48diff):
+ sub $16, %ecx
+L(find_64diff):
+ add %ecx, %edx
+ add %ecx, %eax
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#endif
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(49bytes):
+ movdqu -49(%eax), %xmm1
+ movdqu -49(%edx), %xmm2
+ mov $-49, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%eax), %xmm1
+ movdqu -33(%edx), %xmm2
+ mov $-33, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%eax), %ecx
+ mov -17(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(13bytes):
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(9bytes):
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(5bytes):
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(50bytes):
+ mov $-50, %ebx
+ movdqu -50(%eax), %xmm1
+ movdqu -50(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ mov $-34, %ebx
+ movdqu -34(%eax), %xmm1
+ movdqu -34(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%eax), %ecx
+ mov -18(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(14bytes):
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(10bytes):
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(6bytes):
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(51bytes):
+ mov $-51, %ebx
+ movdqu -51(%eax), %xmm1
+ movdqu -51(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ mov $-35, %ebx
+ movdqu -35(%eax), %xmm1
+ movdqu -35(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ movl -19(%eax), %ecx
+ movl -19(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+L(1bytes):
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+ .p2align 4
+L(52bytes):
+ movdqu -52(%eax), %xmm1
+ movdqu -52(%edx), %xmm2
+ mov $-52, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%eax), %xmm1
+ movdqu -36(%edx), %xmm2
+ mov $-36, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%eax), %xmm1
+ movdqu -20(%edx), %xmm2
+ mov $-20, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(53bytes):
+ movdqu -53(%eax), %xmm1
+ movdqu -53(%edx), %xmm2
+ mov $-53, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ mov $-37, %ebx
+ movdqu -37(%eax), %xmm1
+ movdqu -37(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ mov $-21, %ebx
+ movdqu -21(%eax), %xmm1
+ movdqu -21(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(54bytes):
+ movdqu -54(%eax), %xmm1
+ movdqu -54(%edx), %xmm2
+ mov $-54, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ mov $-38, %ebx
+ movdqu -38(%eax), %xmm1
+ movdqu -38(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ mov $-22, %ebx
+ movdqu -22(%eax), %xmm1
+ movdqu -22(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(55bytes):
+ movdqu -55(%eax), %xmm1
+ movdqu -55(%edx), %xmm2
+ mov $-55, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ mov $-39, %ebx
+ movdqu -39(%eax), %xmm1
+ movdqu -39(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ mov $-23, %ebx
+ movdqu -23(%eax), %xmm1
+ movdqu -23(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+ .p2align 4
+L(56bytes):
+ movdqu -56(%eax), %xmm1
+ movdqu -56(%edx), %xmm2
+ mov $-56, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ mov $-40, %ebx
+ movdqu -40(%eax), %xmm1
+ movdqu -40(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ mov $-24, %ebx
+ movdqu -24(%eax), %xmm1
+ movdqu -24(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -8(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(57bytes):
+ movdqu -57(%eax), %xmm1
+ movdqu -57(%edx), %xmm2
+ mov $-57, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ mov $-41, %ebx
+ movdqu -41(%eax), %xmm1
+ movdqu -41(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ mov $-25, %ebx
+ movdqu -25(%eax), %xmm1
+ movdqu -25(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(58bytes):
+ movdqu -58(%eax), %xmm1
+ movdqu -58(%edx), %xmm2
+ mov $-58, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ mov $-42, %ebx
+ movdqu -42(%eax), %xmm1
+ movdqu -42(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ mov $-26, %ebx
+ movdqu -26(%eax), %xmm1
+ movdqu -26(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(59bytes):
+ movdqu -59(%eax), %xmm1
+ movdqu -59(%edx), %xmm2
+ mov $-59, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ mov $-43, %ebx
+ movdqu -43(%eax), %xmm1
+ movdqu -43(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ mov $-27, %ebx
+ movdqu -27(%eax), %xmm1
+ movdqu -27(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+ .p2align 4
+L(60bytes):
+ movdqu -60(%eax), %xmm1
+ movdqu -60(%edx), %xmm2
+ mov $-60, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ mov $-44, %ebx
+ movdqu -44(%eax), %xmm1
+ movdqu -44(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ mov $-28, %ebx
+ movdqu -28(%eax), %xmm1
+ movdqu -28(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -12(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -8(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(61bytes):
+ movdqu -61(%eax), %xmm1
+ movdqu -61(%edx), %xmm2
+ mov $-61, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ mov $-45, %ebx
+ movdqu -45(%eax), %xmm1
+ movdqu -45(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ mov $-29, %ebx
+ movdqu -29(%eax), %xmm1
+ movdqu -29(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(62bytes):
+ movdqu -62(%eax), %xmm1
+ movdqu -62(%edx), %xmm2
+ mov $-62, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ mov $-46, %ebx
+ movdqu -46(%eax), %xmm1
+ movdqu -46(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ mov $-30, %ebx
+ movdqu -30(%eax), %xmm1
+ movdqu -30(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(63bytes):
+ movdqu -63(%eax), %xmm1
+ movdqu -63(%edx), %xmm2
+ mov $-63, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ mov $-47, %ebx
+ movdqu -47(%eax), %xmm1
+ movdqu -47(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ mov $-31, %ebx
+ movdqu -31(%eax), %xmm1
+ movdqu -31(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+
+ .p2align 4
+L(64bytes):
+ movdqu -64(%eax), %xmm1
+ movdqu -64(%edx), %xmm2
+ mov $-64, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%eax), %xmm1
+ movdqu -48(%edx), %xmm2
+ mov $-48, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%eax), %xmm1
+ movdqu -32(%edx), %xmm2
+ mov $-32, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -16(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -12(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -8(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ mov (%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ mov 4(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ mov 8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ mov 12(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#endif
+
+ .p2align 4
+L(find_diff):
+#ifndef USE_AS_WMEMCMP
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ shr $16,%ecx
+ shr $16,%ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+L(end):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(bigger)
+ neg %eax
+L(bigger):
+ ret
+#else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+#endif
+END (MEMCMP)
+
+ .section .rodata.sse4.2,"a",@progbits
+ .p2align 2
+ .type L(table_64bytes), @object
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+#else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+#endif