sse4-memcmp-slm.S revision 15581383153c5da29befb7f5cdc30bc21e9da54b
1/*
2Copyright (c) 2014, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Fallback definitions, used only when the including build has not already
   supplied them.  L(label) builds a label name with a ".L" prefix; each
   cfi_* wrapper expands to the .cfi_* directive of the same name.  */
31#ifndef L
32# define L(label)	.L##label
33#endif
34
35#ifndef cfi_startproc
36# define cfi_startproc	.cfi_startproc
37#endif
38
39#ifndef cfi_endproc
40# define cfi_endproc	.cfi_endproc
41#endif
42
43#ifndef cfi_rel_offset
44# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
45#endif
46
47#ifndef cfi_restore
48# define cfi_restore(reg)	.cfi_restore reg
49#endif
50
51#ifndef cfi_adjust_cfa_offset
52# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
53#endif
54
55#ifndef cfi_remember_state
56# define cfi_remember_state	.cfi_remember_state
57#endif
58
59#ifndef cfi_restore_state
60# define cfi_restore_state	.cfi_restore_state
61#endif
62
/* ENTRY(name) opens a function: it types NAME as a function, makes it
   global, aligns it with .p2align 4, emits the label and starts the CFI
   region.  END(name) closes the CFI region and records the symbol size.  */
63#ifndef ENTRY
64# define ENTRY(name)             \
65	.type name, @function;   \
66	.globl name;             \
67	.p2align 4;              \
68name:                            \
69	cfi_startproc
70#endif
71
72#ifndef END
73# define END(name)               \
74	cfi_endproc;             \
75	.size name, .-name
76#endif
77
/* Name of the symbol defined by this file.  It defaults to memcmp and can
   be predefined to build the same body under another name.  */
78#ifndef MEMCMP
79# define MEMCMP	memcmp
80#endif
81
/* Unwind bookkeeping for a 4-byte push of REG: the CFA moves by 4 and REG
   is recorded as saved at offset 0.  */
82#define CFI_PUSH(REG)	\
83	cfi_adjust_cfa_offset (4);	\
84	cfi_rel_offset (REG, 0)
85
/* Unwind bookkeeping for the matching 4-byte pop of REG.  */
86#define CFI_POP(REG)	\
87	cfi_adjust_cfa_offset (-4);	\
88	cfi_restore (REG)
89
/* pushl/popl paired with the CFI annotations above.  */
90#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
91#define POP(REG)	popl REG; CFI_POP (REG)
92
/* Offsets from %esp, as read at function entry before anything is pushed:
   PARMS is the offset of the first stack argument, BLK1 and BLK2 are the
   two buffer pointers and LEN is the length.  */
93#define PARMS	4
94#define BLK1	PARMS
95#define BLK2	BLK1 + 4
96#define LEN	BLK2 + 4
/* Restore EBX and return.  The trailing CFI_PUSH re-declares EBX as pushed
   so that the unwind info of the code placed after the ret stays correct.  */
97#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
98
99
100#if (defined SHARED || defined __PIC__)
101# define JMPTBL(I, B)	I - B
102
103/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
104   jump table with relative offsets.  INDEX is a register containing the
105   index into the jump table.  SCALE is the scale of INDEX.  */
106
107# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
108/* We first load PC into EBX.  */	\
109	call	__x86.get_pc_thunk.bx;	\
110/* Get the address of the jump table.  */	\
111	addl	$(TABLE - .), %ebx;	\
112/* Get the entry and convert the relative offset to the	\
113   absolute address.  */	\
114	addl	(%ebx,INDEX,SCALE), %ebx;	\
115/* EBX now holds the absolute address of the handler.  Go.  */	\
116	jmp	*%ebx
117#else
118# define JMPTBL(I, B)	I
119
120/* Branch through an entry in a jump table.  TABLE is a jump table with
121   absolute addresses (JMPTBL(I, B) is just I here), so no register is
122   needed.  INDEX is the index register and SCALE is the scale of INDEX.  */
123# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
124	jmp	*TABLE(,INDEX,SCALE)
125#endif
126
127
128/* Warning!
129           wmemcmp has to use SIGNED comparison for elements.
130           memcmp has to use UNSIGNED comparison for elements.
131*/
132
/* MEMCMP entry and length dispatch.
   EAX and EDX receive the two buffer pointers and ECX the length.  XMM0 is
   zeroed for the ptest checks used later.  A length above 64 goes to
   L(64bytesormore) and one below 8 to L(less8bytes); otherwise EAX/EDX are
   advanced to the end of the data and the handler is selected from
   L(table_64bytes) by the length in ECX.  */
133	.section .text.sse4.2,"ax",@progbits
134ENTRY (MEMCMP)
135	movl	BLK1(%esp), %eax
136	movl	BLK2(%esp), %edx
137	movl	LEN(%esp), %ecx
138
139#ifdef USE_AS_WMEMCMP
/* Scale the element count by 4 to get a byte count; zero returns 0.  */
140	shl	$2, %ecx
141	test	%ecx, %ecx
142	jz	L(return0)
143#else
/* Lengths 0 and 1 are handled separately, before EBX is pushed.  */
144	cmp	$1, %ecx
145	jbe	L(less1bytes)
146#endif
147
148	pxor	%xmm0, %xmm0
149	cmp	$64, %ecx
150	ja	L(64bytesormore)
151	cmp	$8, %ecx
152
/* The memcmp L(less8bytes) uses %bl, so EBX is pushed before the branch;
   the push leaves the flags of the cmp above intact for jb.  The wmemcmp
   L(less8bytes) returns without popping, so there the push comes after.  */
153#ifndef USE_AS_WMEMCMP
154	PUSH	(%ebx)
155	jb	L(less8bytes)
156#else
157	jb	L(less8bytes)
158	PUSH	(%ebx)
159#endif
160
/* Point EAX/EDX just past the data; the handlers use negative offsets.  */
161	add	%ecx, %edx
162	add	%ecx, %eax
163	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
164
164#ifndef USE_AS_WMEMCMP
/* memcmp with 2 <= LEN < 8; EBX is already pushed.  Bytes are compared one
   at a time from the start of the buffers, and after each byte from the
   second onward the length in ECX is checked to see whether all bytes have
   been compared.  */
165	.p2align 4
166L(less8bytes):
167	mov	(%eax), %bl
168	cmpb	(%edx), %bl
169	jne	L(nonzero)
170
171	mov	1(%eax), %bl
172	cmpb	1(%edx), %bl
173	jne	L(nonzero)
174
175	cmp	$2, %ecx
176	jz	L(0bytes)
177
178	mov	2(%eax), %bl
179	cmpb	2(%edx), %bl
180	jne	L(nonzero)
181
182	cmp	$3, %ecx
183	jz	L(0bytes)
184
185	mov	3(%eax), %bl
186	cmpb	3(%edx), %bl
187	jne	L(nonzero)
188
189	cmp	$4, %ecx
190	jz	L(0bytes)
191
192	mov	4(%eax), %bl
193	cmpb	4(%edx), %bl
194	jne	L(nonzero)
195
196	cmp	$5, %ecx
197	jz	L(0bytes)
198
199	mov	5(%eax), %bl
200	cmpb	5(%edx), %bl
201	jne	L(nonzero)
202
203	cmp	$6, %ecx
204	jz	L(0bytes)
205
206	mov	6(%eax), %bl
207	cmpb	6(%edx), %bl
208	je	L(0bytes)
209
/* Mismatch.  The flags still come from the failing cmpb, since the pop and
   the mov do not change them.  Return 1 when the byte of the first buffer
   is above the byte of the second (unsigned), otherwise -1.  */
210L(nonzero):
211	POP	(%ebx)
212	mov	$1, %eax
213	ja	L(above)
214	neg	%eax
215L(above):
216	ret
/* Re-declare the push for the unwind info of the code that follows.  */
217	CFI_PUSH (%ebx)
218#endif
220
/* Nothing left to compare and no difference found.  This label is reached
   with EBX pushed: restore it and return 0.  */
221	.p2align 4
222L(0bytes):
223	POP	(%ebx)
224	xor	%eax, %eax
225	ret
226
226#ifdef USE_AS_WMEMCMP
227
228/* wmemcmp, case N == 1: compare the single 4-byte element as a signed
   value.  EBX has not been pushed on this path.  The result is 0 when the
   elements are equal, 1 when the first is greater and -1 otherwise.  */
229
230	.p2align 4
231L(less8bytes):
232	mov	(%eax), %ecx
233	cmp	(%edx), %ecx
234	je	L(return0)
235	mov	$1, %eax
236	jg	L(find_diff_bigger)
237	neg	%eax
238	ret
239
240	.p2align 4
241L(find_diff_bigger):
242	ret
243
/* Also the target for a zero-length wmemcmp.  */
244	.p2align 4
245L(return0):
246	xor	%eax, %eax
247	ret
248#endif
250
250#ifndef USE_AS_WMEMCMP
/* memcmp with LEN <= 1; EBX has not been pushed.  The flags are still those
   of the "cmp $1, %ecx" at entry, so "below" means LEN == 0 and the result
   is 0.  For LEN == 1 the result is the difference of the two bytes, each
   zero-extended to 32 bits.  */
251	.p2align 4
252L(less1bytes):
253	jb	L(0bytesend)
254	movzbl	(%eax), %eax
255	movzbl	(%edx), %edx
256	sub	%edx, %eax
257	ret
258
259	.p2align 4
260L(0bytesend):
261	xor	%eax, %eax
262	ret
263#endif
/* LEN > 64.  EBX is saved and then holds LEN - 64, the number of bytes
   beyond the current 64-byte block; ECX = 64 is the stride.  Each iteration
   XORs four 16-byte chunks of the two buffers.  XMM0 is zero, so ptest
   leaves CF clear exactly when the XOR result is non-zero, i.e. when the
   chunks differ, and jnc leaves the loop for that chunk.  */
265	.p2align 4
266L(64bytesormore):
267	PUSH	(%ebx)
268	mov	%ecx, %ebx
269	mov	$64, %ecx
270	sub	$64, %ebx
271L(64bytesormore_loop):
272	movdqu	(%eax), %xmm1
273	movdqu	(%edx), %xmm2
274	pxor	%xmm1, %xmm2
275	ptest	%xmm2, %xmm0
276	jnc	L(find_16diff)
277
278	movdqu	16(%eax), %xmm1
279	movdqu	16(%edx), %xmm2
280	pxor	%xmm1, %xmm2
281	ptest	%xmm2, %xmm0
282	jnc	L(find_32diff)
283
284	movdqu	32(%eax), %xmm1
285	movdqu	32(%edx), %xmm2
286	pxor	%xmm1, %xmm2
287	ptest	%xmm2, %xmm0
288	jnc	L(find_48diff)
289
290	movdqu	48(%eax), %xmm1
291	movdqu	48(%edx), %xmm2
292	pxor	%xmm1, %xmm2
293	ptest	%xmm2, %xmm0
294	jnc	L(find_64diff)
/* Block matched: step both pointers and loop while at least 64 bytes are
   left (the subtraction does not borrow).  */
295	add	%ecx, %eax
296	add	%ecx, %edx
297	sub	%ecx, %ebx
298	jae	L(64bytesormore_loop)
/* Fewer than 64 bytes remain: ECX = 64 + EBX is that remainder.  Advance
   EAX/EDX past it and dispatch on it through the jump table.  */
299	add	%ebx, %ecx
300	add	%ecx, %edx
301	add	%ecx, %eax
302	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
303
303#ifdef USE_AS_WMEMCMP
304
305/* This label exists only to fill the table_64bytes slots for lengths that
   are not a multiple of 4.  */
306L(unreal_case):
307/* no code here */
308
309#endif
/* Reached from the 64-byte loop with ECX = 64 when the first, second, third
   or fourth 16-byte chunk differs.  The chain of subtractions turns ECX
   into the offset of the end of that chunk (16, 32, 48 or 64), and EAX/EDX
   are then advanced to that end.  Execution continues into L(16bytes).  */
310	.p2align 4
311L(find_16diff):
312	sub	$16, %ecx
313L(find_32diff):
314	sub	$16, %ecx
315L(find_48diff):
316	sub	$16, %ecx
317L(find_64diff):
318	add	%ecx, %edx
319	add	%ecx, %eax
321
/* Tails of 16, 12, 8 and 4 bytes; each label falls through to the next.
   EAX/EDX point just past the data and the bytes are compared one dword at
   a time.  The memcmp variant leaves the two dwords in ECX (first buffer)
   and EBX (second buffer) for L(find_diff); the wmemcmp variant compares
   against memory and L(find_diff) uses the resulting flags.  The
   "mov $0, %eax" sets the return value without changing the flags.  */
321#ifndef USE_AS_WMEMCMP
322	.p2align 4
323L(16bytes):
324	mov	-16(%eax), %ecx
325	mov	-16(%edx), %ebx
326	cmp	%ebx, %ecx
327	jne	L(find_diff)
328L(12bytes):
329	mov	-12(%eax), %ecx
330	mov	-12(%edx), %ebx
331	cmp	%ebx, %ecx
332	jne	L(find_diff)
333L(8bytes):
334	mov	-8(%eax), %ecx
335	mov	-8(%edx), %ebx
336	cmp	%ebx, %ecx
337	jne	L(find_diff)
338L(4bytes):
339	mov	-4(%eax), %ecx
340	mov	-4(%edx), %ebx
341	cmp	%ebx, %ecx
342	mov	$0, %eax
343	jne	L(find_diff)
344	RETURN
345#else
346	.p2align 4
347L(16bytes):
348	mov	-16(%eax), %ecx
349	cmp	-16(%edx), %ecx
350	jne	L(find_diff)
351L(12bytes):
352	mov	-12(%eax), %ecx
353	cmp	-12(%edx), %ecx
354	jne	L(find_diff)
355L(8bytes):
356	mov	-8(%eax), %ecx
357	cmp	-8(%edx), %ecx
358	jne	L(find_diff)
359L(4bytes):
360	mov	-4(%eax), %ecx
361	cmp	-4(%edx), %ecx
362	mov	$0, %eax
363	jne	L(find_diff)
364	RETURN
365#endif
367
367#ifndef USE_AS_WMEMCMP
/* memcmp tails of 49, 33, 17, 13, 9 and 5 bytes; each label falls through
   to the next shorter one.  EAX/EDX point just past the data.  The 49- and
   33-byte steps test one 16-byte chunk each and, when it differs, go to
   L(less16bytes) with EBX holding the (negative) offset of that chunk.
   The shorter steps compare dwords, and the last byte is compared alone.  */
368	.p2align 4
369L(49bytes):
370	movdqu	-49(%eax), %xmm1
371	movdqu	-49(%edx), %xmm2
372	mov	$-49, %ebx
373	pxor	%xmm1, %xmm2
374	ptest	%xmm2, %xmm0
375	jnc	L(less16bytes)
376L(33bytes):
377	movdqu	-33(%eax), %xmm1
378	movdqu	-33(%edx), %xmm2
379	mov	$-33, %ebx
380	pxor	%xmm1, %xmm2
381	ptest	%xmm2, %xmm0
382	jnc	L(less16bytes)
383L(17bytes):
384	mov	-17(%eax), %ecx
385	mov	-17(%edx), %ebx
386	cmp	%ebx, %ecx
387	jne	L(find_diff)
388L(13bytes):
389	mov	-13(%eax), %ecx
390	mov	-13(%edx), %ebx
391	cmp	%ebx, %ecx
392	jne	L(find_diff)
393L(9bytes):
394	mov	-9(%eax), %ecx
395	mov	-9(%edx), %ebx
396	cmp	%ebx, %ecx
397	jne	L(find_diff)
398L(5bytes):
399	mov	-5(%eax), %ecx
400	mov	-5(%edx), %ebx
401	cmp	%ebx, %ecx
402	jne	L(find_diff)
403	movzbl	-1(%eax), %ecx
404	cmp	-1(%edx), %cl
405	mov	$0, %eax
406	jne	L(end)
407	RETURN
409
/* memcmp tails of 50, 34, 18, 14, 10, 6 and 2 bytes, falling through from
   longer to shorter.  The 50- and 34-byte steps test a 16-byte chunk with
   EBX set to its offset for L(less16bytes); the following steps compare
   dwords.  The final two bytes are loaded as one word and compared low
   byte first, then high byte.  */
410	.p2align 4
411L(50bytes):
412	mov	$-50, %ebx
413	movdqu	-50(%eax), %xmm1
414	movdqu	-50(%edx), %xmm2
415	pxor	%xmm1, %xmm2
416	ptest	%xmm2, %xmm0
417	jnc	L(less16bytes)
418L(34bytes):
419	mov	$-34, %ebx
420	movdqu	-34(%eax), %xmm1
421	movdqu	-34(%edx), %xmm2
422	pxor	%xmm1, %xmm2
423	ptest	%xmm2, %xmm0
424	jnc	L(less16bytes)
425L(18bytes):
426	mov	-18(%eax), %ecx
427	mov	-18(%edx), %ebx
428	cmp	%ebx, %ecx
429	jne	L(find_diff)
430L(14bytes):
431	mov	-14(%eax), %ecx
432	mov	-14(%edx), %ebx
433	cmp	%ebx, %ecx
434	jne	L(find_diff)
435L(10bytes):
436	mov	-10(%eax), %ecx
437	mov	-10(%edx), %ebx
438	cmp	%ebx, %ecx
439	jne	L(find_diff)
440L(6bytes):
441	mov	-6(%eax), %ecx
442	mov	-6(%edx), %ebx
443	cmp	%ebx, %ecx
444	jne	L(find_diff)
445L(2bytes):
446	movzwl	-2(%eax), %ecx
447	movzwl	-2(%edx), %ebx
448	cmp	%bl, %cl
449	jne	L(end)
450	cmp	%bh, %ch
451	mov	$0, %eax
452	jne	L(end)
453	RETURN
454
/* memcmp tails of 51, 35, 19, 15, 11, 7, 3 and 1 bytes, falling through
   from longer to shorter.  The 51- and 35-byte steps test a 16-byte chunk
   with EBX set to its offset for L(less16bytes); the following steps
   compare dwords.  L(3bytes) compares the word at offset -3 (low byte
   first, then the whole word) and L(1bytes) compares the last byte.  */
455	.p2align 4
456L(51bytes):
457	mov	$-51, %ebx
458	movdqu	-51(%eax), %xmm1
459	movdqu	-51(%edx), %xmm2
460	pxor	%xmm1, %xmm2
461	ptest	%xmm2, %xmm0
462	jnc	L(less16bytes)
463L(35bytes):
464	mov	$-35, %ebx
465	movdqu	-35(%eax), %xmm1
466	movdqu	-35(%edx), %xmm2
467	pxor	%xmm1, %xmm2
468	ptest	%xmm2, %xmm0
469	jnc	L(less16bytes)
470L(19bytes):
471	movl	-19(%eax), %ecx
472	movl	-19(%edx), %ebx
473	cmp	%ebx, %ecx
474	jne	L(find_diff)
475L(15bytes):
476	movl	-15(%eax), %ecx
477	movl	-15(%edx), %ebx
478	cmp	%ebx, %ecx
479	jne	L(find_diff)
480L(11bytes):
481	movl	-11(%eax), %ecx
482	movl	-11(%edx), %ebx
483	cmp	%ebx, %ecx
484	jne	L(find_diff)
485L(7bytes):
486	movl	-7(%eax), %ecx
487	movl	-7(%edx), %ebx
488	cmp	%ebx, %ecx
489	jne	L(find_diff)
490L(3bytes):
491	movzwl	-3(%eax), %ecx
492	movzwl	-3(%edx), %ebx
493	cmpb	%bl, %cl
494	jne	L(end)
495	cmp	%bx, %cx
496	jne	L(end)
497L(1bytes):
498	movzbl	-1(%eax), %eax
499	cmpb	-1(%edx), %al
500	mov	$0, %eax
501	jne	L(end)
502	RETURN
503#endif
/* Tails of 52, 36 and 20 bytes (both memcmp and wmemcmp), falling through
   from longer to shorter.  Each step tests one 16-byte chunk and goes to
   L(less16bytes) with EBX = offset of that chunk when it differs.  The
   last dword is then compared directly.  */
504	.p2align 4
505L(52bytes):
506	movdqu	-52(%eax), %xmm1
507	movdqu	-52(%edx), %xmm2
508	mov	$-52, %ebx
509	pxor	%xmm1, %xmm2
510	ptest	%xmm2, %xmm0
511	jnc	L(less16bytes)
512L(36bytes):
513	movdqu	-36(%eax), %xmm1
514	movdqu	-36(%edx), %xmm2
515	mov	$-36, %ebx
516	pxor	%xmm1, %xmm2
517	ptest	%xmm2, %xmm0
518	jnc	L(less16bytes)
519L(20bytes):
520	movdqu	-20(%eax), %xmm1
521	movdqu	-20(%edx), %xmm2
522	mov	$-20, %ebx
523	pxor	%xmm1, %xmm2
524	ptest	%xmm2, %xmm0
525	jnc	L(less16bytes)
526	mov	-4(%eax), %ecx
527#ifndef USE_AS_WMEMCMP
528	mov	-4(%edx), %ebx
529	cmp	%ebx, %ecx
530#else
531	cmp	-4(%edx), %ecx
532#endif
533	mov	$0, %eax
534	jne	L(find_diff)
535	RETURN
536
537#ifndef USE_AS_WMEMCMP
/* memcmp tails of 53, 37 and 21 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dword at offset -5 and the last byte are
   compared.  */
538	.p2align 4
539L(53bytes):
540	movdqu	-53(%eax), %xmm1
541	movdqu	-53(%edx), %xmm2
542	mov	$-53, %ebx
543	pxor	%xmm1, %xmm2
544	ptest	%xmm2, %xmm0
545	jnc	L(less16bytes)
546L(37bytes):
547	mov	$-37, %ebx
548	movdqu	-37(%eax), %xmm1
549	movdqu	-37(%edx), %xmm2
550	pxor	%xmm1, %xmm2
551	ptest	%xmm2, %xmm0
552	jnc	L(less16bytes)
553L(21bytes):
554	mov	$-21, %ebx
555	movdqu	-21(%eax), %xmm1
556	movdqu	-21(%edx), %xmm2
557	pxor	%xmm1, %xmm2
558	ptest	%xmm2, %xmm0
559	jnc	L(less16bytes)
560	mov	-5(%eax), %ecx
561	mov	-5(%edx), %ebx
562	cmp	%ebx, %ecx
563	jne	L(find_diff)
564	movzbl	-1(%eax), %ecx
565	cmp	-1(%edx), %cl
566	mov	$0, %eax
567	jne	L(end)
568	RETURN
569
/* memcmp tails of 54, 38 and 22 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dword at offset -6 is compared and the last
   two bytes are compared low byte first, then high byte.  */
570	.p2align 4
571L(54bytes):
572	movdqu	-54(%eax), %xmm1
573	movdqu	-54(%edx), %xmm2
574	mov	$-54, %ebx
575	pxor	%xmm1, %xmm2
576	ptest	%xmm2, %xmm0
577	jnc	L(less16bytes)
578L(38bytes):
579	mov	$-38, %ebx
580	movdqu	-38(%eax), %xmm1
581	movdqu	-38(%edx), %xmm2
582	pxor	%xmm1, %xmm2
583	ptest	%xmm2, %xmm0
584	jnc	L(less16bytes)
585L(22bytes):
586	mov	$-22, %ebx
587	movdqu	-22(%eax), %xmm1
588	movdqu	-22(%edx), %xmm2
589	pxor	%xmm1, %xmm2
590	ptest	%xmm2, %xmm0
591	jnc	L(less16bytes)
592
593	mov	-6(%eax), %ecx
594	mov	-6(%edx), %ebx
595	cmp	%ebx, %ecx
596	jne	L(find_diff)
597	movzwl	-2(%eax), %ecx
598	movzwl	-2(%edx), %ebx
599	cmp	%bl, %cl
600	jne	L(end)
601	cmp	%bh, %ch
602	mov	$0, %eax
603	jne	L(end)
604	RETURN
605
/* memcmp tails of 55, 39 and 23 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dword at offset -7, the word at offset -3 and
   the last byte are compared.  */
606	.p2align 4
607L(55bytes):
608	movdqu	-55(%eax), %xmm1
609	movdqu	-55(%edx), %xmm2
610	mov	$-55, %ebx
611	pxor	%xmm1, %xmm2
612	ptest	%xmm2, %xmm0
613	jnc	L(less16bytes)
614L(39bytes):
615	mov	$-39, %ebx
616	movdqu	-39(%eax), %xmm1
617	movdqu	-39(%edx), %xmm2
618	pxor	%xmm1, %xmm2
619	ptest	%xmm2, %xmm0
620	jnc	L(less16bytes)
621L(23bytes):
622	mov	$-23, %ebx
623	movdqu	-23(%eax), %xmm1
624	movdqu	-23(%edx), %xmm2
625	pxor	%xmm1, %xmm2
626	ptest	%xmm2, %xmm0
627	jnc	L(less16bytes)
628	movl	-7(%eax), %ecx
629	movl	-7(%edx), %ebx
630	cmp	%ebx, %ecx
631	jne	L(find_diff)
632	movzwl	-3(%eax), %ecx
633	movzwl	-3(%edx), %ebx
634	cmpb	%bl, %cl
635	jne	L(end)
636	cmp	%bx, %cx
637	jne	L(end)
638	movzbl	-1(%eax), %eax
639	cmpb	-1(%edx), %al
640	mov	$0, %eax
641	jne	L(end)
642	RETURN
643#endif
/* Tails of 56, 40 and 24 bytes (both memcmp and wmemcmp), falling through
   from longer to shorter.  Each step tests one 16-byte chunk and goes to
   L(less16bytes) with EBX = offset of that chunk when it differs.  The
   last two dwords are then compared directly.  */
644	.p2align 4
645L(56bytes):
646	movdqu	-56(%eax), %xmm1
647	movdqu	-56(%edx), %xmm2
648	mov	$-56, %ebx
649	pxor	%xmm1, %xmm2
650	ptest	%xmm2, %xmm0
651	jnc	L(less16bytes)
652L(40bytes):
653	mov	$-40, %ebx
654	movdqu	-40(%eax), %xmm1
655	movdqu	-40(%edx), %xmm2
656	pxor	%xmm1, %xmm2
657	ptest	%xmm2, %xmm0
658	jnc	L(less16bytes)
659L(24bytes):
660	mov	$-24, %ebx
661	movdqu	-24(%eax), %xmm1
662	movdqu	-24(%edx), %xmm2
663	pxor	%xmm1, %xmm2
664	ptest	%xmm2, %xmm0
665	jnc	L(less16bytes)
666
667	mov	-8(%eax), %ecx
668#ifndef USE_AS_WMEMCMP
669	mov	-8(%edx), %ebx
670	cmp	%ebx, %ecx
671#else
672	cmp	-8(%edx), %ecx
673#endif
674	jne	L(find_diff)
675
676	mov	-4(%eax), %ecx
677#ifndef USE_AS_WMEMCMP
678	mov	-4(%edx), %ebx
679	cmp	%ebx, %ecx
680#else
681	cmp	-4(%edx), %ecx
682#endif
683	mov	$0, %eax
684	jne	L(find_diff)
685	RETURN
686
687#ifndef USE_AS_WMEMCMP
/* memcmp tails of 57, 41 and 25 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dwords at offsets -9 and -5 and the last byte
   are compared.  */
688	.p2align 4
689L(57bytes):
690	movdqu	-57(%eax), %xmm1
691	movdqu	-57(%edx), %xmm2
692	mov	$-57, %ebx
693	pxor	%xmm1, %xmm2
694	ptest	%xmm2, %xmm0
695	jnc	L(less16bytes)
696L(41bytes):
697	mov	$-41, %ebx
698	movdqu	-41(%eax), %xmm1
699	movdqu	-41(%edx), %xmm2
700	pxor	%xmm1, %xmm2
701	ptest	%xmm2, %xmm0
702	jnc	L(less16bytes)
703L(25bytes):
704	mov	$-25, %ebx
705	movdqu	-25(%eax), %xmm1
706	movdqu	-25(%edx), %xmm2
707	pxor	%xmm1, %xmm2
708	ptest	%xmm2, %xmm0
709	jnc	L(less16bytes)
710	mov	-9(%eax), %ecx
711	mov	-9(%edx), %ebx
712	cmp	%ebx, %ecx
713	jne	L(find_diff)
714	mov	-5(%eax), %ecx
715	mov	-5(%edx), %ebx
716	cmp	%ebx, %ecx
717	jne	L(find_diff)
718	movzbl	-1(%eax), %ecx
719	cmp	-1(%edx), %cl
720	mov	$0, %eax
721	jne	L(end)
722	RETURN
723
/* memcmp tails of 58, 42 and 26 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dwords at offsets -10 and -6 are compared and
   the last two bytes are compared low byte first, then high byte.  */
724	.p2align 4
725L(58bytes):
726	movdqu	-58(%eax), %xmm1
727	movdqu	-58(%edx), %xmm2
728	mov	$-58, %ebx
729	pxor	%xmm1, %xmm2
730	ptest	%xmm2, %xmm0
731	jnc	L(less16bytes)
732L(42bytes):
733	mov	$-42, %ebx
734	movdqu	-42(%eax), %xmm1
735	movdqu	-42(%edx), %xmm2
736	pxor	%xmm1, %xmm2
737	ptest	%xmm2, %xmm0
738	jnc	L(less16bytes)
739L(26bytes):
740	mov	$-26, %ebx
741	movdqu	-26(%eax), %xmm1
742	movdqu	-26(%edx), %xmm2
743	pxor	%xmm1, %xmm2
744	ptest	%xmm2, %xmm0
745	jnc	L(less16bytes)
746
747	mov	-10(%eax), %ecx
748	mov	-10(%edx), %ebx
749	cmp	%ebx, %ecx
750	jne	L(find_diff)
751
752	mov	-6(%eax), %ecx
753	mov	-6(%edx), %ebx
754	cmp	%ebx, %ecx
755	jne	L(find_diff)
756
757	movzwl	-2(%eax), %ecx
758	movzwl	-2(%edx), %ebx
759	cmp	%bl, %cl
760	jne	L(end)
761	cmp	%bh, %ch
762	mov	$0, %eax
763	jne	L(end)
764	RETURN
765
/* memcmp tails of 59, 43 and 27 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dwords at offsets -11 and -7, the word at
   offset -3 and the last byte are compared.  */
766	.p2align 4
767L(59bytes):
768	movdqu	-59(%eax), %xmm1
769	movdqu	-59(%edx), %xmm2
770	mov	$-59, %ebx
771	pxor	%xmm1, %xmm2
772	ptest	%xmm2, %xmm0
773	jnc	L(less16bytes)
774L(43bytes):
775	mov	$-43, %ebx
776	movdqu	-43(%eax), %xmm1
777	movdqu	-43(%edx), %xmm2
778	pxor	%xmm1, %xmm2
779	ptest	%xmm2, %xmm0
780	jnc	L(less16bytes)
781L(27bytes):
782	mov	$-27, %ebx
783	movdqu	-27(%eax), %xmm1
784	movdqu	-27(%edx), %xmm2
785	pxor	%xmm1, %xmm2
786	ptest	%xmm2, %xmm0
787	jnc	L(less16bytes)
788	movl	-11(%eax), %ecx
789	movl	-11(%edx), %ebx
790	cmp	%ebx, %ecx
791	jne	L(find_diff)
792	movl	-7(%eax), %ecx
793	movl	-7(%edx), %ebx
794	cmp	%ebx, %ecx
795	jne	L(find_diff)
796	movzwl	-3(%eax), %ecx
797	movzwl	-3(%edx), %ebx
798	cmpb	%bl, %cl
799	jne	L(end)
800	cmp	%bx, %cx
801	jne	L(end)
802	movzbl	-1(%eax), %eax
803	cmpb	-1(%edx), %al
804	mov	$0, %eax
805	jne	L(end)
806	RETURN
807#endif
/* Tails of 60, 44 and 28 bytes (both memcmp and wmemcmp), falling through
   from longer to shorter.  Each step tests one 16-byte chunk and goes to
   L(less16bytes) with EBX = offset of that chunk when it differs.  The
   last three dwords are then compared directly.  */
808	.p2align 4
809L(60bytes):
810	movdqu	-60(%eax), %xmm1
811	movdqu	-60(%edx), %xmm2
812	mov	$-60, %ebx
813	pxor	%xmm1, %xmm2
814	ptest	%xmm2, %xmm0
815	jnc	L(less16bytes)
816L(44bytes):
817	mov	$-44, %ebx
818	movdqu	-44(%eax), %xmm1
819	movdqu	-44(%edx), %xmm2
820	pxor	%xmm1, %xmm2
821	ptest	%xmm2, %xmm0
822	jnc	L(less16bytes)
823L(28bytes):
824	mov	$-28, %ebx
825	movdqu	-28(%eax), %xmm1
826	movdqu	-28(%edx), %xmm2
827	pxor	%xmm1, %xmm2
828	ptest	%xmm2, %xmm0
829	jnc	L(less16bytes)
830
831	mov	-12(%eax), %ecx
832#ifndef USE_AS_WMEMCMP
833	mov	-12(%edx), %ebx
834	cmp	%ebx, %ecx
835#else
836	cmp	-12(%edx), %ecx
837#endif
838	jne	L(find_diff)
839
840	mov	-8(%eax), %ecx
841#ifndef USE_AS_WMEMCMP
842	mov	-8(%edx), %ebx
843	cmp	%ebx, %ecx
844#else
845	cmp	-8(%edx), %ecx
846#endif
847	jne	L(find_diff)
848
849	mov	-4(%eax), %ecx
850#ifndef USE_AS_WMEMCMP
851	mov	-4(%edx), %ebx
852	cmp	%ebx, %ecx
853#else
854	cmp	-4(%edx), %ecx
855#endif
856	mov	$0, %eax
857	jne	L(find_diff)
858	RETURN
859
860#ifndef USE_AS_WMEMCMP
/* memcmp tails of 61, 45 and 29 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dwords at offsets -13, -9 and -5 and the last
   byte are compared.  */
861	.p2align 4
862L(61bytes):
863	movdqu	-61(%eax), %xmm1
864	movdqu	-61(%edx), %xmm2
865	mov	$-61, %ebx
866	pxor	%xmm1, %xmm2
867	ptest	%xmm2, %xmm0
868	jnc	L(less16bytes)
869L(45bytes):
870	mov	$-45, %ebx
871	movdqu	-45(%eax), %xmm1
872	movdqu	-45(%edx), %xmm2
873	pxor	%xmm1, %xmm2
874	ptest	%xmm2, %xmm0
875	jnc	L(less16bytes)
876L(29bytes):
877	mov	$-29, %ebx
878	movdqu	-29(%eax), %xmm1
879	movdqu	-29(%edx), %xmm2
880	pxor	%xmm1, %xmm2
881	ptest	%xmm2, %xmm0
882	jnc	L(less16bytes)
883
884	mov	-13(%eax), %ecx
885	mov	-13(%edx), %ebx
886	cmp	%ebx, %ecx
887	jne	L(find_diff)
888
889	mov	-9(%eax), %ecx
890	mov	-9(%edx), %ebx
891	cmp	%ebx, %ecx
892	jne	L(find_diff)
893
894	mov	-5(%eax), %ecx
895	mov	-5(%edx), %ebx
896	cmp	%ebx, %ecx
897	jne	L(find_diff)
898	movzbl	-1(%eax), %ecx
899	cmp	-1(%edx), %cl
900	mov	$0, %eax
901	jne	L(end)
902	RETURN
903
/* memcmp tails of 62, 46 and 30 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dwords at offsets -14, -10 and -6 are compared
   and the last two bytes are compared low byte first, then high byte.  */
904	.p2align 4
905L(62bytes):
906	movdqu	-62(%eax), %xmm1
907	movdqu	-62(%edx), %xmm2
908	mov	$-62, %ebx
909	pxor	%xmm1, %xmm2
910	ptest	%xmm2, %xmm0
911	jnc	L(less16bytes)
912L(46bytes):
913	mov	$-46, %ebx
914	movdqu	-46(%eax), %xmm1
915	movdqu	-46(%edx), %xmm2
916	pxor	%xmm1, %xmm2
917	ptest	%xmm2, %xmm0
918	jnc	L(less16bytes)
919L(30bytes):
920	mov	$-30, %ebx
921	movdqu	-30(%eax), %xmm1
922	movdqu	-30(%edx), %xmm2
923	pxor	%xmm1, %xmm2
924	ptest	%xmm2, %xmm0
925	jnc	L(less16bytes)
926	mov	-14(%eax), %ecx
927	mov	-14(%edx), %ebx
928	cmp	%ebx, %ecx
929	jne	L(find_diff)
930	mov	-10(%eax), %ecx
931	mov	-10(%edx), %ebx
932	cmp	%ebx, %ecx
933	jne	L(find_diff)
934	mov	-6(%eax), %ecx
935	mov	-6(%edx), %ebx
936	cmp	%ebx, %ecx
937	jne	L(find_diff)
938	movzwl	-2(%eax), %ecx
939	movzwl	-2(%edx), %ebx
940	cmp	%bl, %cl
941	jne	L(end)
942	cmp	%bh, %ch
943	mov	$0, %eax
944	jne	L(end)
945	RETURN
946
/* memcmp tails of 63, 47 and 31 bytes, falling through from longer to
   shorter.  Each step tests one 16-byte chunk (EBX = its offset, for
   L(less16bytes)); then the dwords at offsets -15, -11 and -7, the word at
   offset -3 and the last byte are compared.  */
947	.p2align 4
948L(63bytes):
949	movdqu	-63(%eax), %xmm1
950	movdqu	-63(%edx), %xmm2
951	mov	$-63, %ebx
952	pxor	%xmm1, %xmm2
953	ptest	%xmm2, %xmm0
954	jnc	L(less16bytes)
955L(47bytes):
956	mov	$-47, %ebx
957	movdqu	-47(%eax), %xmm1
958	movdqu	-47(%edx), %xmm2
959	pxor	%xmm1, %xmm2
960	ptest	%xmm2, %xmm0
961	jnc	L(less16bytes)
962L(31bytes):
963	mov	$-31, %ebx
964	movdqu	-31(%eax), %xmm1
965	movdqu	-31(%edx), %xmm2
966	pxor	%xmm1, %xmm2
967	ptest	%xmm2, %xmm0
968	jnc	L(less16bytes)
969
970	movl	-15(%eax), %ecx
971	movl	-15(%edx), %ebx
972	cmp	%ebx, %ecx
973	jne	L(find_diff)
974	movl	-11(%eax), %ecx
975	movl	-11(%edx), %ebx
976	cmp	%ebx, %ecx
977	jne	L(find_diff)
978	movl	-7(%eax), %ecx
979	movl	-7(%edx), %ebx
980	cmp	%ebx, %ecx
981	jne	L(find_diff)
982	movzwl	-3(%eax), %ecx
983	movzwl	-3(%edx), %ebx
984	cmpb	%bl, %cl
985	jne	L(end)
986	cmp	%bx, %cx
987	jne	L(end)
988	movzbl	-1(%eax), %eax
989	cmpb	-1(%edx), %al
990	mov	$0, %eax
991	jne	L(end)
992	RETURN
993#endif
994
/* Tails of 64, 48 and 32 bytes (both memcmp and wmemcmp), falling through
   from longer to shorter.  Each step tests one 16-byte chunk and goes to
   L(less16bytes) with EBX = offset of that chunk when it differs.  The
   last four dwords are then compared directly.  */
995	.p2align 4
996L(64bytes):
997	movdqu	-64(%eax), %xmm1
998	movdqu	-64(%edx), %xmm2
999	mov	$-64, %ebx
1000	pxor	%xmm1, %xmm2
1001	ptest	%xmm2, %xmm0
1002	jnc	L(less16bytes)
1003L(48bytes):
1004	movdqu	-48(%eax), %xmm1
1005	movdqu	-48(%edx), %xmm2
1006	mov	$-48, %ebx
1007	pxor	%xmm1, %xmm2
1008	ptest	%xmm2, %xmm0
1009	jnc	L(less16bytes)
1010L(32bytes):
1011	movdqu	-32(%eax), %xmm1
1012	movdqu	-32(%edx), %xmm2
1013	mov	$-32, %ebx
1014	pxor	%xmm1, %xmm2
1015	ptest	%xmm2, %xmm0
1016	jnc	L(less16bytes)
1017
1018	mov	-16(%eax), %ecx
1019#ifndef USE_AS_WMEMCMP
1020	mov	-16(%edx), %ebx
1021	cmp	%ebx, %ecx
1022#else
1023	cmp	-16(%edx), %ecx
1024#endif
1025	jne	L(find_diff)
1026
1027	mov	-12(%eax), %ecx
1028#ifndef USE_AS_WMEMCMP
1029	mov	-12(%edx), %ebx
1030	cmp	%ebx, %ecx
1031#else
1032	cmp	-12(%edx), %ecx
1033#endif
1034	jne	L(find_diff)
1035
1036	mov	-8(%eax), %ecx
1037#ifndef USE_AS_WMEMCMP
1038	mov	-8(%edx), %ebx
1039	cmp	%ebx, %ecx
1040#else
1041	cmp	-8(%edx), %ecx
1042#endif
1043	jne	L(find_diff)
1044
1045	mov	-4(%eax), %ecx
1046#ifndef USE_AS_WMEMCMP
1047	mov	-4(%edx), %ebx
1048	cmp	%ebx, %ecx
1049#else
1050	cmp	-4(%edx), %ecx
1051#endif
1052	mov	$0, %eax
1053	jne	L(find_diff)
1054	RETURN
1055
/* Entered from the tail handlers when a 16-byte chunk differs, with EBX
   holding the offset of that chunk relative to EAX/EDX.  Both pointers are
   moved to the start of the chunk, which is then compared one dword at a
   time.  The memcmp variant loads both dwords into ECX/EBX for
   L(find_diff); the wmemcmp variant compares against memory and
   L(find_diff) uses the flags.  */
1055#ifndef USE_AS_WMEMCMP
1056	.p2align 4
1057L(less16bytes):
1058	add	%ebx, %eax
1059	add	%ebx, %edx
1060
1061	mov	(%eax), %ecx
1062	mov	(%edx), %ebx
1063	cmp	%ebx, %ecx
1064	jne	L(find_diff)
1065
1066	mov	4(%eax), %ecx
1067	mov	4(%edx), %ebx
1068	cmp	%ebx, %ecx
1069	jne	L(find_diff)
1070
1071	mov	8(%eax), %ecx
1072	mov	8(%edx), %ebx
1073	cmp	%ebx, %ecx
1074	jne	L(find_diff)
1075
1076	mov	12(%eax), %ecx
1077	mov	12(%edx), %ebx
1078	cmp	%ebx, %ecx
1079	mov	$0, %eax
1080	jne	L(find_diff)
1081	RETURN
1082#else
1083	.p2align 4
1084L(less16bytes):
1085	add	%ebx, %eax
1086	add	%ebx, %edx
1087
1088	mov	(%eax), %ecx
1089	cmp	(%edx), %ecx
1090	jne	L(find_diff)
1091
1092	mov	4(%eax), %ecx
1093	cmp	4(%edx), %ecx
1094	jne	L(find_diff)
1095
1096	mov	8(%eax), %ecx
1097	cmp	8(%edx), %ecx
1098	jne	L(find_diff)
1099
1100	mov	12(%eax), %ecx
1101	cmp	12(%edx), %ecx
1102
1103	mov	$0, %eax
1104	jne	L(find_diff)
1105	RETURN
1106#endif
1108
/* Turn a detected difference into the return value; EBX is still pushed.
   memcmp: ECX and EBX hold the differing dwords of the first and second
   buffer.  They are narrowed to the first differing byte in memory order:
   low byte, low word, then after shifting the upper halves down, the third
   byte and the upper word.  L(end) is also entered directly with the flags
   of an unsigned compare of first against second; it returns 1 when the
   first is above and -1 otherwise.
   wmemcmp: the flags come from the dword compare made by the caller; the
   result is 1 when the first element is greater (signed) and -1
   otherwise.  */
1109	.p2align 4
1110L(find_diff):
1111#ifndef USE_AS_WMEMCMP
1112	cmpb	%bl, %cl
1113	jne	L(end)
1114	cmp	%bx, %cx
1115	jne	L(end)
1116	shr	$16,%ecx
1117	shr	$16,%ebx
1118	cmp	%bl, %cl
1119	jne	L(end)
1120	cmp	%bx, %cx
1121L(end):
1122	POP	(%ebx)
1123	mov	$1, %eax
1124	ja	L(bigger)
1125	neg	%eax
1126L(bigger):
1127	ret
1128#else
1129	POP	(%ebx)
1130	mov	$1, %eax
1131	jg	L(bigger)
1132	neg	%eax
1133	ret
1134
1135	.p2align 4
1136L(bigger):
1137	ret
1138#endif
1139END (MEMCMP)
1140
/* Jump table used by BRANCH_TO_JMPTBL_ENTRY: one 4-byte entry per remaining
   length from 0 to 64.  JMPTBL produces offsets relative to the table in
   SHARED/PIC builds and plain label addresses otherwise.  In the wmemcmp
   table only lengths that are multiples of 4 have a handler; every other
   slot points at L(unreal_case).  */
1141	.section .rodata.sse4.2,"a",@progbits
1142	.p2align 2
1143	.type	L(table_64bytes), @object
1144#ifndef USE_AS_WMEMCMP
1145L(table_64bytes):
1146	.int	JMPTBL (L(0bytes), L(table_64bytes))
1147	.int	JMPTBL (L(1bytes), L(table_64bytes))
1148	.int	JMPTBL (L(2bytes), L(table_64bytes))
1149	.int	JMPTBL (L(3bytes), L(table_64bytes))
1150	.int	JMPTBL (L(4bytes), L(table_64bytes))
1151	.int	JMPTBL (L(5bytes), L(table_64bytes))
1152	.int	JMPTBL (L(6bytes), L(table_64bytes))
1153	.int	JMPTBL (L(7bytes), L(table_64bytes))
1154	.int	JMPTBL (L(8bytes), L(table_64bytes))
1155	.int	JMPTBL (L(9bytes), L(table_64bytes))
1156	.int	JMPTBL (L(10bytes), L(table_64bytes))
1157	.int	JMPTBL (L(11bytes), L(table_64bytes))
1158	.int	JMPTBL (L(12bytes), L(table_64bytes))
1159	.int	JMPTBL (L(13bytes), L(table_64bytes))
1160	.int	JMPTBL (L(14bytes), L(table_64bytes))
1161	.int	JMPTBL (L(15bytes), L(table_64bytes))
1162	.int	JMPTBL (L(16bytes), L(table_64bytes))
1163	.int	JMPTBL (L(17bytes), L(table_64bytes))
1164	.int	JMPTBL (L(18bytes), L(table_64bytes))
1165	.int	JMPTBL (L(19bytes), L(table_64bytes))
1166	.int	JMPTBL (L(20bytes), L(table_64bytes))
1167	.int	JMPTBL (L(21bytes), L(table_64bytes))
1168	.int	JMPTBL (L(22bytes), L(table_64bytes))
1169	.int	JMPTBL (L(23bytes), L(table_64bytes))
1170	.int	JMPTBL (L(24bytes), L(table_64bytes))
1171	.int	JMPTBL (L(25bytes), L(table_64bytes))
1172	.int	JMPTBL (L(26bytes), L(table_64bytes))
1173	.int	JMPTBL (L(27bytes), L(table_64bytes))
1174	.int	JMPTBL (L(28bytes), L(table_64bytes))
1175	.int	JMPTBL (L(29bytes), L(table_64bytes))
1176	.int	JMPTBL (L(30bytes), L(table_64bytes))
1177	.int	JMPTBL (L(31bytes), L(table_64bytes))
1178	.int	JMPTBL (L(32bytes), L(table_64bytes))
1179	.int	JMPTBL (L(33bytes), L(table_64bytes))
1180	.int	JMPTBL (L(34bytes), L(table_64bytes))
1181	.int	JMPTBL (L(35bytes), L(table_64bytes))
1182	.int	JMPTBL (L(36bytes), L(table_64bytes))
1183	.int	JMPTBL (L(37bytes), L(table_64bytes))
1184	.int	JMPTBL (L(38bytes), L(table_64bytes))
1185	.int	JMPTBL (L(39bytes), L(table_64bytes))
1186	.int	JMPTBL (L(40bytes), L(table_64bytes))
1187	.int	JMPTBL (L(41bytes), L(table_64bytes))
1188	.int	JMPTBL (L(42bytes), L(table_64bytes))
1189	.int	JMPTBL (L(43bytes), L(table_64bytes))
1190	.int	JMPTBL (L(44bytes), L(table_64bytes))
1191	.int	JMPTBL (L(45bytes), L(table_64bytes))
1192	.int	JMPTBL (L(46bytes), L(table_64bytes))
1193	.int	JMPTBL (L(47bytes), L(table_64bytes))
1194	.int	JMPTBL (L(48bytes), L(table_64bytes))
1195	.int	JMPTBL (L(49bytes), L(table_64bytes))
1196	.int	JMPTBL (L(50bytes), L(table_64bytes))
1197	.int	JMPTBL (L(51bytes), L(table_64bytes))
1198	.int	JMPTBL (L(52bytes), L(table_64bytes))
1199	.int	JMPTBL (L(53bytes), L(table_64bytes))
1200	.int	JMPTBL (L(54bytes), L(table_64bytes))
1201	.int	JMPTBL (L(55bytes), L(table_64bytes))
1202	.int	JMPTBL (L(56bytes), L(table_64bytes))
1203	.int	JMPTBL (L(57bytes), L(table_64bytes))
1204	.int	JMPTBL (L(58bytes), L(table_64bytes))
1205	.int	JMPTBL (L(59bytes), L(table_64bytes))
1206	.int	JMPTBL (L(60bytes), L(table_64bytes))
1207	.int	JMPTBL (L(61bytes), L(table_64bytes))
1208	.int	JMPTBL (L(62bytes), L(table_64bytes))
1209	.int	JMPTBL (L(63bytes), L(table_64bytes))
1210	.int	JMPTBL (L(64bytes), L(table_64bytes))
1211#else
1212L(table_64bytes):
1213	.int	JMPTBL (L(0bytes), L(table_64bytes))
1214	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1215	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1216	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1217	.int	JMPTBL (L(4bytes), L(table_64bytes))
1218	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1219	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1220	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1221	.int	JMPTBL (L(8bytes), L(table_64bytes))
1222	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1223	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1224	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1225	.int	JMPTBL (L(12bytes), L(table_64bytes))
1226	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1227	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1228	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1229	.int	JMPTBL (L(16bytes), L(table_64bytes))
1230	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1231	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1232	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1233	.int	JMPTBL (L(20bytes), L(table_64bytes))
1234	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1235	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1236	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1237	.int	JMPTBL (L(24bytes), L(table_64bytes))
1238	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1239	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1240	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1241	.int	JMPTBL (L(28bytes), L(table_64bytes))
1242	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1243	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1244	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1245	.int	JMPTBL (L(32bytes), L(table_64bytes))
1246	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1247	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1248	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1249	.int	JMPTBL (L(36bytes), L(table_64bytes))
1250	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1251	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1252	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1253	.int	JMPTBL (L(40bytes), L(table_64bytes))
1254	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1255	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1256	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1257	.int	JMPTBL (L(44bytes), L(table_64bytes))
1258	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1259	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1260	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1261	.int	JMPTBL (L(48bytes), L(table_64bytes))
1262	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1263	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1264	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1265	.int	JMPTBL (L(52bytes), L(table_64bytes))
1266	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1267	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1268	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1269	.int	JMPTBL (L(56bytes), L(table_64bytes))
1270	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1271	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1272	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1273	.int	JMPTBL (L(60bytes), L(table_64bytes))
1274	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1275	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1276	.int	JMPTBL (L(unreal_case), L(table_64bytes))
1277	.int	JMPTBL (L(64bytes), L(table_64bytes))
1278#endif
1279