1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Symbol name of the routine defined in this file.  The #ifndef lets a build
   predefine MEMCMP to assemble the same code under a different name. */
31#ifndef MEMCMP
32# define MEMCMP		ssse3_memcmp3_new
33#endif
34
/* L(label) token-pastes a ".L" prefix onto the label name, so every label
   written as L(x) in this file is emitted as ".Lx". */
35#ifndef L
36# define L(label)	.L##label
37#endif
38
/* ALIGN(n) expands to ".p2align n": pad to a 2^n-byte boundary. */
39#ifndef ALIGN
40# define ALIGN(n)	.p2align n
41#endif
42
/* Thin wrappers around the assembler's .cfi_* call-frame-information
   directives.  Each one is defined only if the including environment has not
   already provided a macro of the same name. */
43#ifndef cfi_startproc
44# define cfi_startproc			.cfi_startproc
45#endif
46
47#ifndef cfi_endproc
48# define cfi_endproc			.cfi_endproc
49#endif
50
/* Record that REG is saved at offset OFF from the current CFA register. */
51#ifndef cfi_rel_offset
52# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
53#endif
54
/* Mark REG as holding its entry value again. */
55#ifndef cfi_restore
56# define cfi_restore(reg)		.cfi_restore reg
57#endif
58
/* Add OFF to the CFA offset (used after each push/pop below). */
59#ifndef cfi_adjust_cfa_offset
60# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
61#endif
62
/* Save / reinstate the complete set of unwind rules; used around the many
   early-exit paths in the routine body. */
63#ifndef cfi_remember_state
64# define cfi_remember_state		.cfi_remember_state
65#endif
66
67#ifndef cfi_restore_state
68# define cfi_restore_state		.cfi_restore_state
69#endif
70
/* ENTRY(name): open a function.  Marks NAME as a function symbol, exports it
   with .globl, aligns it with ".p2align 4" (16 bytes), emits the label and
   starts a CFI region. */
71#ifndef ENTRY
72# define ENTRY(name)			\
73	.type name,  @function; 	\
74	.globl name;			\
75	.p2align 4;			\
76name:					\
77	cfi_startproc
78#endif
79
/* END(name): close the CFI region opened by ENTRY and set the symbol size to
   the distance from NAME to the current location. */
80#ifndef END
81# define END(name)			\
82	cfi_endproc;			\
83	.size name, .-name
84#endif
85
/* CFI bookkeeping for a 4-byte push of REG: the CFA moves 4 bytes further
   from the stack pointer and REG is recorded as saved at offset 0. */
86#define CFI_PUSH(REG)						\
87  cfi_adjust_cfa_offset (4);					\
88  cfi_rel_offset (REG, 0)
89
/* CFI bookkeeping for the matching 4-byte pop: undo the CFA adjustment and
   mark REG as restored. */
90#define CFI_POP(REG)						\
91  cfi_adjust_cfa_offset (-4);					\
92  cfi_restore (REG)
93
/* pushl / popl combined with the unwind annotations above, so the frame
   description stays in step with the real stack. */
94#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
95#define POP(REG)	popl REG; CFI_POP (REG)
96
/* Stack offsets of the three arguments relative to %esp.  They are used at
   function entry, before any PUSH, so offset 4 is the first slot above the
   return address: BLK1 = 4, BLK2 = 8, LEN = 12. */
97#define PARMS		4
98#define BLK1		PARMS
99#define BLK2		BLK1+4
100#define LEN		BLK2+4
/* RETURN_END: epilogue for paths that have pushed %ebx, %esi and %edi (in
   that order); pops them in reverse and returns.
   RETURN: the same epilogue, then re-establishes and re-saves the remembered
   CFI state, because the code that follows in the file is again executed
   with all three registers still on the stack. */
101#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
102#define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
103
/*
 * MEMCMP entry point (ia32, all arguments on the stack).
 *   BLK1(%esp), BLK2(%esp) = the two buffers, LEN(%esp) = byte count.
 * Loads %eax = blk1, %edx = blk2, %ecx = len and dispatches on len:
 *   len >= 48 -> L(48bytesormore)
 *   len <= 1  -> L(less1bytes)
 *   otherwise -> L(less48bytes), entered with %ebx saved and both pointers
 *                advanced by len.  That handler is not part of this excerpt.
 */
104	.section .text.ssse3,"ax",@progbits
105ENTRY (MEMCMP)
106	movl	LEN(%esp), %ecx
107	movl	BLK1(%esp), %eax
108	cmp	$48, %ecx
/* movl does not modify EFLAGS, so the jae below still tests "len >= 48". */
109	movl	BLK2(%esp), %edx
110	jae	L(48bytesormore)
111	cmp	$1, %ecx
112	jbe	L(less1bytes)
113	PUSH (%ebx)
114	add	%ecx, %edx
115	add	%ecx, %eax
116	jmp	L(less48bytes)
117
/* Unwind info only: the code below is reached through the jbe that is taken
   before PUSH (%ebx), so cancel that push's CFI adjustment here. */
118	CFI_POP (%ebx)
119	ALIGN (4)
/*
 * L(less1bytes): len is 0 or 1.  The flags of "cmp $1, %ecx" at the entry
 * are still live: "below" means len == 0, which returns 0.
 * For len == 1 the single bytes are compared as unsigned values and the
 * result is 0, 1 (blk1 byte above blk2 byte) or -1.
 * In: %eax = blk1, %edx = blk2.  Nothing is on the stack besides the return
 * address, hence the plain ret.
 */
120L(less1bytes):
121	jb	L(zero)
122	movb	(%eax), %cl
123	cmp	(%edx), %cl
124	je	L(zero)
/* mov leaves EFLAGS alone; ja still refers to the byte compare above. */
125	mov	$1, %eax
126	ja	L(1bytesend)
127	neg	%eax
128L(1bytesend):
129	ret
130
/*
 * L(zero): return 0.  Reached from L(less1bytes) when len == 0 or when the
 * single compared byte is equal; nothing but the return address is on the
 * stack on those paths.
 * xor is the usual zeroing idiom (shorter encoding than "mov $0").  It does
 * overwrite EFLAGS, which is harmless because ret follows immediately.
 */
131	ALIGN (4)
132L(zero):
133	xor	%eax, %eax
134	ret
135
/*
 * L(48bytesormore): len >= 48.
 * In: %eax = blk1, %edx = blk2, %ecx = len.
 *
 * 1. Save %ebx, %esi, %edi and remember that CFI state for the exit paths.
 * 2. Compare the first 16 bytes with unaligned loads.  pmovmskb yields one
 *    bit per equal byte, so "mask - 0xffff" is zero only when all 16 bytes
 *    match; otherwise go to L(less16bytes) with %edi/%esi already 16 bytes
 *    past the compared chunk.
 * 3. Round the blk1 cursor %edi down to a 16-byte boundary, move %esi back by
 *    the same amount and add that amount to %ecx.  From here on %ecx equals
 *    the number of bytes still to compare from %edi, plus 16.
 * 4. Take N = %esi & 15, round %esi down as well, and branch to L(shr_N),
 *    which rebuilds the blk2 data with palignr $N.  %edx carries N.
 */
136	ALIGN (4)
137L(48bytesormore):
138	PUSH (%ebx)
139	PUSH (%esi)
140	PUSH (%edi)
141	cfi_remember_state
142	movdqu    (%eax), %xmm3
143	movdqu    (%edx), %xmm0
144	movl	%eax, %edi
145	movl	%edx, %esi
146	pcmpeqb   %xmm0, %xmm3
147	pmovmskb  %xmm3, %edx
148	lea	16(%edi), %edi
149
150	sub      $0xffff, %edx
/* lea does not modify EFLAGS: jnz still tests the sub above. */
151	lea	16(%esi), %esi
152	jnz	  L(less16bytes)
/* Align %edi downwards; shift %esi and the count by the same distance. */
153	mov	%edi, %edx
154	and	$0xf, %edx
155	xor	%edx, %edi
156	sub	%edx, %esi
157	add	%edx, %ecx
/* N = misalignment of blk2 relative to the now aligned blk1 cursor. */
158	mov	%esi, %edx
159	and	$0xf, %edx
160	jz	L(shr_0)
161	xor	%edx, %esi
162
/* %edx is in 1..15 here (zero was handled by the jz above and the xor only
   wrote %esi), so no test for 0 is needed in the chain below. */
163	cmp	$8, %edx
164	jae	L(next_unaligned_table)
167	cmp	$1, %edx
168	je	L(shr_1)
169	cmp	$2, %edx
170	je	L(shr_2)
171	cmp	$3, %edx
172	je	L(shr_3)
173	cmp	$4, %edx
174	je	L(shr_4)
175	cmp	$5, %edx
176	je	L(shr_5)
177	cmp	$6, %edx
178	je	L(shr_6)
179	jmp	L(shr_7)
180
181	ALIGN (4)
/* Second half of the dispatch chain: N in 8..15. */
182L(next_unaligned_table):
183	cmp	$8, %edx
184	je	L(shr_8)
185	cmp	$9, %edx
186	je	L(shr_9)
187	cmp	$10, %edx
188	je	L(shr_10)
189	cmp	$11, %edx
190	je	L(shr_11)
191	cmp	$12, %edx
192	je	L(shr_12)
193	cmp	$13, %edx
194	je	L(shr_13)
195	cmp	$14, %edx
196	je	L(shr_14)
197	jmp	L(shr_15)
198
/*
 * L(shr_0): both cursors are 16-byte aligned, so blk2 can be loaded directly.
 *   %edi = blk1 cursor, %esi = blk2 cursor,
 *   %ecx = bytes left to compare from %edi, plus 16.
 * %eax is cleared because L(exit) adds it to %esi as the shift amount.
 * Fewer than 64 bytes left: one 32-byte compare here; otherwise
 * L(shr_0_gobble) loops.
 */
199	ALIGN (4)
200L(shr_0):
201	cmp	$80, %ecx
202	jae	L(shr_0_gobble)
203	lea	-48(%ecx), %ecx
204	xor	%eax, %eax
205	movaps	(%esi), %xmm1
206	pcmpeqb	(%edi), %xmm1
207	movaps	16(%esi), %xmm2
208	pcmpeqb	16(%edi), %xmm2
209	pand	%xmm1, %xmm2
210	pmovmskb %xmm2, %edx
211	add	$32, %edi
212	add	$32, %esi
213	sub	$0xffff, %edx
214	jnz	L(exit)
215
/* All 32 bytes equal: advance both cursors by the remaining count %ecx and
   finish in L(less48bytes), with only %ebx left on the stack. */
216	lea	(%ecx, %edi,1), %eax
217	lea	(%ecx, %esi,1), %edx
218	POP (%edi)
219	POP (%esi)
220	jmp	L(less48bytes)
221
222	cfi_restore_state
223	cfi_remember_state
224	ALIGN (4)
/* L(shr_0_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
225L(shr_0_gobble):
226	lea	-48(%ecx), %ecx
227	movdqa	(%esi), %xmm0
228	xor	%eax, %eax
229	pcmpeqb	(%edi), %xmm0
230	sub	$32, %ecx
231	movdqa	16(%esi), %xmm2
232	pcmpeqb	16(%edi), %xmm2
233L(shr_0_gobble_loop):
234	pand	%xmm0, %xmm2
235	sub	$32, %ecx
236	pmovmskb %xmm2, %edx
237	movdqa	%xmm0, %xmm1
238	movdqa	32(%esi), %xmm0
239	movdqa	48(%esi), %xmm2
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
240	sbb	$0xffff, %edx
241	pcmpeqb	32(%edi), %xmm0
242	pcmpeqb	48(%edi), %xmm2
243	lea	32(%edi), %edi
244	lea	32(%esi), %esi
245	jz	L(shr_0_gobble_loop)
246
247	pand	%xmm0, %xmm2
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
248	cmp	$0, %ecx
249	jge	L(shr_0_gobble_loop_next)
250	inc	%edx
251	add	$32, %ecx
252L(shr_0_gobble_loop_next):
253	test	%edx, %edx
254	jnz	L(exit)
255
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm2. */
256	pmovmskb %xmm2, %edx
257	movdqa	%xmm0, %xmm1
258	lea	32(%edi), %edi
259	lea	32(%esi), %esi
260	sub	$0xffff, %edx
261	jnz	L(exit)
262	lea	(%ecx, %edi,1), %eax
263	lea	(%ecx, %esi,1), %edx
264	POP (%edi)
265	POP (%esi)
266	jmp	L(less48bytes)
267
/*
 * L(shr_1): blk2 is 1 byte past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 1 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $1" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_1_gobble) loops.
 */
268	cfi_restore_state
269	cfi_remember_state
270	ALIGN (4)
271L(shr_1):
272	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
273	lea	-48(%ecx), %ecx
274	mov	%edx, %eax
275	jae	L(shr_1_gobble)
276
277	movdqa	16(%esi), %xmm1
278	movdqa	%xmm1, %xmm2
279	palignr	$1,(%esi), %xmm1
280	pcmpeqb	(%edi), %xmm1
281
282	movdqa	32(%esi), %xmm3
283	palignr	$1,%xmm2, %xmm3
284	pcmpeqb	16(%edi), %xmm3
285
286	pand	%xmm1, %xmm3
287	pmovmskb %xmm3, %edx
288	lea	32(%edi), %edi
289	lea	32(%esi), %esi
290	sub	$0xffff, %edx
291	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
292	lea	(%ecx, %edi,1), %eax
293	lea	1(%ecx, %esi,1), %edx
294	POP (%edi)
295	POP (%esi)
296	jmp	L(less48bytes)
297
298	cfi_restore_state
299	cfi_remember_state
300	ALIGN (4)
/* L(shr_1_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
301L(shr_1_gobble):
302	sub	$32, %ecx
303	movdqa	16(%esi), %xmm0
304	palignr	$1,(%esi), %xmm0
305	pcmpeqb	(%edi), %xmm0
306
307	movdqa	32(%esi), %xmm3
308	palignr	$1,16(%esi), %xmm3
309	pcmpeqb	16(%edi), %xmm3
310
311L(shr_1_gobble_loop):
312	pand	%xmm0, %xmm3
313	sub	$32, %ecx
314	pmovmskb %xmm3, %edx
315	movdqa	%xmm0, %xmm1
316
317	movdqa	64(%esi), %xmm3
318	palignr	$1,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
319	sbb	$0xffff, %edx
320	movdqa	48(%esi), %xmm0
321	palignr	$1,32(%esi), %xmm0
322	pcmpeqb	32(%edi), %xmm0
323	lea	32(%esi), %esi
324	pcmpeqb	48(%edi), %xmm3
325
326	lea	32(%edi), %edi
327	jz	L(shr_1_gobble_loop)
328	pand	%xmm0, %xmm3
329
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
330	cmp	$0, %ecx
331	jge	L(shr_1_gobble_next)
332	inc	%edx
333	add	$32, %ecx
334L(shr_1_gobble_next):
335	test	%edx, %edx
336	jnz	L(exit)
337
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
338	pmovmskb %xmm3, %edx
339	movdqa	%xmm0, %xmm1
340	lea	32(%edi), %edi
341	lea	32(%esi), %esi
342	sub	$0xffff, %edx
343	jnz	L(exit)
344
345	lea	(%ecx, %edi,1), %eax
346	lea	1(%ecx, %esi,1), %edx
347	POP (%edi)
348	POP (%esi)
349	jmp	L(less48bytes)
350
/*
 * L(shr_2): blk2 is 2 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 2 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $2" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_2_gobble) loops.
 */
351	cfi_restore_state
352	cfi_remember_state
353	ALIGN (4)
354L(shr_2):
355	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
356	lea	-48(%ecx), %ecx
357	mov	%edx, %eax
358	jae	L(shr_2_gobble)
359
360	movdqa	16(%esi), %xmm1
361	movdqa	%xmm1, %xmm2
362	palignr	$2,(%esi), %xmm1
363	pcmpeqb	(%edi), %xmm1
364
365	movdqa	32(%esi), %xmm3
366	palignr	$2,%xmm2, %xmm3
367	pcmpeqb	16(%edi), %xmm3
368
369	pand	%xmm1, %xmm3
370	pmovmskb %xmm3, %edx
371	lea	32(%edi), %edi
372	lea	32(%esi), %esi
373	sub	$0xffff, %edx
374	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
375	lea	(%ecx, %edi,1), %eax
376	lea	2(%ecx, %esi,1), %edx
377	POP (%edi)
378	POP (%esi)
379	jmp	L(less48bytes)
380
381	cfi_restore_state
382	cfi_remember_state
383	ALIGN (4)
/* L(shr_2_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
384L(shr_2_gobble):
385	sub	$32, %ecx
386	movdqa	16(%esi), %xmm0
387	palignr	$2,(%esi), %xmm0
388	pcmpeqb	(%edi), %xmm0
389
390	movdqa	32(%esi), %xmm3
391	palignr	$2,16(%esi), %xmm3
392	pcmpeqb	16(%edi), %xmm3
393
394L(shr_2_gobble_loop):
395	pand	%xmm0, %xmm3
396	sub	$32, %ecx
397	pmovmskb %xmm3, %edx
398	movdqa	%xmm0, %xmm1
399
400	movdqa	64(%esi), %xmm3
401	palignr	$2,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
402	sbb	$0xffff, %edx
403	movdqa	48(%esi), %xmm0
404	palignr	$2,32(%esi), %xmm0
405	pcmpeqb	32(%edi), %xmm0
406	lea	32(%esi), %esi
407	pcmpeqb	48(%edi), %xmm3
408
409	lea	32(%edi), %edi
410	jz	L(shr_2_gobble_loop)
411	pand	%xmm0, %xmm3
412
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
413	cmp	$0, %ecx
414	jge	L(shr_2_gobble_next)
415	inc	%edx
416	add	$32, %ecx
417L(shr_2_gobble_next):
418	test	%edx, %edx
419	jnz	L(exit)
420
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
421	pmovmskb %xmm3, %edx
422	movdqa	%xmm0, %xmm1
423	lea	32(%edi), %edi
424	lea	32(%esi), %esi
425	sub	$0xffff, %edx
426	jnz	L(exit)
427
428	lea	(%ecx, %edi,1), %eax
429	lea	2(%ecx, %esi,1), %edx
430	POP (%edi)
431	POP (%esi)
432	jmp	L(less48bytes)
433
/*
 * L(shr_3): blk2 is 3 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 3 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $3" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_3_gobble) loops.
 */
434	cfi_restore_state
435	cfi_remember_state
436	ALIGN (4)
437L(shr_3):
438	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
439	lea	-48(%ecx), %ecx
440	mov	%edx, %eax
441	jae	L(shr_3_gobble)
442
443	movdqa	16(%esi), %xmm1
444	movdqa	%xmm1, %xmm2
445	palignr	$3,(%esi), %xmm1
446	pcmpeqb	(%edi), %xmm1
447
448	movdqa	32(%esi), %xmm3
449	palignr	$3,%xmm2, %xmm3
450	pcmpeqb	16(%edi), %xmm3
451
452	pand	%xmm1, %xmm3
453	pmovmskb %xmm3, %edx
454	lea	32(%edi), %edi
455	lea	32(%esi), %esi
456	sub	$0xffff, %edx
457	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
458	lea	(%ecx, %edi,1), %eax
459	lea	3(%ecx, %esi,1), %edx
460	POP (%edi)
461	POP (%esi)
462	jmp	L(less48bytes)
463
464	cfi_restore_state
465	cfi_remember_state
466	ALIGN (4)
/* L(shr_3_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
467L(shr_3_gobble):
468	sub	$32, %ecx
469	movdqa	16(%esi), %xmm0
470	palignr	$3,(%esi), %xmm0
471	pcmpeqb	(%edi), %xmm0
472
473	movdqa	32(%esi), %xmm3
474	palignr	$3,16(%esi), %xmm3
475	pcmpeqb	16(%edi), %xmm3
476
477L(shr_3_gobble_loop):
478	pand	%xmm0, %xmm3
479	sub	$32, %ecx
480	pmovmskb %xmm3, %edx
481	movdqa	%xmm0, %xmm1
482
483	movdqa	64(%esi), %xmm3
484	palignr	$3,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
485	sbb	$0xffff, %edx
486	movdqa	48(%esi), %xmm0
487	palignr	$3,32(%esi), %xmm0
488	pcmpeqb	32(%edi), %xmm0
489	lea	32(%esi), %esi
490	pcmpeqb	48(%edi), %xmm3
491
492	lea	32(%edi), %edi
493	jz	L(shr_3_gobble_loop)
494	pand	%xmm0, %xmm3
495
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
496	cmp	$0, %ecx
497	jge	L(shr_3_gobble_next)
498	inc	%edx
499	add	$32, %ecx
500L(shr_3_gobble_next):
501	test	%edx, %edx
502	jnz	L(exit)
503
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
504	pmovmskb %xmm3, %edx
505	movdqa	%xmm0, %xmm1
506	lea	32(%edi), %edi
507	lea	32(%esi), %esi
508	sub	$0xffff, %edx
509	jnz	L(exit)
510
511	lea	(%ecx, %edi,1), %eax
512	lea	3(%ecx, %esi,1), %edx
513	POP (%edi)
514	POP (%esi)
515	jmp	L(less48bytes)
516
/*
 * L(shr_4): blk2 is 4 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 4 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $4" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_4_gobble) loops.
 */
517	cfi_restore_state
518	cfi_remember_state
519	ALIGN (4)
520L(shr_4):
521	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
522	lea	-48(%ecx), %ecx
523	mov	%edx, %eax
524	jae	L(shr_4_gobble)
525
526	movdqa	16(%esi), %xmm1
527	movdqa	%xmm1, %xmm2
528	palignr	$4,(%esi), %xmm1
529	pcmpeqb	(%edi), %xmm1
530
531	movdqa	32(%esi), %xmm3
532	palignr	$4,%xmm2, %xmm3
533	pcmpeqb	16(%edi), %xmm3
534
535	pand	%xmm1, %xmm3
536	pmovmskb %xmm3, %edx
537	lea	32(%edi), %edi
538	lea	32(%esi), %esi
539	sub	$0xffff, %edx
540	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
541	lea	(%ecx, %edi,1), %eax
542	lea	4(%ecx, %esi,1), %edx
543	POP (%edi)
544	POP (%esi)
545	jmp	L(less48bytes)
546
547	cfi_restore_state
548	cfi_remember_state
549	ALIGN (4)
/* L(shr_4_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
550L(shr_4_gobble):
551	sub	$32, %ecx
552	movdqa	16(%esi), %xmm0
553	palignr	$4,(%esi), %xmm0
554	pcmpeqb	(%edi), %xmm0
555
556	movdqa	32(%esi), %xmm3
557	palignr	$4,16(%esi), %xmm3
558	pcmpeqb	16(%edi), %xmm3
559
560L(shr_4_gobble_loop):
561	pand	%xmm0, %xmm3
562	sub	$32, %ecx
563	pmovmskb %xmm3, %edx
564	movdqa	%xmm0, %xmm1
565
566	movdqa	64(%esi), %xmm3
567	palignr	$4,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
568	sbb	$0xffff, %edx
569	movdqa	48(%esi), %xmm0
570	palignr	$4,32(%esi), %xmm0
571	pcmpeqb	32(%edi), %xmm0
572	lea	32(%esi), %esi
573	pcmpeqb	48(%edi), %xmm3
574
575	lea	32(%edi), %edi
576	jz	L(shr_4_gobble_loop)
577	pand	%xmm0, %xmm3
578
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
579	cmp	$0, %ecx
580	jge	L(shr_4_gobble_next)
581	inc	%edx
582	add	$32, %ecx
583L(shr_4_gobble_next):
584	test	%edx, %edx
585	jnz	L(exit)
586
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
587	pmovmskb %xmm3, %edx
588	movdqa	%xmm0, %xmm1
589	lea	32(%edi), %edi
590	lea	32(%esi), %esi
591	sub	$0xffff, %edx
592	jnz	L(exit)
593
594	lea	(%ecx, %edi,1), %eax
595	lea	4(%ecx, %esi,1), %edx
596	POP (%edi)
597	POP (%esi)
598	jmp	L(less48bytes)
599
/*
 * L(shr_5): blk2 is 5 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 5 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $5" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_5_gobble) loops.
 */
600	cfi_restore_state
601	cfi_remember_state
602	ALIGN (4)
603L(shr_5):
604	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
605	lea	-48(%ecx), %ecx
606	mov	%edx, %eax
607	jae	L(shr_5_gobble)
608
609	movdqa	16(%esi), %xmm1
610	movdqa	%xmm1, %xmm2
611	palignr	$5,(%esi), %xmm1
612	pcmpeqb	(%edi), %xmm1
613
614	movdqa	32(%esi), %xmm3
615	palignr	$5,%xmm2, %xmm3
616	pcmpeqb	16(%edi), %xmm3
617
618	pand	%xmm1, %xmm3
619	pmovmskb %xmm3, %edx
620	lea	32(%edi), %edi
621	lea	32(%esi), %esi
622	sub	$0xffff, %edx
623	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
624	lea	(%ecx, %edi,1), %eax
625	lea	5(%ecx, %esi,1), %edx
626	POP (%edi)
627	POP (%esi)
628	jmp	L(less48bytes)
629
630	cfi_restore_state
631	cfi_remember_state
632	ALIGN (4)
/* L(shr_5_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
633L(shr_5_gobble):
634	sub	$32, %ecx
635	movdqa	16(%esi), %xmm0
636	palignr	$5,(%esi), %xmm0
637	pcmpeqb	(%edi), %xmm0
638
639	movdqa	32(%esi), %xmm3
640	palignr	$5,16(%esi), %xmm3
641	pcmpeqb	16(%edi), %xmm3
642
643L(shr_5_gobble_loop):
644	pand	%xmm0, %xmm3
645	sub	$32, %ecx
646	pmovmskb %xmm3, %edx
647	movdqa	%xmm0, %xmm1
648
649	movdqa	64(%esi), %xmm3
650	palignr	$5,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
651	sbb	$0xffff, %edx
652	movdqa	48(%esi), %xmm0
653	palignr	$5,32(%esi), %xmm0
654	pcmpeqb	32(%edi), %xmm0
655	lea	32(%esi), %esi
656	pcmpeqb	48(%edi), %xmm3
657
658	lea	32(%edi), %edi
659	jz	L(shr_5_gobble_loop)
660	pand	%xmm0, %xmm3
661
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
662	cmp	$0, %ecx
663	jge	L(shr_5_gobble_next)
664	inc	%edx
665	add	$32, %ecx
666L(shr_5_gobble_next):
667	test	%edx, %edx
668	jnz	L(exit)
669
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
670	pmovmskb %xmm3, %edx
671	movdqa	%xmm0, %xmm1
672	lea	32(%edi), %edi
673	lea	32(%esi), %esi
674	sub	$0xffff, %edx
675	jnz	L(exit)
676
677	lea	(%ecx, %edi,1), %eax
678	lea	5(%ecx, %esi,1), %edx
679	POP (%edi)
680	POP (%esi)
681	jmp	L(less48bytes)
682
/*
 * L(shr_6): blk2 is 6 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 6 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $6" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_6_gobble) loops.
 */
683	cfi_restore_state
684	cfi_remember_state
685	ALIGN (4)
686L(shr_6):
687	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
688	lea	-48(%ecx), %ecx
689	mov	%edx, %eax
690	jae	L(shr_6_gobble)
691
692	movdqa	16(%esi), %xmm1
693	movdqa	%xmm1, %xmm2
694	palignr	$6,(%esi), %xmm1
695	pcmpeqb	(%edi), %xmm1
696
697	movdqa	32(%esi), %xmm3
698	palignr	$6,%xmm2, %xmm3
699	pcmpeqb	16(%edi), %xmm3
700
701	pand	%xmm1, %xmm3
702	pmovmskb %xmm3, %edx
703	lea	32(%edi), %edi
704	lea	32(%esi), %esi
705	sub	$0xffff, %edx
706	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
707	lea	(%ecx, %edi,1), %eax
708	lea	6(%ecx, %esi,1), %edx
709	POP (%edi)
710	POP (%esi)
711	jmp	L(less48bytes)
712
713	cfi_restore_state
714	cfi_remember_state
715	ALIGN (4)
/* L(shr_6_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
716L(shr_6_gobble):
717	sub	$32, %ecx
718	movdqa	16(%esi), %xmm0
719	palignr	$6,(%esi), %xmm0
720	pcmpeqb	(%edi), %xmm0
721
722	movdqa	32(%esi), %xmm3
723	palignr	$6,16(%esi), %xmm3
724	pcmpeqb	16(%edi), %xmm3
725
726L(shr_6_gobble_loop):
727	pand	%xmm0, %xmm3
728	sub	$32, %ecx
729	pmovmskb %xmm3, %edx
730	movdqa	%xmm0, %xmm1
731
732	movdqa	64(%esi), %xmm3
733	palignr	$6,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
734	sbb	$0xffff, %edx
735	movdqa	48(%esi), %xmm0
736	palignr	$6,32(%esi), %xmm0
737	pcmpeqb	32(%edi), %xmm0
738	lea	32(%esi), %esi
739	pcmpeqb	48(%edi), %xmm3
740
741	lea	32(%edi), %edi
742	jz	L(shr_6_gobble_loop)
743	pand	%xmm0, %xmm3
744
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
745	cmp	$0, %ecx
746	jge	L(shr_6_gobble_next)
747	inc	%edx
748	add	$32, %ecx
749L(shr_6_gobble_next):
750	test	%edx, %edx
751	jnz	L(exit)
752
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
753	pmovmskb %xmm3, %edx
754	movdqa	%xmm0, %xmm1
755	lea	32(%edi), %edi
756	lea	32(%esi), %esi
757	sub	$0xffff, %edx
758	jnz	L(exit)
759
760	lea	(%ecx, %edi,1), %eax
761	lea	6(%ecx, %esi,1), %edx
762	POP (%edi)
763	POP (%esi)
764	jmp	L(less48bytes)
765
/*
 * L(shr_7): blk2 is 7 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 7 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $7" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_7_gobble) loops.
 */
766	cfi_restore_state
767	cfi_remember_state
768	ALIGN (4)
769L(shr_7):
770	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
771	lea	-48(%ecx), %ecx
772	mov	%edx, %eax
773	jae	L(shr_7_gobble)
774
775	movdqa	16(%esi), %xmm1
776	movdqa	%xmm1, %xmm2
777	palignr	$7,(%esi), %xmm1
778	pcmpeqb	(%edi), %xmm1
779
780	movdqa	32(%esi), %xmm3
781	palignr	$7,%xmm2, %xmm3
782	pcmpeqb	16(%edi), %xmm3
783
784	pand	%xmm1, %xmm3
785	pmovmskb %xmm3, %edx
786	lea	32(%edi), %edi
787	lea	32(%esi), %esi
788	sub	$0xffff, %edx
789	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
790	lea	(%ecx, %edi,1), %eax
791	lea	7(%ecx, %esi,1), %edx
792	POP (%edi)
793	POP (%esi)
794	jmp	L(less48bytes)
795
796	cfi_restore_state
797	cfi_remember_state
798	ALIGN (4)
/* L(shr_7_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
799L(shr_7_gobble):
800	sub	$32, %ecx
801	movdqa	16(%esi), %xmm0
802	palignr	$7,(%esi), %xmm0
803	pcmpeqb	(%edi), %xmm0
804
805	movdqa	32(%esi), %xmm3
806	palignr	$7,16(%esi), %xmm3
807	pcmpeqb	16(%edi), %xmm3
808
809L(shr_7_gobble_loop):
810	pand	%xmm0, %xmm3
811	sub	$32, %ecx
812	pmovmskb %xmm3, %edx
813	movdqa	%xmm0, %xmm1
814
815	movdqa	64(%esi), %xmm3
816	palignr	$7,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
817	sbb	$0xffff, %edx
818	movdqa	48(%esi), %xmm0
819	palignr	$7,32(%esi), %xmm0
820	pcmpeqb	32(%edi), %xmm0
821	lea	32(%esi), %esi
822	pcmpeqb	48(%edi), %xmm3
823
824	lea	32(%edi), %edi
825	jz	L(shr_7_gobble_loop)
826	pand	%xmm0, %xmm3
827
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
828	cmp	$0, %ecx
829	jge	L(shr_7_gobble_next)
830	inc	%edx
831	add	$32, %ecx
832L(shr_7_gobble_next):
833	test	%edx, %edx
834	jnz	L(exit)
835
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
836	pmovmskb %xmm3, %edx
837	movdqa	%xmm0, %xmm1
838	lea	32(%edi), %edi
839	lea	32(%esi), %esi
840	sub	$0xffff, %edx
841	jnz	L(exit)
842
843	lea	(%ecx, %edi,1), %eax
844	lea	7(%ecx, %esi,1), %edx
845	POP (%edi)
846	POP (%esi)
847	jmp	L(less48bytes)
848
/*
 * L(shr_8): blk2 is 8 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 8 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $8" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_8_gobble) loops.
 */
849	cfi_restore_state
850	cfi_remember_state
851	ALIGN (4)
852L(shr_8):
853	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
854	lea	-48(%ecx), %ecx
855	mov	%edx, %eax
856	jae	L(shr_8_gobble)
857
858	movdqa	16(%esi), %xmm1
859	movdqa	%xmm1, %xmm2
860	palignr	$8,(%esi), %xmm1
861	pcmpeqb	(%edi), %xmm1
862
863	movdqa	32(%esi), %xmm3
864	palignr	$8,%xmm2, %xmm3
865	pcmpeqb	16(%edi), %xmm3
866
867	pand	%xmm1, %xmm3
868	pmovmskb %xmm3, %edx
869	lea	32(%edi), %edi
870	lea	32(%esi), %esi
871	sub	$0xffff, %edx
872	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
873	lea	(%ecx, %edi,1), %eax
874	lea	8(%ecx, %esi,1), %edx
875	POP (%edi)
876	POP (%esi)
877	jmp	L(less48bytes)
878
879	cfi_restore_state
880	cfi_remember_state
881	ALIGN (4)
/* L(shr_8_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
882L(shr_8_gobble):
883	sub	$32, %ecx
884	movdqa	16(%esi), %xmm0
885	palignr	$8,(%esi), %xmm0
886	pcmpeqb	(%edi), %xmm0
887
888	movdqa	32(%esi), %xmm3
889	palignr	$8,16(%esi), %xmm3
890	pcmpeqb	16(%edi), %xmm3
891
892L(shr_8_gobble_loop):
893	pand	%xmm0, %xmm3
894	sub	$32, %ecx
895	pmovmskb %xmm3, %edx
896	movdqa	%xmm0, %xmm1
897
898	movdqa	64(%esi), %xmm3
899	palignr	$8,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
900	sbb	$0xffff, %edx
901	movdqa	48(%esi), %xmm0
902	palignr	$8,32(%esi), %xmm0
903	pcmpeqb	32(%edi), %xmm0
904	lea	32(%esi), %esi
905	pcmpeqb	48(%edi), %xmm3
906
907	lea	32(%edi), %edi
908	jz	L(shr_8_gobble_loop)
909	pand	%xmm0, %xmm3
910
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
911	cmp	$0, %ecx
912	jge	L(shr_8_gobble_next)
913	inc	%edx
914	add	$32, %ecx
915L(shr_8_gobble_next):
916	test	%edx, %edx
917	jnz	L(exit)
918
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
919	pmovmskb %xmm3, %edx
920	movdqa	%xmm0, %xmm1
921	lea	32(%edi), %edi
922	lea	32(%esi), %esi
923	sub	$0xffff, %edx
924	jnz	L(exit)
925
926	lea	(%ecx, %edi,1), %eax
927	lea	8(%ecx, %esi,1), %edx
928	POP (%edi)
929	POP (%esi)
930	jmp	L(less48bytes)
931
/*
 * L(shr_9): blk2 is 9 bytes past a 16-byte boundary relative to the aligned
 * blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 9 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $9" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_9_gobble) loops.
 */
932	cfi_restore_state
933	cfi_remember_state
934	ALIGN (4)
935L(shr_9):
936	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
937	lea	-48(%ecx), %ecx
938	mov	%edx, %eax
939	jae	L(shr_9_gobble)
940
941	movdqa	16(%esi), %xmm1
942	movdqa	%xmm1, %xmm2
943	palignr	$9,(%esi), %xmm1
944	pcmpeqb	(%edi), %xmm1
945
946	movdqa	32(%esi), %xmm3
947	palignr	$9,%xmm2, %xmm3
948	pcmpeqb	16(%edi), %xmm3
949
950	pand	%xmm1, %xmm3
951	pmovmskb %xmm3, %edx
952	lea	32(%edi), %edi
953	lea	32(%esi), %esi
954	sub	$0xffff, %edx
955	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
956	lea	(%ecx, %edi,1), %eax
957	lea	9(%ecx, %esi,1), %edx
958	POP (%edi)
959	POP (%esi)
960	jmp	L(less48bytes)
961
962	cfi_restore_state
963	cfi_remember_state
964	ALIGN (4)
/* L(shr_9_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
965L(shr_9_gobble):
966	sub	$32, %ecx
967	movdqa	16(%esi), %xmm0
968	palignr	$9,(%esi), %xmm0
969	pcmpeqb	(%edi), %xmm0
970
971	movdqa	32(%esi), %xmm3
972	palignr	$9,16(%esi), %xmm3
973	pcmpeqb	16(%edi), %xmm3
974
975L(shr_9_gobble_loop):
976	pand	%xmm0, %xmm3
977	sub	$32, %ecx
978	pmovmskb %xmm3, %edx
979	movdqa	%xmm0, %xmm1
980
981	movdqa	64(%esi), %xmm3
982	palignr	$9,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
983	sbb	$0xffff, %edx
984	movdqa	48(%esi), %xmm0
985	palignr	$9,32(%esi), %xmm0
986	pcmpeqb	32(%edi), %xmm0
987	lea	32(%esi), %esi
988	pcmpeqb	48(%edi), %xmm3
989
990	lea	32(%edi), %edi
991	jz	L(shr_9_gobble_loop)
992	pand	%xmm0, %xmm3
993
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
994	cmp	$0, %ecx
995	jge	L(shr_9_gobble_next)
996	inc	%edx
997	add	$32, %ecx
998L(shr_9_gobble_next):
999	test	%edx, %edx
1000	jnz	L(exit)
1001
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1002	pmovmskb %xmm3, %edx
1003	movdqa	%xmm0, %xmm1
1004	lea	32(%edi), %edi
1005	lea	32(%esi), %esi
1006	sub	$0xffff, %edx
1007	jnz	L(exit)
1008
1009	lea	(%ecx, %edi,1), %eax
1010	lea	9(%ecx, %esi,1), %edx
1011	POP (%edi)
1012	POP (%esi)
1013	jmp	L(less48bytes)
1014
/*
 * L(shr_10): blk2 is 10 bytes past a 16-byte boundary relative to the
 * aligned blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 10 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $10" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_10_gobble) loops.
 */
1015	cfi_restore_state
1016	cfi_remember_state
1017	ALIGN (4)
1018L(shr_10):
1019	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
1020	lea	-48(%ecx), %ecx
1021	mov	%edx, %eax
1022	jae	L(shr_10_gobble)
1023
1024	movdqa	16(%esi), %xmm1
1025	movdqa	%xmm1, %xmm2
1026	palignr	$10, (%esi), %xmm1
1027	pcmpeqb	(%edi), %xmm1
1028
1029	movdqa	32(%esi), %xmm3
1030	palignr	$10,%xmm2, %xmm3
1031	pcmpeqb	16(%edi), %xmm3
1032
1033	pand	%xmm1, %xmm3
1034	pmovmskb %xmm3, %edx
1035	lea	32(%edi), %edi
1036	lea	32(%esi), %esi
1037	sub	$0xffff, %edx
1038	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
1039	lea	(%ecx, %edi,1), %eax
1040	lea	10(%ecx, %esi,1), %edx
1041	POP (%edi)
1042	POP (%esi)
1043	jmp	L(less48bytes)
1044
1045	cfi_restore_state
1046	cfi_remember_state
1047	ALIGN (4)
/* L(shr_10_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
1048L(shr_10_gobble):
1049	sub	$32, %ecx
1050	movdqa	16(%esi), %xmm0
1051	palignr	$10, (%esi), %xmm0
1052	pcmpeqb	(%edi), %xmm0
1053
1054	movdqa	32(%esi), %xmm3
1055	palignr	$10, 16(%esi), %xmm3
1056	pcmpeqb	16(%edi), %xmm3
1057
1058L(shr_10_gobble_loop):
1059	pand	%xmm0, %xmm3
1060	sub	$32, %ecx
1061	pmovmskb %xmm3, %edx
1062	movdqa	%xmm0, %xmm1
1063
1064	movdqa	64(%esi), %xmm3
1065	palignr	$10,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
1066	sbb	$0xffff, %edx
1067	movdqa	48(%esi), %xmm0
1068	palignr	$10,32(%esi), %xmm0
1069	pcmpeqb	32(%edi), %xmm0
1070	lea	32(%esi), %esi
1071	pcmpeqb	48(%edi), %xmm3
1072
1073	lea	32(%edi), %edi
1074	jz	L(shr_10_gobble_loop)
1075	pand	%xmm0, %xmm3
1076
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
1077	cmp	$0, %ecx
1078	jge	L(shr_10_gobble_next)
1079	inc	%edx
1080	add	$32, %ecx
1081L(shr_10_gobble_next):
1082	test	%edx, %edx
1083	jnz	L(exit)
1084
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1085	pmovmskb %xmm3, %edx
1086	movdqa	%xmm0, %xmm1
1087	lea	32(%edi), %edi
1088	lea	32(%esi), %esi
1089	sub	$0xffff, %edx
1090	jnz	L(exit)
1091
1092	lea	(%ecx, %edi,1), %eax
1093	lea	10(%ecx, %esi,1), %edx
1094	POP (%edi)
1095	POP (%esi)
1096	jmp	L(less48bytes)
1097
/*
 * L(shr_11): blk2 is 11 bytes past a 16-byte boundary relative to the
 * aligned blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 11 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $11" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_11_gobble) loops.
 */
1098	cfi_restore_state
1099	cfi_remember_state
1100	ALIGN (4)
1101L(shr_11):
1102	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
1103	lea	-48(%ecx), %ecx
1104	mov	%edx, %eax
1105	jae	L(shr_11_gobble)
1106
1107	movdqa	16(%esi), %xmm1
1108	movdqa	%xmm1, %xmm2
1109	palignr	$11, (%esi), %xmm1
1110	pcmpeqb	(%edi), %xmm1
1111
1112	movdqa	32(%esi), %xmm3
1113	palignr	$11, %xmm2, %xmm3
1114	pcmpeqb	16(%edi), %xmm3
1115
1116	pand	%xmm1, %xmm3
1117	pmovmskb %xmm3, %edx
1118	lea	32(%edi), %edi
1119	lea	32(%esi), %esi
1120	sub	$0xffff, %edx
1121	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
1122	lea	(%ecx, %edi,1), %eax
1123	lea	11(%ecx, %esi,1), %edx
1124	POP (%edi)
1125	POP (%esi)
1126	jmp	L(less48bytes)
1127
1128	cfi_restore_state
1129	cfi_remember_state
1130	ALIGN (4)
/* L(shr_11_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
1131L(shr_11_gobble):
1132	sub	$32, %ecx
1133	movdqa	16(%esi), %xmm0
1134	palignr	$11, (%esi), %xmm0
1135	pcmpeqb	(%edi), %xmm0
1136
1137	movdqa	32(%esi), %xmm3
1138	palignr	$11, 16(%esi), %xmm3
1139	pcmpeqb	16(%edi), %xmm3
1140
1141L(shr_11_gobble_loop):
1142	pand	%xmm0, %xmm3
1143	sub	$32, %ecx
1144	pmovmskb %xmm3, %edx
1145	movdqa	%xmm0, %xmm1
1146
1147	movdqa	64(%esi), %xmm3
1148	palignr	$11,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
1149	sbb	$0xffff, %edx
1150	movdqa	48(%esi), %xmm0
1151	palignr	$11,32(%esi), %xmm0
1152	pcmpeqb	32(%edi), %xmm0
1153	lea	32(%esi), %esi
1154	pcmpeqb	48(%edi), %xmm3
1155
1156	lea	32(%edi), %edi
1157	jz	L(shr_11_gobble_loop)
1158	pand	%xmm0, %xmm3
1159
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
1160	cmp	$0, %ecx
1161	jge	L(shr_11_gobble_next)
1162	inc	%edx
1163	add	$32, %ecx
1164L(shr_11_gobble_next):
1165	test	%edx, %edx
1166	jnz	L(exit)
1167
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1168	pmovmskb %xmm3, %edx
1169	movdqa	%xmm0, %xmm1
1170	lea	32(%edi), %edi
1171	lea	32(%esi), %esi
1172	sub	$0xffff, %edx
1173	jnz	L(exit)
1174
1175	lea	(%ecx, %edi,1), %eax
1176	lea	11(%ecx, %esi,1), %edx
1177	POP (%edi)
1178	POP (%esi)
1179	jmp	L(less48bytes)
1180
/*
 * L(shr_12): blk2 is 12 bytes past a 16-byte boundary relative to the
 * aligned blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 12 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $12" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_12_gobble) loops.
 */
1181	cfi_restore_state
1182	cfi_remember_state
1183	ALIGN (4)
1184L(shr_12):
1185	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
1186	lea	-48(%ecx), %ecx
1187	mov	%edx, %eax
1188	jae	L(shr_12_gobble)
1189
1190	movdqa	16(%esi), %xmm1
1191	movdqa	%xmm1, %xmm2
1192	palignr	$12, (%esi), %xmm1
1193	pcmpeqb	(%edi), %xmm1
1194
1195	movdqa	32(%esi), %xmm3
1196	palignr	$12, %xmm2, %xmm3
1197	pcmpeqb	16(%edi), %xmm3
1198
1199	pand	%xmm1, %xmm3
1200	pmovmskb %xmm3, %edx
1201	lea	32(%edi), %edi
1202	lea	32(%esi), %esi
1203	sub	$0xffff, %edx
1204	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
1205	lea	(%ecx, %edi,1), %eax
1206	lea	12(%ecx, %esi,1), %edx
1207	POP (%edi)
1208	POP (%esi)
1209	jmp	L(less48bytes)
1210
1211	cfi_restore_state
1212	cfi_remember_state
1213	ALIGN (4)
/* L(shr_12_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
1214L(shr_12_gobble):
1215	sub	$32, %ecx
1216	movdqa	16(%esi), %xmm0
1217	palignr	$12, (%esi), %xmm0
1218	pcmpeqb	(%edi), %xmm0
1219
1220	movdqa	32(%esi), %xmm3
1221	palignr	$12, 16(%esi), %xmm3
1222	pcmpeqb	16(%edi), %xmm3
1223
1224L(shr_12_gobble_loop):
1225	pand	%xmm0, %xmm3
1226	sub	$32, %ecx
1227	pmovmskb %xmm3, %edx
1228	movdqa	%xmm0, %xmm1
1229
1230	movdqa	64(%esi), %xmm3
1231	palignr	$12,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
1232	sbb	$0xffff, %edx
1233	movdqa	48(%esi), %xmm0
1234	palignr	$12,32(%esi), %xmm0
1235	pcmpeqb	32(%edi), %xmm0
1236	lea	32(%esi), %esi
1237	pcmpeqb	48(%edi), %xmm3
1238
1239	lea	32(%edi), %edi
1240	jz	L(shr_12_gobble_loop)
1241	pand	%xmm0, %xmm3
1242
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
1243	cmp	$0, %ecx
1244	jge	L(shr_12_gobble_next)
1245	inc	%edx
1246	add	$32, %ecx
1247L(shr_12_gobble_next):
1248	test	%edx, %edx
1249	jnz	L(exit)
1250
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1251	pmovmskb %xmm3, %edx
1252	movdqa	%xmm0, %xmm1
1253	lea	32(%edi), %edi
1254	lea	32(%esi), %esi
1255	sub	$0xffff, %edx
1256	jnz	L(exit)
1257
1258	lea	(%ecx, %edi,1), %eax
1259	lea	12(%ecx, %esi,1), %edx
1260	POP (%edi)
1261	POP (%esi)
1262	jmp	L(less48bytes)
1263
/*
 * L(shr_13): blk2 is 13 bytes past a 16-byte boundary relative to the
 * aligned blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 13 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $13" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_13_gobble) loops.
 */
1264	cfi_restore_state
1265	cfi_remember_state
1266	ALIGN (4)
1267L(shr_13):
1268	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
1269	lea	-48(%ecx), %ecx
1270	mov	%edx, %eax
1271	jae	L(shr_13_gobble)
1272
1273	movdqa	16(%esi), %xmm1
1274	movdqa	%xmm1, %xmm2
1275	palignr	$13, (%esi), %xmm1
1276	pcmpeqb	(%edi), %xmm1
1277
1278	movdqa	32(%esi), %xmm3
1279	palignr	$13, %xmm2, %xmm3
1280	pcmpeqb	16(%edi), %xmm3
1281
1282	pand	%xmm1, %xmm3
1283	pmovmskb %xmm3, %edx
1284	lea	32(%edi), %edi
1285	lea	32(%esi), %esi
1286	sub	$0xffff, %edx
1287	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
1288	lea	(%ecx, %edi,1), %eax
1289	lea	13(%ecx, %esi,1), %edx
1290	POP (%edi)
1291	POP (%esi)
1292	jmp	L(less48bytes)
1293
1294	cfi_restore_state
1295	cfi_remember_state
1296	ALIGN (4)
/* L(shr_13_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
1297L(shr_13_gobble):
1298	sub	$32, %ecx
1299	movdqa	16(%esi), %xmm0
1300	palignr	$13, (%esi), %xmm0
1301	pcmpeqb	(%edi), %xmm0
1302
1303	movdqa	32(%esi), %xmm3
1304	palignr	$13, 16(%esi), %xmm3
1305	pcmpeqb	16(%edi), %xmm3
1306
1307L(shr_13_gobble_loop):
1308	pand	%xmm0, %xmm3
1309	sub	$32, %ecx
1310	pmovmskb %xmm3, %edx
1311	movdqa	%xmm0, %xmm1
1312
1313	movdqa	64(%esi), %xmm3
1314	palignr	$13,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
1315	sbb	$0xffff, %edx
1316	movdqa	48(%esi), %xmm0
1317	palignr	$13,32(%esi), %xmm0
1318	pcmpeqb	32(%edi), %xmm0
1319	lea	32(%esi), %esi
1320	pcmpeqb	48(%edi), %xmm3
1321
1322	lea	32(%edi), %edi
1323	jz	L(shr_13_gobble_loop)
1324	pand	%xmm0, %xmm3
1325
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
1326	cmp	$0, %ecx
1327	jge	L(shr_13_gobble_next)
1328	inc	%edx
1329	add	$32, %ecx
1330L(shr_13_gobble_next):
1331	test	%edx, %edx
1332	jnz	L(exit)
1333
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1334	pmovmskb %xmm3, %edx
1335	movdqa	%xmm0, %xmm1
1336	lea	32(%edi), %edi
1337	lea	32(%esi), %esi
1338	sub	$0xffff, %edx
1339	jnz	L(exit)
1340
1341	lea	(%ecx, %edi,1), %eax
1342	lea	13(%ecx, %esi,1), %edx
1343	POP (%edi)
1344	POP (%esi)
1345	jmp	L(less48bytes)
1346
/*
 * L(shr_14): blk2 is 14 bytes past a 16-byte boundary relative to the
 * aligned blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 14 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $14" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_14_gobble) loops.
 */
1347	cfi_restore_state
1348	cfi_remember_state
1349	ALIGN (4)
1350L(shr_14):
1351	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
1352	lea	-48(%ecx), %ecx
1353	mov	%edx, %eax
1354	jae	L(shr_14_gobble)
1355
1356	movdqa	16(%esi), %xmm1
1357	movdqa	%xmm1, %xmm2
1358	palignr	$14, (%esi), %xmm1
1359	pcmpeqb	(%edi), %xmm1
1360
1361	movdqa	32(%esi), %xmm3
1362	palignr	$14, %xmm2, %xmm3
1363	pcmpeqb	16(%edi), %xmm3
1364
1365	pand	%xmm1, %xmm3
1366	pmovmskb %xmm3, %edx
1367	lea	32(%edi), %edi
1368	lea	32(%esi), %esi
1369	sub	$0xffff, %edx
1370	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
1371	lea	(%ecx, %edi,1), %eax
1372	lea	14(%ecx, %esi,1), %edx
1373	POP (%edi)
1374	POP (%esi)
1375	jmp	L(less48bytes)
1376
1377	cfi_restore_state
1378	cfi_remember_state
1379	ALIGN (4)
/* L(shr_14_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
1380L(shr_14_gobble):
1381	sub	$32, %ecx
1382	movdqa	16(%esi), %xmm0
1383	palignr	$14, (%esi), %xmm0
1384	pcmpeqb	(%edi), %xmm0
1385
1386	movdqa	32(%esi), %xmm3
1387	palignr	$14, 16(%esi), %xmm3
1388	pcmpeqb	16(%edi), %xmm3
1389
1390L(shr_14_gobble_loop):
1391	pand	%xmm0, %xmm3
1392	sub	$32, %ecx
1393	pmovmskb %xmm3, %edx
1394	movdqa	%xmm0, %xmm1
1395
1396	movdqa	64(%esi), %xmm3
1397	palignr	$14,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
1398	sbb	$0xffff, %edx
1399	movdqa	48(%esi), %xmm0
1400	palignr	$14,32(%esi), %xmm0
1401	pcmpeqb	32(%edi), %xmm0
1402	lea	32(%esi), %esi
1403	pcmpeqb	48(%edi), %xmm3
1404
1405	lea	32(%edi), %edi
1406	jz	L(shr_14_gobble_loop)
1407	pand	%xmm0, %xmm3
1408
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
1409	cmp	$0, %ecx
1410	jge	L(shr_14_gobble_next)
1411	inc	%edx
1412	add	$32, %ecx
1413L(shr_14_gobble_next):
1414	test	%edx, %edx
1415	jnz	L(exit)
1416
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1417	pmovmskb %xmm3, %edx
1418	movdqa	%xmm0, %xmm1
1419	lea	32(%edi), %edi
1420	lea	32(%esi), %esi
1421	sub	$0xffff, %edx
1422	jnz	L(exit)
1423
1424	lea	(%ecx, %edi,1), %eax
1425	lea	14(%ecx, %esi,1), %edx
1426	POP (%edi)
1427	POP (%esi)
1428	jmp	L(less48bytes)
1429
/*
 * L(shr_15): blk2 is 15 bytes past a 16-byte boundary relative to the
 * aligned blk1 cursor.
 *   %edi = blk1 cursor (16-byte aligned), %esi = blk2 cursor rounded down to
 *   16 bytes, %edx = 15 (copied to %eax for L(exit)),
 *   %ecx = bytes left to compare from %edi, plus 16.
 * "palignr $15" splices two adjacent aligned loads from %esi into the 16 blk2
 * bytes that pair with one 16-byte chunk at %edi.  Fewer than 64 bytes left:
 * one 32-byte compare here; otherwise L(shr_15_gobble) loops.
 */
1430	cfi_restore_state
1431	cfi_remember_state
1432	ALIGN (4)
1433L(shr_15):
1434	cmp	$80, %ecx
/* lea and mov do not modify EFLAGS: jae still tests the cmp above. */
1435	lea	-48(%ecx), %ecx
1436	mov	%edx, %eax
1437	jae	L(shr_15_gobble)
1438
1439	movdqa	16(%esi), %xmm1
1440	movdqa	%xmm1, %xmm2
1441	palignr	$15, (%esi), %xmm1
1442	pcmpeqb	(%edi), %xmm1
1443
1444	movdqa	32(%esi), %xmm3
1445	palignr	$15, %xmm2, %xmm3
1446	pcmpeqb	16(%edi), %xmm3
1447
1448	pand	%xmm1, %xmm3
1449	pmovmskb %xmm3, %edx
1450	lea	32(%edi), %edi
1451	lea	32(%esi), %esi
1452	sub	$0xffff, %edx
1453	jnz	L(exit)
/* All 32 bytes equal: advance both cursors by the remaining count %ecx
   (adding the shift back to blk2) and finish in L(less48bytes). */
1454	lea	(%ecx, %edi,1), %eax
1455	lea	15(%ecx, %esi,1), %edx
1456	POP (%edi)
1457	POP (%esi)
1458	jmp	L(less48bytes)
1459
1460	cfi_restore_state
1461	cfi_remember_state
1462	ALIGN (4)
/* L(shr_15_gobble): 32 bytes per iteration.  The compare for the next pair is
   issued before the mask of the current pair is tested. */
1463L(shr_15_gobble):
1464	sub	$32, %ecx
1465	movdqa	16(%esi), %xmm0
1466	palignr	$15, (%esi), %xmm0
1467	pcmpeqb	(%edi), %xmm0
1468
1469	movdqa	32(%esi), %xmm3
1470	palignr	$15, 16(%esi), %xmm3
1471	pcmpeqb	16(%edi), %xmm3
1472
1473L(shr_15_gobble_loop):
1474	pand	%xmm0, %xmm3
1475	sub	$32, %ecx
1476	pmovmskb %xmm3, %edx
1477	movdqa	%xmm0, %xmm1
1478
1479	movdqa	64(%esi), %xmm3
1480	palignr	$15,48(%esi), %xmm3
/* CF is the borrow of "sub $32, %ecx" above (the SSE instructions in between
   do not write EFLAGS).  The result is 0 only if all 32 bytes matched and the
   count did not drop below zero; the jz below tests exactly that. */
1481	sbb	$0xffff, %edx
1482	movdqa	48(%esi), %xmm0
1483	palignr	$15,32(%esi), %xmm0
1484	pcmpeqb	32(%edi), %xmm0
1485	lea	32(%esi), %esi
1486	pcmpeqb	48(%edi), %xmm3
1487
1488	lea	32(%edi), %edi
1489	jz	L(shr_15_gobble_loop)
1490	pand	%xmm0, %xmm3
1491
/* Loop left.  If the count went negative, the sbb subtracted one extra for
   the borrow: undo it and restore the count. */
1492	cmp	$0, %ecx
1493	jge	L(shr_15_gobble_next)
1494	inc	%edx
1495	add	$32, %ecx
1496L(shr_15_gobble_next):
1497	test	%edx, %edx
1498	jnz	L(exit)
1499
/* The pair tested in the loop matched; now test the pair whose compare
   results are already in %xmm0 / %xmm3. */
1500	pmovmskb %xmm3, %edx
1501	movdqa	%xmm0, %xmm1
1502	lea	32(%edi), %edi
1503	lea	32(%esi), %esi
1504	sub	$0xffff, %edx
1505	jnz	L(exit)
1506
1507	lea	(%ecx, %edi,1), %eax
1508	lea	15(%ecx, %esi,1), %edx
1509	POP (%edi)
1510	POP (%esi)
1511	jmp	L(less48bytes)
1512
1513	cfi_restore_state
1514	cfi_remember_state
1515	ALIGN (4)
/* L(exit): a 32-byte block contained a mismatch.
   In: xmm1 = equality mask of the block's first 16 bytes,
       %edx = (combined 32-byte mask) - 0xffff,
       %edi/%esi = already advanced 32 bytes past the block,
       %eax = value saved from %edx at the start of the shr_N path.
   Decides which 16-byte half holds the first difference, then falls
   through into L(less16bytes).  */
1516L(exit):
1517	pmovmskb %xmm1, %ebx
1518	sub	$0xffff, %ebx
	/* First half fully equal: the difference is in the second half and
	   %edx already describes it.  */
1519	jz	L(first16bytes)
	/* Difference is in the first half: step both pointers back 16 bytes
	   and use the first-half mask instead.  */
1520	lea	-16(%esi), %esi
1521	lea	-16(%edi), %edi
1522	mov	%ebx, %edx
1523L(first16bytes):
	/* Add back the offset saved in %eax so %esi addresses the bytes that
	   were actually compared (presumably the palignr shift amount --
	   confirm against the dispatcher).  */
1524	add	%eax, %esi
/* L(less16bytes): locate the first differing byte inside a 16-byte group
   and return the byte difference.
   In: %edx = (pmovmskb equality mask) - 0xffff, non-zero; %edi/%esi point
   16 bytes past the start of the group.  For the producers visible in
   this file the lowest set bit of %edx is the index of the first
   mismatching byte (mask - 0xffff is the negation of the inverted mask,
   and negation keeps the lowest set bit).
   Bits 0..7 are handled here; bits 8..15 in L(next_24_bytes).  */
1525L(less16bytes):
1526	test	%dl, %dl
1527	jz	L(next_24_bytes)
1528
1529	test	$0x01, %dl
1530	jnz	L(Byte16)
1531
1532	test	$0x02, %dl
1533	jnz	L(Byte17)
1534
1535	test	$0x04, %dl
1536	jnz	L(Byte18)
1537
1538	test	$0x08, %dl
1539	jnz	L(Byte19)
1540
1541	test	$0x10, %dl
1542	jnz	L(Byte20)
1543
1544	test	$0x20, %dl
1545	jnz	L(Byte21)
1546
1547	test	$0x40, %dl
1548	jnz	L(Byte22)
	/* %dl is non-zero and bits 0..6 are clear, so bit 7 is set.  */
1549L(Byte23):
	/* Return (%edi byte) - (%esi byte), both zero-extended.  RETURN is a
	   macro defined outside this chunk.  */
1550	movzbl	 -9(%edi), %eax
1551	movzbl	 -9(%esi), %edx
1552	sub	%edx, %eax
1553	RETURN
1554
1555	ALIGN (4)
/* L(Byte16) .. L(Byte22): return stubs for a mismatch at a known position.
   Each loads one zero-extended byte from the same negative offset of %edi
   and %esi (-16 for Byte16 up to -10 for Byte22) and returns
   (%edi byte) - (%esi byte) in %eax through the RETURN macro.
   They are shared by L(less16bytes) and L(next_24_bytes).  */
1556L(Byte16):
1557	movzbl	 -16(%edi), %eax
1558	movzbl	 -16(%esi), %edx
1559	sub	%edx, %eax
1560	RETURN
1561
1562	ALIGN (4)
1563L(Byte17):
1564	movzbl	 -15(%edi), %eax
1565	movzbl	 -15(%esi), %edx
1566	sub	%edx, %eax
1567	RETURN
1568
1569	ALIGN (4)
1570L(Byte18):
1571	movzbl	 -14(%edi), %eax
1572	movzbl	 -14(%esi), %edx
1573	sub	%edx, %eax
1574	RETURN
1575
1576	ALIGN (4)
1577L(Byte19):
1578	movzbl	 -13(%edi), %eax
1579	movzbl	 -13(%esi), %edx
1580	sub	%edx, %eax
1581	RETURN
1582
1583	ALIGN (4)
1584L(Byte20):
1585	movzbl	 -12(%edi), %eax
1586	movzbl	 -12(%esi), %edx
1587	sub	%edx, %eax
1588	RETURN
1589
1590	ALIGN (4)
1591L(Byte21):
1592	movzbl	 -11(%edi), %eax
1593	movzbl	 -11(%esi), %edx
1594	sub	%edx, %eax
1595	RETURN
1596
1597	ALIGN (4)
1598L(Byte22):
1599	movzbl	 -10(%edi), %eax
1600	movzbl	 -10(%esi), %edx
1601	sub	%edx, %eax
1602	RETURN
1603
1604	ALIGN (4)
/* L(next_24_bytes): second half of the search started in L(less16bytes).
   Reached when the low 8 bits of %edx are zero, i.e. the first difference
   is among bytes 8..15 of the group.  Both pointers are advanced by 8 so
   that the L(Byte16)..L(Byte22) stubs can be reused, this time driven by
   the bits of %dh.  */
1605L(next_24_bytes):
1606	lea	8(%edi), %edi
1607	lea	8(%esi), %esi
1608	test	$0x01, %dh
1609	jnz	L(Byte16)
1610
1611	test	$0x02, %dh
1612	jnz	L(Byte17)
1613
1614	test	$0x04, %dh
1615	jnz	L(Byte18)
1616
1617	test	$0x08, %dh
1618	jnz	L(Byte19)
1619
1620	test	$0x10, %dh
1621	jnz	L(Byte20)
1622
1623	test	$0x20, %dh
1624	jnz	L(Byte21)
1625
1626	test	$0x40, %dh
1627	jnz	L(Byte22)
1628
	/* None of bits 0..6 of %dh matched: fall through and treat the last
	   byte of the group as the differing one.  */
1629	ALIGN (4)
1630L(Byte31):
1631	movzbl	 -9(%edi), %eax
1632	movzbl	 -9(%esi), %edx
1633	sub	%edx, %eax
1634	RETURN_END
	/* NOTE(review): CFI_PUSH is defined outside this chunk; presumably it
	   re-declares %ebx as pushed for the unwinder, matching the blocks
	   below, which return after a single POP (%ebx) -- confirm.  */
1635	CFI_PUSH (%ebx)
1636
1637	ALIGN (4)
/* L(more8bytes): tail dispatch for a remaining count in %ecx of 8 or more.
   Counts >= 16 are forwarded to L(more16bytes); 8..14 jump to the
   matching L(NNbytes) entry, and anything else left in this bucket (15)
   goes to L(15bytes).  */
1638L(more8bytes):
1639	cmp	$16, %ecx
1640	jae	L(more16bytes)
1641	cmp	$8, %ecx
1642	je	L(8bytes)
1643	cmp	$9, %ecx
1644	je	L(9bytes)
1645	cmp	$10, %ecx
1646	je	L(10bytes)
1647	cmp	$11, %ecx
1648	je	L(11bytes)
1649	cmp	$12, %ecx
1650	je	L(12bytes)
1651	cmp	$13, %ecx
1652	je	L(13bytes)
1653	cmp	$14, %ecx
1654	je	L(14bytes)
1655	jmp	L(15bytes)
1656
1657	ALIGN (4)
/* L(more16bytes): tail dispatch for a remaining count in %ecx of 16 or
   more.  Counts >= 24 are forwarded to L(more24bytes); 16..22 jump to the
   matching L(NNbytes) entry, and the remaining value (23) goes to
   L(23bytes).  */
1658L(more16bytes):
1659	cmp	$24, %ecx
1660	jae	L(more24bytes)
1661	cmp	$16, %ecx
1662	je	L(16bytes)
1663	cmp	$17, %ecx
1664	je	L(17bytes)
1665	cmp	$18, %ecx
1666	je	L(18bytes)
1667	cmp	$19, %ecx
1668	je	L(19bytes)
1669	cmp	$20, %ecx
1670	je	L(20bytes)
1671	cmp	$21, %ecx
1672	je	L(21bytes)
1673	cmp	$22, %ecx
1674	je	L(22bytes)
1675	jmp	L(23bytes)
1676
1677	ALIGN (4)
/* L(more24bytes): tail dispatch for a remaining count in %ecx of 24 or
   more.  Counts >= 32 are forwarded to L(more32bytes); 24..30 jump to the
   matching L(NNbytes) entry, and the remaining value (31) goes to
   L(31bytes).  */
1678L(more24bytes):
1679	cmp	$32, %ecx
1680	jae	L(more32bytes)
1681	cmp	$24, %ecx
1682	je	L(24bytes)
1683	cmp	$25, %ecx
1684	je	L(25bytes)
1685	cmp	$26, %ecx
1686	je	L(26bytes)
1687	cmp	$27, %ecx
1688	je	L(27bytes)
1689	cmp	$28, %ecx
1690	je	L(28bytes)
1691	cmp	$29, %ecx
1692	je	L(29bytes)
1693	cmp	$30, %ecx
1694	je	L(30bytes)
1695	jmp	L(31bytes)
1696
1697	ALIGN (4)
/* L(more32bytes): tail dispatch for a remaining count in %ecx of 32 or
   more.  Counts >= 40 are forwarded to L(more40bytes); 32..38 jump to the
   matching L(NNbytes) entry, and the remaining value (39) goes to
   L(39bytes).  */
1698L(more32bytes):
1699	cmp	$40, %ecx
1700	jae	L(more40bytes)
1701	cmp	$32, %ecx
1702	je	L(32bytes)
1703	cmp	$33, %ecx
1704	je	L(33bytes)
1705	cmp	$34, %ecx
1706	je	L(34bytes)
1707	cmp	$35, %ecx
1708	je	L(35bytes)
1709	cmp	$36, %ecx
1710	je	L(36bytes)
1711	cmp	$37, %ecx
1712	je	L(37bytes)
1713	cmp	$38, %ecx
1714	je	L(38bytes)
1715	jmp	L(39bytes)
1716
1717	ALIGN (4)
/* L(more40bytes): last level of the tail dispatch, for a remaining count
   in %ecx of 40 or more.  40..46 jump to the matching L(NNbytes) entry;
   every other value takes L(47bytes).  There is no upper-bound check
   here, so callers are expected to arrive with a count below 48.  */
1718L(more40bytes):
1719	cmp	$40, %ecx
1720	je	L(40bytes)
1721	cmp	$41, %ecx
1722	je	L(41bytes)
1723	cmp	$42, %ecx
1724	je	L(42bytes)
1725	cmp	$43, %ecx
1726	je	L(43bytes)
1727	cmp	$44, %ecx
1728	je	L(44bytes)
1729	cmp	$45, %ecx
1730	je	L(45bytes)
1731	cmp	$46, %ecx
1732	je	L(46bytes)
1733	jmp	L(47bytes)
1734
1735	ALIGN (4)
/* L(less48bytes): entry of the tail dispatcher.
   In: %ecx = remaining byte count, %eax / %edx = pointers just past the
   end of the two regions (the L(NNbytes) code reads at negative offsets
   from them).  The blocks reached from here return with a single
   POP (%ebx), so %ebx is expected to be the only register still saved.
   Counts >= 8 go to L(more8bytes); 2..6 jump to their exact entry; every
   other value falls to L(7bytes).
   NOTE(review): counts 0 and 1 are not tested and would also take the
   7-byte path, re-reading bytes before the remaining range; presumably
   callers guarantee those bytes exist and already compared equal --
   confirm against the code outside this chunk.  */
1736L(less48bytes):
1737	cmp	$8, %ecx
1738	jae	L(more8bytes)
1739	cmp	$2, %ecx
1740	je	L(2bytes)
1741	cmp	$3, %ecx
1742	je	L(3bytes)
1743	cmp	$4, %ecx
1744	je	L(4bytes)
1745	cmp	$5, %ecx
1746	je	L(5bytes)
1747	cmp	$6, %ecx
1748	je	L(6bytes)
1749	jmp	L(7bytes)
1750
1751
1752	ALIGN (4)
/* L(44bytes) .. L(4bytes): fall-through chain for remaining counts that
   are a multiple of 4.  Entering at L(NNbytes) compares the last NN bytes
   one dword at a time, reading at negative offsets from %eax and %edx
   (which point just past the end of each region).
   On the first unequal dword control goes to L(find_diff) with the %eax
   side in %ecx and the %edx side in %ebx.  If everything matches the
   function returns 0.  */
1753L(44bytes):
1754	mov	-44(%eax), %ecx
1755	mov	-44(%edx), %ebx
1756	cmp	%ebx, %ecx
1757	jne	L(find_diff)
1758L(40bytes):
1759	mov	-40(%eax), %ecx
1760	mov	-40(%edx), %ebx
1761	cmp	%ebx, %ecx
1762	jne	L(find_diff)
1763L(36bytes):
1764	mov	-36(%eax), %ecx
1765	mov	-36(%edx), %ebx
1766	cmp	%ebx, %ecx
1767	jne	L(find_diff)
1768L(32bytes):
1769	mov	-32(%eax), %ecx
1770	mov	-32(%edx), %ebx
1771	cmp	%ebx, %ecx
1772	jne	L(find_diff)
1773L(28bytes):
1774	mov	-28(%eax), %ecx
1775	mov	-28(%edx), %ebx
1776	cmp	%ebx, %ecx
1777	jne	L(find_diff)
1778L(24bytes):
1779	mov	-24(%eax), %ecx
1780	mov	-24(%edx), %ebx
1781	cmp	%ebx, %ecx
1782	jne	L(find_diff)
1783L(20bytes):
1784	mov	-20(%eax), %ecx
1785	mov	-20(%edx), %ebx
1786	cmp	%ebx, %ecx
1787	jne	L(find_diff)
1788L(16bytes):
1789	mov	-16(%eax), %ecx
1790	mov	-16(%edx), %ebx
1791	cmp	%ebx, %ecx
1792	jne	L(find_diff)
1793L(12bytes):
1794	mov	-12(%eax), %ecx
1795	mov	-12(%edx), %ebx
1796	cmp	%ebx, %ecx
1797	jne	L(find_diff)
1798L(8bytes):
1799	mov	-8(%eax), %ecx
1800	mov	-8(%edx), %ebx
1801	cmp	%ebx, %ecx
1802	jne	L(find_diff)
1803L(4bytes):
1804	mov	-4(%eax), %ecx
1805	mov	-4(%edx), %ebx
1806	cmp	%ebx, %ecx
	/* Pre-load the "equal" result; mov does not change the flags, so the
	   jne still acts on the cmp above.  %eax is no longer needed as a
	   pointer from here on.  */
1807	mov	$0, %eax
1808	jne	L(find_diff)
1809	POP (%ebx)
1810	ret
	/* NOTE(review): CFI_PUSH is defined outside this chunk; presumably it
	   restores the "%ebx pushed" unwind description for the code that
	   follows the ret -- confirm.  */
1811	CFI_PUSH (%ebx)
1812
1813	ALIGN (4)
/* L(45bytes) .. L(5bytes): fall-through chain for remaining counts of the
   form 4k+1.  Entering at L(NNbytes) compares dwords at negative offsets
   from %eax / %edx down to offset -5, then the single last byte at -1.
   An unequal dword goes to L(find_diff) (%ecx = %eax side, %ebx = %edx
   side); an unequal last byte goes straight to L(end) with the flags of
   the byte compare.  Returns 0 if everything matches.  */
1814L(45bytes):
1815	mov	-45(%eax), %ecx
1816	mov	-45(%edx), %ebx
1817	cmp	%ebx, %ecx
1818	jne	L(find_diff)
1819L(41bytes):
1820	mov	-41(%eax), %ecx
1821	mov	-41(%edx), %ebx
1822	cmp	%ebx, %ecx
1823	jne	L(find_diff)
1824L(37bytes):
1825	mov	-37(%eax), %ecx
1826	mov	-37(%edx), %ebx
1827	cmp	%ebx, %ecx
1828	jne	L(find_diff)
1829L(33bytes):
1830	mov	-33(%eax), %ecx
1831	mov	-33(%edx), %ebx
1832	cmp	%ebx, %ecx
1833	jne	L(find_diff)
1834L(29bytes):
1835	mov	-29(%eax), %ecx
1836	mov	-29(%edx), %ebx
1837	cmp	%ebx, %ecx
1838	jne	L(find_diff)
1839L(25bytes):
1840	mov	-25(%eax), %ecx
1841	mov	-25(%edx), %ebx
1842	cmp	%ebx, %ecx
1843	jne	L(find_diff)
1844L(21bytes):
1845	mov	-21(%eax), %ecx
1846	mov	-21(%edx), %ebx
1847	cmp	%ebx, %ecx
1848	jne	L(find_diff)
1849L(17bytes):
1850	mov	-17(%eax), %ecx
1851	mov	-17(%edx), %ebx
1852	cmp	%ebx, %ecx
1853	jne	L(find_diff)
1854L(13bytes):
1855	mov	-13(%eax), %ecx
1856	mov	-13(%edx), %ebx
1857	cmp	%ebx, %ecx
1858	jne	L(find_diff)
1859L(9bytes):
1860	mov	-9(%eax), %ecx
1861	mov	-9(%edx), %ebx
1862	cmp	%ebx, %ecx
1863	jne	L(find_diff)
1864L(5bytes):
1865	mov	-5(%eax), %ecx
1866	mov	-5(%edx), %ebx
1867	cmp	%ebx, %ecx
1868	jne	L(find_diff)
	/* Last byte: compare %eax[-1] (in %cl) against %edx[-1].  */
1869	movzbl	-1(%eax), %ecx
1870	cmp	-1(%edx), %cl
	/* Pre-load the "equal" result; mov leaves the compare flags intact
	   for the jne and for the ja in L(end).  */
1871	mov	$0, %eax
1872	jne	L(end)
1873	POP (%ebx)
1874	ret
	/* NOTE(review): CFI_PUSH is defined outside this chunk; presumably it
	   restores the "%ebx pushed" unwind description for the code that
	   follows the ret -- confirm.  */
1875	CFI_PUSH (%ebx)
1876
1877	ALIGN (4)
/* L(46bytes) .. L(2bytes): fall-through chain for remaining counts of the
   form 4k+2.  Entering at L(NNbytes) compares dwords at negative offsets
   from %eax / %edx down to offset -6, then the final two bytes.
   An unequal dword goes to L(find_diff) (%ecx = %eax side, %ebx = %edx
   side); an unequal byte in the final pair goes to L(end) with the flags
   of that byte compare.  Returns 0 if everything matches.  */
1878L(46bytes):
1879	mov	-46(%eax), %ecx
1880	mov	-46(%edx), %ebx
1881	cmp	%ebx, %ecx
1882	jne	L(find_diff)
1883L(42bytes):
1884	mov	-42(%eax), %ecx
1885	mov	-42(%edx), %ebx
1886	cmp	%ebx, %ecx
1887	jne	L(find_diff)
1888L(38bytes):
1889	mov	-38(%eax), %ecx
1890	mov	-38(%edx), %ebx
1891	cmp	%ebx, %ecx
1892	jne	L(find_diff)
1893L(34bytes):
1894	mov	-34(%eax), %ecx
1895	mov	-34(%edx), %ebx
1896	cmp	%ebx, %ecx
1897	jne	L(find_diff)
1898L(30bytes):
1899	mov	-30(%eax), %ecx
1900	mov	-30(%edx), %ebx
1901	cmp	%ebx, %ecx
1902	jne	L(find_diff)
1903L(26bytes):
1904	mov	-26(%eax), %ecx
1905	mov	-26(%edx), %ebx
1906	cmp	%ebx, %ecx
1907	jne	L(find_diff)
1908L(22bytes):
1909	mov	-22(%eax), %ecx
1910	mov	-22(%edx), %ebx
1911	cmp	%ebx, %ecx
1912	jne	L(find_diff)
1913L(18bytes):
1914	mov	-18(%eax), %ecx
1915	mov	-18(%edx), %ebx
1916	cmp	%ebx, %ecx
1917	jne	L(find_diff)
1918L(14bytes):
1919	mov	-14(%eax), %ecx
1920	mov	-14(%edx), %ebx
1921	cmp	%ebx, %ecx
1922	jne	L(find_diff)
1923L(10bytes):
1924	mov	-10(%eax), %ecx
1925	mov	-10(%edx), %ebx
1926	cmp	%ebx, %ecx
1927	jne	L(find_diff)
1928L(6bytes):
1929	mov	-6(%eax), %ecx
1930	mov	-6(%edx), %ebx
1931	cmp	%ebx, %ecx
1932	jne	L(find_diff)
1933L(2bytes):
	/* Load the last two bytes as one zero-extended word per side.  The
	   low byte (%cl / %bl) is the lower address and is compared first,
	   then the high byte (%ch / %bh).  */
1934	movzwl	-2(%eax), %ecx
1935	movzwl	-2(%edx), %ebx
1936	cmp	%bl, %cl
1937	jne	L(end)
1938	cmp	%bh, %ch
	/* Pre-load the "equal" result; mov leaves the compare flags intact.  */
1939	mov	$0, %eax
1940	jne	L(end)
1941	POP (%ebx)
1942	ret
	/* NOTE(review): CFI_PUSH is defined outside this chunk; presumably it
	   restores the "%ebx pushed" unwind description for the code that
	   follows the ret -- confirm.  */
1943	CFI_PUSH (%ebx)
1944
1945	ALIGN (4)
/* L(47bytes) .. L(3bytes): fall-through chain for remaining counts of the
   form 4k+3.  Entering at L(NNbytes) compares dwords at negative offsets
   from %eax / %edx down to offset -7, then the final three bytes.
   An unequal dword goes to L(find_diff) (%ecx = %eax side, %ebx = %edx
   side); an unequal byte in the final three goes to L(end) with the flags
   of the deciding compare.  Returns 0 if everything matches.  */
1946L(47bytes):
1947	movl	-47(%eax), %ecx
1948	movl	-47(%edx), %ebx
1949	cmp	%ebx, %ecx
1950	jne	L(find_diff)
1951L(43bytes):
1952	movl	-43(%eax), %ecx
1953	movl	-43(%edx), %ebx
1954	cmp	%ebx, %ecx
1955	jne	L(find_diff)
1956L(39bytes):
1957	movl	-39(%eax), %ecx
1958	movl	-39(%edx), %ebx
1959	cmp	%ebx, %ecx
1960	jne	L(find_diff)
1961L(35bytes):
1962	movl	-35(%eax), %ecx
1963	movl	-35(%edx), %ebx
1964	cmp	%ebx, %ecx
1965	jne	L(find_diff)
1966L(31bytes):
1967	movl	-31(%eax), %ecx
1968	movl	-31(%edx), %ebx
1969	cmp	%ebx, %ecx
1970	jne	L(find_diff)
1971L(27bytes):
1972	movl	-27(%eax), %ecx
1973	movl	-27(%edx), %ebx
1974	cmp	%ebx, %ecx
1975	jne	L(find_diff)
1976L(23bytes):
1977	movl	-23(%eax), %ecx
1978	movl	-23(%edx), %ebx
1979	cmp	%ebx, %ecx
1980	jne	L(find_diff)
1981L(19bytes):
1982	movl	-19(%eax), %ecx
1983	movl	-19(%edx), %ebx
1984	cmp	%ebx, %ecx
1985	jne	L(find_diff)
1986L(15bytes):
1987	movl	-15(%eax), %ecx
1988	movl	-15(%edx), %ebx
1989	cmp	%ebx, %ecx
1990	jne	L(find_diff)
1991L(11bytes):
1992	movl	-11(%eax), %ecx
1993	movl	-11(%edx), %ebx
1994	cmp	%ebx, %ecx
1995	jne	L(find_diff)
1996L(7bytes):
1997	movl	-7(%eax), %ecx
1998	movl	-7(%edx), %ebx
1999	cmp	%ebx, %ecx
2000	jne	L(find_diff)
2001L(3bytes):
	/* Bytes -3 and -2 are loaded as one word per side.  The low byte is
	   compared first; once it is known equal, the 16-bit compare is
	   decided by the high byte alone.  */
2002	movzwl	-3(%eax), %ecx
2003	movzwl	-3(%edx), %ebx
2004	cmpb	%bl, %cl
2005	jne	L(end)
2006	cmp	%bx, %cx
2007	jne	L(end)
	/* Last byte.  %eax is overwritten here because it is no longer needed
	   as a pointer; the mov $0 that follows leaves the compare flags
	   intact while pre-loading the "equal" result.  */
2008	movzbl	-1(%eax), %eax
2009	cmpb	-1(%edx), %al
2010	mov	$0, %eax
2011	jne	L(end)
2012	POP (%ebx)
2013	ret
	/* NOTE(review): CFI_PUSH is defined outside this chunk; presumably it
	   restores the "%ebx pushed" unwind description for the code that
	   follows the ret -- confirm.  */
2014	CFI_PUSH (%ebx)
2015
2016	ALIGN (4)
/* L(find_diff): %ecx (from the %eax region) and %ebx (from the %edx
   region) hold two dwords already known to differ.  Find the first
   differing byte in memory order -- the least significant byte is the
   lowest address on x86 -- and leave the flags of that compare for
   L(end).
   L(end): turn the flags of an unsigned byte/word compare into the
   result: 1 if the %eax-side value is above the %edx-side value,
   otherwise -1.  */
2017L(find_diff):
	/* Byte 0, then bytes 0..1 as a word (byte 0 is equal by now, so
	   byte 1 decides).  */
2018	cmpb	%bl, %cl
2019	jne	L(end)
2020	cmp	%bx, %cx
2021	jne	L(end)
	/* Bring bytes 2..3 down and repeat.  Since the dwords differ, the
	   final compare cannot report equality.  */
2022	shr	$16,%ecx
2023	shr	$16,%ebx
2024	cmp	%bl, %cl
2025	jne	L(end)
2026	cmp	%bx, %cx
2027L(end):
	/* The ja below consumes the flags of the compare executed before
	   arriving here; mov does not modify them, and this relies on the
	   POP macro (defined outside this chunk) not modifying them either.  */
2028	POP (%ebx)
2029	mov	$1, %eax
2030	ja	L(bigger)
2031	neg	%eax
2032L(bigger):
2033	ret
2034
2035END (MEMCMP)
2036