1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include "cache.h"
32
33#ifndef MEMCPY
34# define MEMCPY	memcpy
35#endif
36
37#ifndef L
38# define L(label)	.L##label
39#endif
40
41#ifndef cfi_startproc
42# define cfi_startproc	.cfi_startproc
43#endif
44
45#ifndef cfi_endproc
46# define cfi_endproc	.cfi_endproc
47#endif
48
49#ifndef cfi_rel_offset
50# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
51#endif
52
53#ifndef cfi_restore
54# define cfi_restore(reg)	.cfi_restore reg
55#endif
56
57#ifndef cfi_adjust_cfa_offset
58# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
59#endif
60
61#ifndef ENTRY
62# define ENTRY(name)		\
63	.type name,  @function;		\
64	.globl name;		\
65	.p2align 4;		\
66name:		\
67	cfi_startproc
68#endif
69
70#ifndef END
71# define END(name)		\
72	cfi_endproc;		\
73	.size name, .-name
74#endif
75
76#ifdef USE_AS_BCOPY
77# define SRC		PARMS
78# define DEST		SRC+4
79# define LEN		DEST+4
80#else
81# define DEST		PARMS
82# define SRC		DEST+4
83# define LEN		SRC+4
84#endif
85
86#define CFI_PUSH(REG)		\
87  cfi_adjust_cfa_offset (4);		\
88  cfi_rel_offset (REG, 0)
89
90#define CFI_POP(REG)		\
91  cfi_adjust_cfa_offset (-4);		\
92  cfi_restore (REG)
93
94#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
95#define POP(REG)	popl REG; CFI_POP (REG)
96
97#if (defined SHARED || defined __PIC__)
98# define PARMS		8		/* Preserve EBX.  */
99# define ENTRANCE	PUSH (%ebx);
100# define RETURN_END	POP (%ebx); ret
101# define RETURN		RETURN_END; CFI_PUSH (%ebx)
102# define JMPTBL(I, B)	I - B
103
104# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
105
106/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
107	jump table with relative offsets.  INDEX is a register contains the
108	index into the jump table.   SCALE is the scale of INDEX. */
109
110# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
111    /* We first load PC into EBX.  */		\
112	SETUP_PIC_REG(bx);		\
113    /* Get the address of the jump table.  */		\
114	addl	$(TABLE - .), %ebx;		\
115    /* Get the entry and convert the relative offset to the		\
116	absolute	address.  */		\
117	addl	(%ebx, INDEX, SCALE), %ebx;		\
118    /* We loaded the jump table.  Go.  */		\
119	jmp	*%ebx
120#else
121
122# define PARMS		4
123# define ENTRANCE
124# define RETURN_END	ret
125# define RETURN		RETURN_END
126# define JMPTBL(I, B)	I
127
128/* Branch to an entry in a jump table.  TABLE is a jump table with
129	absolute offsets.  INDEX is a register contains the index into the
130	jump table.  SCALE is the scale of INDEX. */
131
132# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
133	jmp	*TABLE(, INDEX, SCALE)
134#endif
135
136	.section .text.ssse3,"ax",@progbits
137ENTRY (MEMCPY)
138	ENTRANCE
139	movl	LEN(%esp), %ecx
140	movl	SRC(%esp), %eax
141	movl	DEST(%esp), %edx
142
143#ifdef USE_AS_MEMMOVE
144	cmp	%eax, %edx
145	jb	L(copy_forward)
146	je	L(fwd_write_0bytes)
147	cmp	$32, %ecx
148	jae	L(memmove_bwd)
149	jmp	L(bk_write_less32bytes_2)
150
151	.p2align 4
152L(memmove_bwd):
153	add	%ecx, %eax
154	cmp	%eax, %edx
155	movl	SRC(%esp), %eax
156	jb	L(copy_backward)
157
158L(copy_forward):
159#endif
160	cmp	$48, %ecx
161	jae	L(48bytesormore)
162
163L(fwd_write_less32bytes):
164#ifndef USE_AS_MEMMOVE
165	cmp	%dl, %al
166	jb	L(bk_write)
167#endif
168	add	%ecx, %edx
169	add	%ecx, %eax
170	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
171#ifndef USE_AS_MEMMOVE
172	.p2align 4
173L(bk_write):
174	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
175#endif
176
177	.p2align 4
178L(48bytesormore):
179#ifndef USE_AS_MEMMOVE
180	movlpd	(%eax), %xmm0
181	movlpd	8(%eax), %xmm1
182	movlpd	%xmm0, (%edx)
183	movlpd	%xmm1, 8(%edx)
184#else
185	movdqu	(%eax), %xmm0
186#endif
187	PUSH (%edi)
188	movl	%edx, %edi
189	and	$-16, %edx
190	add	$16, %edx
191	sub	%edx, %edi
192	add	%edi, %ecx
193	sub	%edi, %eax
194
195#ifdef SHARED_CACHE_SIZE_HALF
196	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
197#else
198# if (defined SHARED || defined __PIC__)
199	SETUP_PIC_REG(bx)
200	add	$_GLOBAL_OFFSET_TABLE_, %ebx
201	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
202# else
203	cmp	__x86_shared_cache_size_half, %ecx
204# endif
205#endif
206
207	mov	%eax, %edi
208	jae	L(large_page)
209	and	$0xf, %edi
210	jz	L(shl_0)
211	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
212
213	.p2align 4
214L(shl_0):
215#ifdef USE_AS_MEMMOVE
216	movl	DEST+4(%esp), %edi
217	movdqu	%xmm0, (%edi)
218#endif
219	xor	%edi, %edi
220	cmp	$127, %ecx
221	ja	L(shl_0_gobble)
222	lea	-32(%ecx), %ecx
223
224	.p2align 4
225L(shl_0_loop):
226	movdqa	(%eax, %edi), %xmm0
227	movdqa	16(%eax, %edi), %xmm1
228	sub	$32, %ecx
229	movdqa	%xmm0, (%edx, %edi)
230	movdqa	%xmm1, 16(%edx, %edi)
231	lea	32(%edi), %edi
232	jb	L(shl_0_end)
233
234	movdqa	(%eax, %edi), %xmm0
235	movdqa	16(%eax, %edi), %xmm1
236	sub	$32, %ecx
237	movdqa	%xmm0, (%edx, %edi)
238	movdqa	%xmm1, 16(%edx, %edi)
239	lea	32(%edi), %edi
240	jb	L(shl_0_end)
241
242	movdqa	(%eax, %edi), %xmm0
243	movdqa	16(%eax, %edi), %xmm1
244	sub	$32, %ecx
245	movdqa	%xmm0, (%edx, %edi)
246	movdqa	%xmm1, 16(%edx, %edi)
247	lea	32(%edi), %edi
248	jb	L(shl_0_end)
249
250	movdqa	(%eax, %edi), %xmm0
251	movdqa	16(%eax, %edi), %xmm1
252	sub	$32, %ecx
253	movdqa	%xmm0, (%edx, %edi)
254	movdqa	%xmm1, 16(%edx, %edi)
255	lea	32(%edi), %edi
256
257L(shl_0_end):
258	lea	32(%ecx), %ecx
259	add	%ecx, %edi
260	add	%edi, %edx
261	add	%edi, %eax
262	POP (%edi)
263	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
264
265	CFI_PUSH (%edi)
266
267	.p2align 4
268L(shl_0_gobble):
269#ifdef DATA_CACHE_SIZE_HALF
270	cmp	$DATA_CACHE_SIZE_HALF, %ecx
271#else
272# if (defined SHARED || defined __PIC__)
273	SETUP_PIC_REG(bx)
274	add	$_GLOBAL_OFFSET_TABLE_, %ebx
275	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
276# else
277	cmp	__x86_data_cache_size_half, %ecx
278# endif
279#endif
280	POP	(%edi)
281	lea	-128(%ecx), %ecx
282	jae	L(shl_0_gobble_mem_loop)
283
284	.p2align 4
285L(shl_0_gobble_cache_loop):
286	movdqa	(%eax), %xmm0
287	movdqa	0x10(%eax), %xmm1
288	movdqa	0x20(%eax), %xmm2
289	movdqa	0x30(%eax), %xmm3
290	movdqa	0x40(%eax), %xmm4
291	movdqa	0x50(%eax), %xmm5
292	movdqa	0x60(%eax), %xmm6
293	movdqa	0x70(%eax), %xmm7
294	lea	0x80(%eax), %eax
295	sub	$128, %ecx
296	movdqa	%xmm0, (%edx)
297	movdqa	%xmm1, 0x10(%edx)
298	movdqa	%xmm2, 0x20(%edx)
299	movdqa	%xmm3, 0x30(%edx)
300	movdqa	%xmm4, 0x40(%edx)
301	movdqa	%xmm5, 0x50(%edx)
302	movdqa	%xmm6, 0x60(%edx)
303	movdqa	%xmm7, 0x70(%edx)
304	lea	0x80(%edx), %edx
305
306	jae	L(shl_0_gobble_cache_loop)
307	cmp	$-0x40, %ecx
308	lea	0x80(%ecx), %ecx
309	jl	L(shl_0_cache_less_64bytes)
310
311	movdqa	(%eax), %xmm0
312	sub	$0x40, %ecx
313	movdqa	0x10(%eax), %xmm1
314	movdqa	%xmm0, (%edx)
315	movdqa	%xmm1, 0x10(%edx)
316	movdqa	0x20(%eax), %xmm0
317	movdqa	0x30(%eax), %xmm1
318	add	$0x40, %eax
319	movdqa	%xmm0, 0x20(%edx)
320	movdqa	%xmm1, 0x30(%edx)
321	add	$0x40, %edx
322
323L(shl_0_cache_less_64bytes):
324	cmp	$0x20, %ecx
325	jb	L(shl_0_cache_less_32bytes)
326	movdqa	(%eax), %xmm0
327	sub	$0x20, %ecx
328	movdqa	0x10(%eax), %xmm1
329	add	$0x20, %eax
330	movdqa	%xmm0, (%edx)
331	movdqa	%xmm1, 0x10(%edx)
332	add	$0x20, %edx
333
334L(shl_0_cache_less_32bytes):
335	cmp	$0x10, %ecx
336	jb	L(shl_0_cache_less_16bytes)
337	sub	$0x10, %ecx
338	movdqa	(%eax), %xmm0
339	add	$0x10, %eax
340	movdqa	%xmm0, (%edx)
341	add	$0x10, %edx
342
343L(shl_0_cache_less_16bytes):
344	add	%ecx, %edx
345	add	%ecx, %eax
346	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
347
348	.p2align 4
349L(shl_0_gobble_mem_loop):
350	prefetcht0 0x1c0(%eax)
351	prefetcht0 0x280(%eax)
352	prefetcht0 0x1c0(%edx)
353
354	movdqa	(%eax), %xmm0
355	movdqa	0x10(%eax), %xmm1
356	movdqa	0x20(%eax), %xmm2
357	movdqa	0x30(%eax), %xmm3
358	movdqa	0x40(%eax), %xmm4
359	movdqa	0x50(%eax), %xmm5
360	movdqa	0x60(%eax), %xmm6
361	movdqa	0x70(%eax), %xmm7
362	lea	0x80(%eax), %eax
363	sub	$0x80, %ecx
364	movdqa	%xmm0, (%edx)
365	movdqa	%xmm1, 0x10(%edx)
366	movdqa	%xmm2, 0x20(%edx)
367	movdqa	%xmm3, 0x30(%edx)
368	movdqa	%xmm4, 0x40(%edx)
369	movdqa	%xmm5, 0x50(%edx)
370	movdqa	%xmm6, 0x60(%edx)
371	movdqa	%xmm7, 0x70(%edx)
372	lea	0x80(%edx), %edx
373
374	jae	L(shl_0_gobble_mem_loop)
375	cmp	$-0x40, %ecx
376	lea	0x80(%ecx), %ecx
377	jl	L(shl_0_mem_less_64bytes)
378
379	movdqa	(%eax), %xmm0
380	sub	$0x40, %ecx
381	movdqa	0x10(%eax), %xmm1
382
383	movdqa	%xmm0, (%edx)
384	movdqa	%xmm1, 0x10(%edx)
385
386	movdqa	0x20(%eax), %xmm0
387	movdqa	0x30(%eax), %xmm1
388	add	$0x40, %eax
389
390	movdqa	%xmm0, 0x20(%edx)
391	movdqa	%xmm1, 0x30(%edx)
392	add	$0x40, %edx
393
394L(shl_0_mem_less_64bytes):
395	cmp	$0x20, %ecx
396	jb	L(shl_0_mem_less_32bytes)
397	movdqa	(%eax), %xmm0
398	sub	$0x20, %ecx
399	movdqa	0x10(%eax), %xmm1
400	add	$0x20, %eax
401	movdqa	%xmm0, (%edx)
402	movdqa	%xmm1, 0x10(%edx)
403	add	$0x20, %edx
404
405L(shl_0_mem_less_32bytes):
406	cmp	$0x10, %ecx
407	jb	L(shl_0_mem_less_16bytes)
408	sub	$0x10, %ecx
409	movdqa	(%eax), %xmm0
410	add	$0x10, %eax
411	movdqa	%xmm0, (%edx)
412	add	$0x10, %edx
413
414L(shl_0_mem_less_16bytes):
415	add	%ecx, %edx
416	add	%ecx, %eax
417	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
418
419	.p2align 4
420L(shl_1):
421#ifndef USE_AS_MEMMOVE
422	movaps	-1(%eax), %xmm1
423#else
424	movl	DEST+4(%esp), %edi
425	movaps	-1(%eax), %xmm1
426	movdqu	%xmm0, (%edi)
427#endif
428#ifdef DATA_CACHE_SIZE_HALF
429	cmp	$DATA_CACHE_SIZE_HALF, %ecx
430#else
431# if (defined SHARED || defined __PIC__)
432	SETUP_PIC_REG(bx)
433	add	$_GLOBAL_OFFSET_TABLE_, %ebx
434	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
435# else
436	cmp	__x86_data_cache_size_half, %ecx
437# endif
438#endif
439	jb L(sh_1_no_prefetch)
440
441	lea	-64(%ecx), %ecx
442
443	.p2align 4
444L(Shl1LoopStart):
445	prefetcht0 0x1c0(%eax)
446	prefetcht0 0x1c0(%edx)
447	movaps	15(%eax), %xmm2
448	movaps	31(%eax), %xmm3
449	movaps	47(%eax), %xmm4
450	movaps	63(%eax), %xmm5
451	movaps	%xmm5, %xmm7
452	palignr	$1, %xmm4, %xmm5
453	palignr	$1, %xmm3, %xmm4
454	movaps	%xmm5, 48(%edx)
455	palignr	$1, %xmm2, %xmm3
456	lea	64(%eax), %eax
457	palignr	$1, %xmm1, %xmm2
458	movaps	%xmm4, 32(%edx)
459	movaps	%xmm3, 16(%edx)
460	movaps	%xmm7, %xmm1
461	movaps	%xmm2, (%edx)
462	lea	64(%edx), %edx
463	sub	$64, %ecx
464	ja	L(Shl1LoopStart)
465
466L(Shl1LoopLeave):
467	add	$32, %ecx
468	jle	L(shl_end_0)
469
470	movaps	15(%eax), %xmm2
471	movaps	31(%eax), %xmm3
472	palignr	$1, %xmm2, %xmm3
473	palignr	$1, %xmm1, %xmm2
474	movaps	%xmm2, (%edx)
475	movaps	%xmm3, 16(%edx)
476	lea	32(%edx, %ecx), %edx
477	lea	32(%eax, %ecx), %eax
478	POP (%edi)
479	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
480
481	CFI_PUSH (%edi)
482
483	.p2align 4
484L(sh_1_no_prefetch):
485	lea	-32(%ecx), %ecx
486	lea	-1(%eax), %eax
487	xor	%edi, %edi
488
489	.p2align 4
490L(sh_1_no_prefetch_loop):
491	movdqa	16(%eax, %edi), %xmm2
492	sub	$32, %ecx
493	movdqa	32(%eax, %edi), %xmm3
494	movdqa	%xmm3, %xmm4
495	palignr	$1, %xmm2, %xmm3
496	palignr	$1, %xmm1, %xmm2
497	lea	32(%edi), %edi
498	movdqa	%xmm2, -32(%edx, %edi)
499	movdqa	%xmm3, -16(%edx, %edi)
500	jb	L(sh_1_end_no_prefetch_loop)
501
502	movdqa	16(%eax, %edi), %xmm2
503	sub	$32, %ecx
504	movdqa	32(%eax, %edi), %xmm3
505	movdqa	%xmm3, %xmm1
506	palignr	$1, %xmm2, %xmm3
507	palignr	$1, %xmm4, %xmm2
508	lea	32(%edi), %edi
509	movdqa	%xmm2, -32(%edx, %edi)
510	movdqa	%xmm3, -16(%edx, %edi)
511	jae	L(sh_1_no_prefetch_loop)
512
513L(sh_1_end_no_prefetch_loop):
514	lea	32(%ecx), %ecx
515	add	%ecx, %edi
516	add	%edi, %edx
517	lea	1(%edi, %eax), %eax
518	POP	(%edi)
519	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
520
521	CFI_PUSH (%edi)
522
523	.p2align 4
524L(shl_2):
525#ifndef USE_AS_MEMMOVE
526	movaps	-2(%eax), %xmm1
527#else
528	movl	DEST+4(%esp), %edi
529	movaps	-2(%eax), %xmm1
530	movdqu	%xmm0, (%edi)
531#endif
532#ifdef DATA_CACHE_SIZE_HALF
533	cmp	$DATA_CACHE_SIZE_HALF, %ecx
534#else
535# if (defined SHARED || defined __PIC__)
536	SETUP_PIC_REG(bx)
537	add	$_GLOBAL_OFFSET_TABLE_, %ebx
538	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
539# else
540	cmp	__x86_data_cache_size_half, %ecx
541# endif
542#endif
543	jb L(sh_2_no_prefetch)
544
545	lea	-64(%ecx), %ecx
546
547	.p2align 4
548L(Shl2LoopStart):
549	prefetcht0 0x1c0(%eax)
550	prefetcht0 0x1c0(%edx)
551	movaps	14(%eax), %xmm2
552	movaps	30(%eax), %xmm3
553	movaps	46(%eax), %xmm4
554	movaps	62(%eax), %xmm5
555	movaps	%xmm5, %xmm7
556	palignr	$2, %xmm4, %xmm5
557	palignr	$2, %xmm3, %xmm4
558	movaps	%xmm5, 48(%edx)
559	palignr	$2, %xmm2, %xmm3
560	lea	64(%eax), %eax
561	palignr	$2, %xmm1, %xmm2
562	movaps	%xmm4, 32(%edx)
563	movaps	%xmm3, 16(%edx)
564	movaps	%xmm7, %xmm1
565	movaps	%xmm2, (%edx)
566	lea	64(%edx), %edx
567	sub	$64, %ecx
568	ja	L(Shl2LoopStart)
569
570L(Shl2LoopLeave):
571	add	$32, %ecx
572	jle	L(shl_end_0)
573
574	movaps	14(%eax), %xmm2
575	movaps	30(%eax), %xmm3
576	palignr	$2, %xmm2, %xmm3
577	palignr	$2, %xmm1, %xmm2
578	movaps	%xmm2, (%edx)
579	movaps	%xmm3, 16(%edx)
580	lea	32(%edx, %ecx), %edx
581	lea	32(%eax, %ecx), %eax
582	POP (%edi)
583	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
584
585	CFI_PUSH (%edi)
586
587	.p2align 4
588L(sh_2_no_prefetch):
589	lea	-32(%ecx), %ecx
590	lea	-2(%eax), %eax
591	xor	%edi, %edi
592
593	.p2align 4
594L(sh_2_no_prefetch_loop):
595	movdqa	16(%eax, %edi), %xmm2
596	sub	$32, %ecx
597	movdqa	32(%eax, %edi), %xmm3
598	movdqa	%xmm3, %xmm4
599	palignr	$2, %xmm2, %xmm3
600	palignr	$2, %xmm1, %xmm2
601	lea	32(%edi), %edi
602	movdqa	%xmm2, -32(%edx, %edi)
603	movdqa	%xmm3, -16(%edx, %edi)
604	jb	L(sh_2_end_no_prefetch_loop)
605
606	movdqa	16(%eax, %edi), %xmm2
607	sub	$32, %ecx
608	movdqa	32(%eax, %edi), %xmm3
609	movdqa	%xmm3, %xmm1
610	palignr	$2, %xmm2, %xmm3
611	palignr	$2, %xmm4, %xmm2
612	lea	32(%edi), %edi
613	movdqa	%xmm2, -32(%edx, %edi)
614	movdqa	%xmm3, -16(%edx, %edi)
615	jae	L(sh_2_no_prefetch_loop)
616
617L(sh_2_end_no_prefetch_loop):
618	lea	32(%ecx), %ecx
619	add	%ecx, %edi
620	add	%edi, %edx
621	lea	2(%edi, %eax), %eax
622	POP	(%edi)
623	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
624
625	CFI_PUSH (%edi)
626
627	.p2align 4
628L(shl_3):
629#ifndef USE_AS_MEMMOVE
630	movaps	-3(%eax), %xmm1
631#else
632	movl	DEST+4(%esp), %edi
633	movaps	-3(%eax), %xmm1
634	movdqu	%xmm0, (%edi)
635#endif
636#ifdef DATA_CACHE_SIZE_HALF
637	cmp	$DATA_CACHE_SIZE_HALF, %ecx
638#else
639# if (defined SHARED || defined __PIC__)
640	SETUP_PIC_REG(bx)
641	add	$_GLOBAL_OFFSET_TABLE_, %ebx
642	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
643# else
644	cmp	__x86_data_cache_size_half, %ecx
645# endif
646#endif
647	jb L(sh_3_no_prefetch)
648
649	lea	-64(%ecx), %ecx
650
651	.p2align 4
652L(Shl3LoopStart):
653	prefetcht0 0x1c0(%eax)
654	prefetcht0 0x1c0(%edx)
655	movaps	13(%eax), %xmm2
656	movaps	29(%eax), %xmm3
657	movaps	45(%eax), %xmm4
658	movaps	61(%eax), %xmm5
659	movaps	%xmm5, %xmm7
660	palignr	$3, %xmm4, %xmm5
661	palignr	$3, %xmm3, %xmm4
662	movaps	%xmm5, 48(%edx)
663	palignr	$3, %xmm2, %xmm3
664	lea	64(%eax), %eax
665	palignr	$3, %xmm1, %xmm2
666	movaps	%xmm4, 32(%edx)
667	movaps	%xmm3, 16(%edx)
668	movaps	%xmm7, %xmm1
669	movaps	%xmm2, (%edx)
670	lea	64(%edx), %edx
671	sub	$64, %ecx
672	ja	L(Shl3LoopStart)
673
674L(Shl3LoopLeave):
675	add	$32, %ecx
676	jle	L(shl_end_0)
677
678	movaps	13(%eax), %xmm2
679	movaps	29(%eax), %xmm3
680	palignr	$3, %xmm2, %xmm3
681	palignr	$3, %xmm1, %xmm2
682	movaps	%xmm2, (%edx)
683	movaps	%xmm3, 16(%edx)
684	lea	32(%edx, %ecx), %edx
685	lea	32(%eax, %ecx), %eax
686	POP (%edi)
687	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
688
689	CFI_PUSH (%edi)
690
691	.p2align 4
692L(sh_3_no_prefetch):
693	lea	-32(%ecx), %ecx
694	lea	-3(%eax), %eax
695	xor	%edi, %edi
696
697	.p2align 4
698L(sh_3_no_prefetch_loop):
699	movdqa	16(%eax, %edi), %xmm2
700	sub	$32, %ecx
701	movdqa	32(%eax, %edi), %xmm3
702	movdqa	%xmm3, %xmm4
703	palignr	$3, %xmm2, %xmm3
704	palignr	$3, %xmm1, %xmm2
705	lea	32(%edi), %edi
706	movdqa	%xmm2, -32(%edx, %edi)
707	movdqa	%xmm3, -16(%edx, %edi)
708
709	jb	L(sh_3_end_no_prefetch_loop)
710
711	movdqa	16(%eax, %edi), %xmm2
712	sub	$32, %ecx
713	movdqa	32(%eax, %edi), %xmm3
714	movdqa	%xmm3, %xmm1
715	palignr	$3, %xmm2, %xmm3
716	palignr	$3, %xmm4, %xmm2
717	lea	32(%edi), %edi
718	movdqa	%xmm2, -32(%edx, %edi)
719	movdqa	%xmm3, -16(%edx, %edi)
720
721	jae	L(sh_3_no_prefetch_loop)
722
723L(sh_3_end_no_prefetch_loop):
724	lea	32(%ecx), %ecx
725	add	%ecx, %edi
726	add	%edi, %edx
727	lea	3(%edi, %eax), %eax
728	POP	(%edi)
729	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
730
731	CFI_PUSH (%edi)
732
733	.p2align 4
734L(shl_4):
735#ifndef USE_AS_MEMMOVE
736	movaps	-4(%eax), %xmm1
737#else
738	movl	DEST+4(%esp), %edi
739	movaps	-4(%eax), %xmm1
740	movdqu	%xmm0, (%edi)
741#endif
742#ifdef DATA_CACHE_SIZE_HALF
743	cmp	$DATA_CACHE_SIZE_HALF, %ecx
744#else
745# if (defined SHARED || defined __PIC__)
746	SETUP_PIC_REG(bx)
747	add	$_GLOBAL_OFFSET_TABLE_, %ebx
748	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
749# else
750	cmp	__x86_data_cache_size_half, %ecx
751# endif
752#endif
753	jb L(sh_4_no_prefetch)
754
755	lea	-64(%ecx), %ecx
756
757	.p2align 4
758L(Shl4LoopStart):
759	prefetcht0 0x1c0(%eax)
760	prefetcht0 0x1c0(%edx)
761	movaps	12(%eax), %xmm2
762	movaps	28(%eax), %xmm3
763	movaps	44(%eax), %xmm4
764	movaps	60(%eax), %xmm5
765	movaps	%xmm5, %xmm7
766	palignr	$4, %xmm4, %xmm5
767	palignr	$4, %xmm3, %xmm4
768	movaps	%xmm5, 48(%edx)
769	palignr	$4, %xmm2, %xmm3
770	lea	64(%eax), %eax
771	palignr	$4, %xmm1, %xmm2
772	movaps	%xmm4, 32(%edx)
773	movaps	%xmm3, 16(%edx)
774	movaps	%xmm7, %xmm1
775	movaps	%xmm2, (%edx)
776	lea	64(%edx), %edx
777	sub	$64, %ecx
778	ja	L(Shl4LoopStart)
779
780L(Shl4LoopLeave):
781	add	$32, %ecx
782	jle	L(shl_end_0)
783
784	movaps	12(%eax), %xmm2
785	movaps	28(%eax), %xmm3
786	palignr	$4, %xmm2, %xmm3
787	palignr	$4, %xmm1, %xmm2
788	movaps	%xmm2, (%edx)
789	movaps	%xmm3, 16(%edx)
790	lea	32(%edx, %ecx), %edx
791	lea	32(%eax, %ecx), %eax
792	POP (%edi)
793	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
794
795	CFI_PUSH (%edi)
796
797	.p2align 4
798L(sh_4_no_prefetch):
799	lea	-32(%ecx), %ecx
800	lea	-4(%eax), %eax
801	xor	%edi, %edi
802
803	.p2align 4
804L(sh_4_no_prefetch_loop):
805	movdqa	16(%eax, %edi), %xmm2
806	sub	$32, %ecx
807	movdqa	32(%eax, %edi), %xmm3
808	movdqa	%xmm3, %xmm4
809	palignr	$4, %xmm2, %xmm3
810	palignr	$4, %xmm1, %xmm2
811	lea	32(%edi), %edi
812	movdqa	%xmm2, -32(%edx, %edi)
813	movdqa	%xmm3, -16(%edx, %edi)
814
815	jb	L(sh_4_end_no_prefetch_loop)
816
817	movdqa	16(%eax, %edi), %xmm2
818	sub	$32, %ecx
819	movdqa	32(%eax, %edi), %xmm3
820	movdqa	%xmm3, %xmm1
821	palignr	$4, %xmm2, %xmm3
822	palignr	$4, %xmm4, %xmm2
823	lea	32(%edi), %edi
824	movdqa	%xmm2, -32(%edx, %edi)
825	movdqa	%xmm3, -16(%edx, %edi)
826
827	jae	L(sh_4_no_prefetch_loop)
828
829L(sh_4_end_no_prefetch_loop):
830	lea	32(%ecx), %ecx
831	add	%ecx, %edi
832	add	%edi, %edx
833	lea	4(%edi, %eax), %eax
834	POP	(%edi)
835	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
836
837	CFI_PUSH (%edi)
838
839	.p2align 4
840L(shl_5):
841#ifndef USE_AS_MEMMOVE
842	movaps	-5(%eax), %xmm1
843#else
844	movl	DEST+4(%esp), %edi
845	movaps	-5(%eax), %xmm1
846	movdqu	%xmm0, (%edi)
847#endif
848#ifdef DATA_CACHE_SIZE_HALF
849	cmp	$DATA_CACHE_SIZE_HALF, %ecx
850#else
851# if (defined SHARED || defined __PIC__)
852	SETUP_PIC_REG(bx)
853	add	$_GLOBAL_OFFSET_TABLE_, %ebx
854	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
855# else
856	cmp	__x86_data_cache_size_half, %ecx
857# endif
858#endif
859	jb L(sh_5_no_prefetch)
860
861	lea	-64(%ecx), %ecx
862
863	.p2align 4
864L(Shl5LoopStart):
865	prefetcht0 0x1c0(%eax)
866	prefetcht0 0x1c0(%edx)
867	movaps	11(%eax), %xmm2
868	movaps	27(%eax), %xmm3
869	movaps	43(%eax), %xmm4
870	movaps	59(%eax), %xmm5
871	movaps	%xmm5, %xmm7
872	palignr	$5, %xmm4, %xmm5
873	palignr	$5, %xmm3, %xmm4
874	movaps	%xmm5, 48(%edx)
875	palignr	$5, %xmm2, %xmm3
876	lea	64(%eax), %eax
877	palignr	$5, %xmm1, %xmm2
878	movaps	%xmm4, 32(%edx)
879	movaps	%xmm3, 16(%edx)
880	movaps	%xmm7, %xmm1
881	movaps	%xmm2, (%edx)
882	lea	64(%edx), %edx
883	sub	$64, %ecx
884	ja	L(Shl5LoopStart)
885
886L(Shl5LoopLeave):
887	add	$32, %ecx
888	jle	L(shl_end_0)
889
890	movaps	11(%eax), %xmm2
891	movaps	27(%eax), %xmm3
892	palignr	$5, %xmm2, %xmm3
893	palignr	$5, %xmm1, %xmm2
894	movaps	%xmm2, (%edx)
895	movaps	%xmm3, 16(%edx)
896	lea	32(%edx, %ecx), %edx
897	lea	32(%eax, %ecx), %eax
898	POP (%edi)
899	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
900
901	CFI_PUSH (%edi)
902
903	.p2align 4
904L(sh_5_no_prefetch):
905	lea	-32(%ecx), %ecx
906	lea	-5(%eax), %eax
907	xor	%edi, %edi
908
909	.p2align 4
910L(sh_5_no_prefetch_loop):
911	movdqa	16(%eax, %edi), %xmm2
912	sub	$32, %ecx
913	movdqa	32(%eax, %edi), %xmm3
914	movdqa	%xmm3, %xmm4
915	palignr	$5, %xmm2, %xmm3
916	palignr	$5, %xmm1, %xmm2
917	lea	32(%edi), %edi
918	movdqa	%xmm2, -32(%edx, %edi)
919	movdqa	%xmm3, -16(%edx, %edi)
920
921	jb	L(sh_5_end_no_prefetch_loop)
922
923	movdqa	16(%eax, %edi), %xmm2
924	sub	$32, %ecx
925	movdqa	32(%eax, %edi), %xmm3
926	movdqa	%xmm3, %xmm1
927	palignr	$5, %xmm2, %xmm3
928	palignr	$5, %xmm4, %xmm2
929	lea	32(%edi), %edi
930	movdqa	%xmm2, -32(%edx, %edi)
931	movdqa	%xmm3, -16(%edx, %edi)
932
933	jae	L(sh_5_no_prefetch_loop)
934
935L(sh_5_end_no_prefetch_loop):
936	lea	32(%ecx), %ecx
937	add	%ecx, %edi
938	add	%edi, %edx
939	lea	5(%edi, %eax), %eax
940	POP	(%edi)
941	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
942
943	CFI_PUSH (%edi)
944
945	.p2align 4
946L(shl_6):
947#ifndef USE_AS_MEMMOVE
948	movaps	-6(%eax), %xmm1
949#else
950	movl	DEST+4(%esp), %edi
951	movaps	-6(%eax), %xmm1
952	movdqu	%xmm0, (%edi)
953#endif
954#ifdef DATA_CACHE_SIZE_HALF
955	cmp	$DATA_CACHE_SIZE_HALF, %ecx
956#else
957# if (defined SHARED || defined __PIC__)
958	SETUP_PIC_REG(bx)
959	add	$_GLOBAL_OFFSET_TABLE_, %ebx
960	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
961# else
962	cmp	__x86_data_cache_size_half, %ecx
963# endif
964#endif
965	jb L(sh_6_no_prefetch)
966
967	lea	-64(%ecx), %ecx
968
969	.p2align 4
970L(Shl6LoopStart):
971	prefetcht0 0x1c0(%eax)
972	prefetcht0 0x1c0(%edx)
973	movaps	10(%eax), %xmm2
974	movaps	26(%eax), %xmm3
975	movaps	42(%eax), %xmm4
976	movaps	58(%eax), %xmm5
977	movaps	%xmm5, %xmm7
978	palignr	$6, %xmm4, %xmm5
979	palignr	$6, %xmm3, %xmm4
980	movaps	%xmm5, 48(%edx)
981	palignr	$6, %xmm2, %xmm3
982	lea	64(%eax), %eax
983	palignr	$6, %xmm1, %xmm2
984	movaps	%xmm4, 32(%edx)
985	movaps	%xmm3, 16(%edx)
986	movaps	%xmm7, %xmm1
987	movaps	%xmm2, (%edx)
988	lea	64(%edx), %edx
989	sub	$64, %ecx
990	ja	L(Shl6LoopStart)
991
992L(Shl6LoopLeave):
993	add	$32, %ecx
994	jle	L(shl_end_0)
995
996	movaps	10(%eax), %xmm2
997	movaps	26(%eax), %xmm3
998	palignr	$6, %xmm2, %xmm3
999	palignr	$6, %xmm1, %xmm2
1000	movaps	%xmm2, (%edx)
1001	movaps	%xmm3, 16(%edx)
1002	lea	32(%edx, %ecx), %edx
1003	lea	32(%eax, %ecx), %eax
1004	POP (%edi)
1005	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1006
1007	CFI_PUSH (%edi)
1008
1009	.p2align 4
1010L(sh_6_no_prefetch):
1011	lea	-32(%ecx), %ecx
1012	lea	-6(%eax), %eax
1013	xor	%edi, %edi
1014
1015	.p2align 4
1016L(sh_6_no_prefetch_loop):
1017	movdqa	16(%eax, %edi), %xmm2
1018	sub	$32, %ecx
1019	movdqa	32(%eax, %edi), %xmm3
1020	movdqa	%xmm3, %xmm4
1021	palignr	$6, %xmm2, %xmm3
1022	palignr	$6, %xmm1, %xmm2
1023	lea	32(%edi), %edi
1024	movdqa	%xmm2, -32(%edx, %edi)
1025	movdqa	%xmm3, -16(%edx, %edi)
1026
1027	jb	L(sh_6_end_no_prefetch_loop)
1028
1029	movdqa	16(%eax, %edi), %xmm2
1030	sub	$32, %ecx
1031	movdqa	32(%eax, %edi), %xmm3
1032	movdqa	%xmm3, %xmm1
1033	palignr	$6, %xmm2, %xmm3
1034	palignr	$6, %xmm4, %xmm2
1035	lea	32(%edi), %edi
1036	movdqa	%xmm2, -32(%edx, %edi)
1037	movdqa	%xmm3, -16(%edx, %edi)
1038
1039	jae	L(sh_6_no_prefetch_loop)
1040
1041L(sh_6_end_no_prefetch_loop):
1042	lea	32(%ecx), %ecx
1043	add	%ecx, %edi
1044	add	%edi, %edx
1045	lea	6(%edi, %eax), %eax
1046	POP	(%edi)
1047	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1048
1049	CFI_PUSH (%edi)
1050
1051	.p2align 4
1052L(shl_7):
1053#ifndef USE_AS_MEMMOVE
1054	movaps	-7(%eax), %xmm1
1055#else
1056	movl	DEST+4(%esp), %edi
1057	movaps	-7(%eax), %xmm1
1058	movdqu	%xmm0, (%edi)
1059#endif
1060#ifdef DATA_CACHE_SIZE_HALF
1061	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1062#else
1063# if (defined SHARED || defined __PIC__)
1064	SETUP_PIC_REG(bx)
1065	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1066	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1067# else
1068	cmp	__x86_data_cache_size_half, %ecx
1069# endif
1070#endif
1071	jb L(sh_7_no_prefetch)
1072
1073	lea	-64(%ecx), %ecx
1074
1075	.p2align 4
1076L(Shl7LoopStart):
1077	prefetcht0 0x1c0(%eax)
1078	prefetcht0 0x1c0(%edx)
1079	movaps	9(%eax), %xmm2
1080	movaps	25(%eax), %xmm3
1081	movaps	41(%eax), %xmm4
1082	movaps	57(%eax), %xmm5
1083	movaps	%xmm5, %xmm7
1084	palignr	$7, %xmm4, %xmm5
1085	palignr	$7, %xmm3, %xmm4
1086	movaps	%xmm5, 48(%edx)
1087	palignr	$7, %xmm2, %xmm3
1088	lea	64(%eax), %eax
1089	palignr	$7, %xmm1, %xmm2
1090	movaps	%xmm4, 32(%edx)
1091	movaps	%xmm3, 16(%edx)
1092	movaps	%xmm7, %xmm1
1093	movaps	%xmm2, (%edx)
1094	lea	64(%edx), %edx
1095	sub	$64, %ecx
1096	ja	L(Shl7LoopStart)
1097
1098L(Shl7LoopLeave):
1099	add	$32, %ecx
1100	jle	L(shl_end_0)
1101
1102	movaps	9(%eax), %xmm2
1103	movaps	25(%eax), %xmm3
1104	palignr	$7, %xmm2, %xmm3
1105	palignr	$7, %xmm1, %xmm2
1106	movaps	%xmm2, (%edx)
1107	movaps	%xmm3, 16(%edx)
1108	lea	32(%edx, %ecx), %edx
1109	lea	32(%eax, %ecx), %eax
1110	POP (%edi)
1111	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1112
1113	CFI_PUSH (%edi)
1114
1115	.p2align 4
1116L(sh_7_no_prefetch):
1117	lea	-32(%ecx), %ecx
1118	lea	-7(%eax), %eax
1119	xor	%edi, %edi
1120
1121	.p2align 4
1122L(sh_7_no_prefetch_loop):
1123	movdqa	16(%eax, %edi), %xmm2
1124	sub	$32, %ecx
1125	movdqa	32(%eax, %edi), %xmm3
1126	movdqa	%xmm3, %xmm4
1127	palignr	$7, %xmm2, %xmm3
1128	palignr	$7, %xmm1, %xmm2
1129	lea	32(%edi), %edi
1130	movdqa	%xmm2, -32(%edx, %edi)
1131	movdqa	%xmm3, -16(%edx, %edi)
1132	jb	L(sh_7_end_no_prefetch_loop)
1133
1134	movdqa	16(%eax, %edi), %xmm2
1135	sub	$32, %ecx
1136	movdqa	32(%eax, %edi), %xmm3
1137	movdqa	%xmm3, %xmm1
1138	palignr	$7, %xmm2, %xmm3
1139	palignr	$7, %xmm4, %xmm2
1140	lea	32(%edi), %edi
1141	movdqa	%xmm2, -32(%edx, %edi)
1142	movdqa	%xmm3, -16(%edx, %edi)
1143	jae	L(sh_7_no_prefetch_loop)
1144
1145L(sh_7_end_no_prefetch_loop):
1146	lea	32(%ecx), %ecx
1147	add	%ecx, %edi
1148	add	%edi, %edx
1149	lea	7(%edi, %eax), %eax
1150	POP	(%edi)
1151	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1152
1153	CFI_PUSH (%edi)
1154
1155	.p2align 4
1156L(shl_8):
1157#ifndef USE_AS_MEMMOVE
1158	movaps	-8(%eax), %xmm1
1159#else
1160	movl	DEST+4(%esp), %edi
1161	movaps	-8(%eax), %xmm1
1162	movdqu	%xmm0, (%edi)
1163#endif
1164#ifdef DATA_CACHE_SIZE_HALF
1165	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1166#else
1167# if (defined SHARED || defined __PIC__)
1168	SETUP_PIC_REG(bx)
1169	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1170	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1171# else
1172	cmp	__x86_data_cache_size_half, %ecx
1173# endif
1174#endif
1175	jb L(sh_8_no_prefetch)
1176
1177	lea	-64(%ecx), %ecx
1178
1179	.p2align 4
1180L(Shl8LoopStart):
1181	prefetcht0 0x1c0(%eax)
1182	prefetcht0 0x1c0(%edx)
1183	movaps	8(%eax), %xmm2
1184	movaps	24(%eax), %xmm3
1185	movaps	40(%eax), %xmm4
1186	movaps	56(%eax), %xmm5
1187	movaps	%xmm5, %xmm7
1188	palignr	$8, %xmm4, %xmm5
1189	palignr	$8, %xmm3, %xmm4
1190	movaps	%xmm5, 48(%edx)
1191	palignr	$8, %xmm2, %xmm3
1192	lea	64(%eax), %eax
1193	palignr	$8, %xmm1, %xmm2
1194	movaps	%xmm4, 32(%edx)
1195	movaps	%xmm3, 16(%edx)
1196	movaps	%xmm7, %xmm1
1197	movaps	%xmm2, (%edx)
1198	lea	64(%edx), %edx
1199	sub	$64, %ecx
1200	ja	L(Shl8LoopStart)
1201
1202L(LoopLeave8):
1203	add	$32, %ecx
1204	jle	L(shl_end_0)
1205
1206	movaps	8(%eax), %xmm2
1207	movaps	24(%eax), %xmm3
1208	palignr	$8, %xmm2, %xmm3
1209	palignr	$8, %xmm1, %xmm2
1210	movaps	%xmm2, (%edx)
1211	movaps	%xmm3, 16(%edx)
1212	lea	32(%edx, %ecx), %edx
1213	lea	32(%eax, %ecx), %eax
1214	POP (%edi)
1215	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1216
1217	CFI_PUSH (%edi)
1218
1219	.p2align 4
1220L(sh_8_no_prefetch):
1221	lea	-32(%ecx), %ecx
1222	lea	-8(%eax), %eax
1223	xor	%edi, %edi
1224
1225	.p2align 4
1226L(sh_8_no_prefetch_loop):
1227	movdqa	16(%eax, %edi), %xmm2
1228	sub	$32, %ecx
1229	movdqa	32(%eax, %edi), %xmm3
1230	movdqa	%xmm3, %xmm4
1231	palignr	$8, %xmm2, %xmm3
1232	palignr	$8, %xmm1, %xmm2
1233	lea	32(%edi), %edi
1234	movdqa	%xmm2, -32(%edx, %edi)
1235	movdqa	%xmm3, -16(%edx, %edi)
1236	jb	L(sh_8_end_no_prefetch_loop)
1237
1238	movdqa	16(%eax, %edi), %xmm2
1239	sub	$32, %ecx
1240	movdqa	32(%eax, %edi), %xmm3
1241	movdqa	%xmm3, %xmm1
1242	palignr	$8, %xmm2, %xmm3
1243	palignr	$8, %xmm4, %xmm2
1244	lea	32(%edi), %edi
1245	movdqa	%xmm2, -32(%edx, %edi)
1246	movdqa	%xmm3, -16(%edx, %edi)
1247	jae	L(sh_8_no_prefetch_loop)
1248
1249L(sh_8_end_no_prefetch_loop):
1250	lea	32(%ecx), %ecx
1251	add	%ecx, %edi
1252	add	%edi, %edx
1253	lea	8(%edi, %eax), %eax
1254	POP	(%edi)
1255	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1256
1257	CFI_PUSH (%edi)
1258
1259	.p2align 4
1260L(shl_9):
1261#ifndef USE_AS_MEMMOVE
1262	movaps	-9(%eax), %xmm1
1263#else
1264	movl	DEST+4(%esp), %edi
1265	movaps	-9(%eax), %xmm1
1266	movdqu	%xmm0, (%edi)
1267#endif
1268#ifdef DATA_CACHE_SIZE_HALF
1269	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1270#else
1271# if (defined SHARED || defined __PIC__)
1272	SETUP_PIC_REG(bx)
1273	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1274	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1275# else
1276	cmp	__x86_data_cache_size_half, %ecx
1277# endif
1278#endif
1279	jb L(sh_9_no_prefetch)
1280
1281	lea	-64(%ecx), %ecx
1282
1283	.p2align 4
1284L(Shl9LoopStart):
1285	prefetcht0 0x1c0(%eax)
1286	prefetcht0 0x1c0(%edx)
1287	movaps	7(%eax), %xmm2
1288	movaps	23(%eax), %xmm3
1289	movaps	39(%eax), %xmm4
1290	movaps	55(%eax), %xmm5
1291	movaps	%xmm5, %xmm7
1292	palignr	$9, %xmm4, %xmm5
1293	palignr	$9, %xmm3, %xmm4
1294	movaps	%xmm5, 48(%edx)
1295	palignr	$9, %xmm2, %xmm3
1296	lea	64(%eax), %eax
1297	palignr	$9, %xmm1, %xmm2
1298	movaps	%xmm4, 32(%edx)
1299	movaps	%xmm3, 16(%edx)
1300	movaps	%xmm7, %xmm1
1301	movaps	%xmm2, (%edx)
1302	lea	64(%edx), %edx
1303	sub	$64, %ecx
1304	ja	L(Shl9LoopStart)
1305
1306L(Shl9LoopLeave):
1307	add	$32, %ecx
1308	jle	L(shl_end_0)
1309
1310	movaps	7(%eax), %xmm2
1311	movaps	23(%eax), %xmm3
1312	palignr	$9, %xmm2, %xmm3
1313	palignr	$9, %xmm1, %xmm2
1314
1315	movaps	%xmm2, (%edx)
1316	movaps	%xmm3, 16(%edx)
1317	lea	32(%edx, %ecx), %edx
1318	lea	32(%eax, %ecx), %eax
1319	POP (%edi)
1320	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1321
1322	CFI_PUSH (%edi)
1323
1324	.p2align 4
1325L(sh_9_no_prefetch):
1326	lea	-32(%ecx), %ecx
1327	lea	-9(%eax), %eax
1328	xor	%edi, %edi
1329
1330	.p2align 4
1331L(sh_9_no_prefetch_loop):
1332	movdqa	16(%eax, %edi), %xmm2
1333	sub	$32, %ecx
1334	movdqa	32(%eax, %edi), %xmm3
1335	movdqa	%xmm3, %xmm4
1336	palignr	$9, %xmm2, %xmm3
1337	palignr	$9, %xmm1, %xmm2
1338	lea	32(%edi), %edi
1339	movdqa	%xmm2, -32(%edx, %edi)
1340	movdqa	%xmm3, -16(%edx, %edi)
1341	jb	L(sh_9_end_no_prefetch_loop)
1342
1343	movdqa	16(%eax, %edi), %xmm2
1344	sub	$32, %ecx
1345	movdqa	32(%eax, %edi), %xmm3
1346	movdqa	%xmm3, %xmm1
1347	palignr	$9, %xmm2, %xmm3
1348	palignr	$9, %xmm4, %xmm2
1349	lea	32(%edi), %edi
1350	movdqa	%xmm2, -32(%edx, %edi)
1351	movdqa	%xmm3, -16(%edx, %edi)
1352	jae	L(sh_9_no_prefetch_loop)
1353
1354L(sh_9_end_no_prefetch_loop):
1355	lea	32(%ecx), %ecx
1356	add	%ecx, %edi
1357	add	%edi, %edx
1358	lea	9(%edi, %eax), %eax
1359	POP	(%edi)
1360	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1361
1362	CFI_PUSH (%edi)
1363
1364	.p2align 4
1365L(shl_10):
1366#ifndef USE_AS_MEMMOVE
1367	movaps	-10(%eax), %xmm1
1368#else
1369	movl	DEST+4(%esp), %edi
1370	movaps	-10(%eax), %xmm1
1371	movdqu	%xmm0, (%edi)
1372#endif
1373#ifdef DATA_CACHE_SIZE_HALF
1374	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1375#else
1376# if (defined SHARED || defined __PIC__)
1377	SETUP_PIC_REG(bx)
1378	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1379	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1380# else
1381	cmp	__x86_data_cache_size_half, %ecx
1382# endif
1383#endif
1384	jb L(sh_10_no_prefetch)
1385
1386	lea	-64(%ecx), %ecx
1387
1388	.p2align 4
1389L(Shl10LoopStart):
1390	prefetcht0 0x1c0(%eax)
1391	prefetcht0 0x1c0(%edx)
1392	movaps	6(%eax), %xmm2
1393	movaps	22(%eax), %xmm3
1394	movaps	38(%eax), %xmm4
1395	movaps	54(%eax), %xmm5
1396	movaps	%xmm5, %xmm7
1397	palignr	$10, %xmm4, %xmm5
1398	palignr	$10, %xmm3, %xmm4
1399	movaps	%xmm5, 48(%edx)
1400	palignr	$10, %xmm2, %xmm3
1401	lea	64(%eax), %eax
1402	palignr	$10, %xmm1, %xmm2
1403	movaps	%xmm4, 32(%edx)
1404	movaps	%xmm3, 16(%edx)
1405	movaps	%xmm7, %xmm1
1406	movaps	%xmm2, (%edx)
1407	lea	64(%edx), %edx
1408	sub	$64, %ecx
1409	ja	L(Shl10LoopStart)
1410
1411L(Shl10LoopLeave):
1412	add	$32, %ecx
1413	jle	L(shl_end_0)
1414
1415	movaps	6(%eax), %xmm2
1416	movaps	22(%eax), %xmm3
1417	palignr	$10, %xmm2, %xmm3
1418	palignr	$10, %xmm1, %xmm2
1419
1420	movaps	%xmm2, (%edx)
1421	movaps	%xmm3, 16(%edx)
1422	lea	32(%edx, %ecx), %edx
1423	lea	32(%eax, %ecx), %eax
1424	POP (%edi)
1425	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1426
1427	CFI_PUSH (%edi)
1428
1429	.p2align 4
1430L(sh_10_no_prefetch):
1431	lea	-32(%ecx), %ecx
1432	lea	-10(%eax), %eax
1433	xor	%edi, %edi
1434
1435	.p2align 4
1436L(sh_10_no_prefetch_loop):
1437	movdqa	16(%eax, %edi), %xmm2
1438	sub	$32, %ecx
1439	movdqa	32(%eax, %edi), %xmm3
1440	movdqa	%xmm3, %xmm4
1441	palignr	$10, %xmm2, %xmm3
1442	palignr	$10, %xmm1, %xmm2
1443	lea	32(%edi), %edi
1444	movdqa	%xmm2, -32(%edx, %edi)
1445	movdqa	%xmm3, -16(%edx, %edi)
1446	jb	L(sh_10_end_no_prefetch_loop)
1447
1448	movdqa	16(%eax, %edi), %xmm2
1449	sub	$32, %ecx
1450	movdqa	32(%eax, %edi), %xmm3
1451	movdqa	%xmm3, %xmm1
1452	palignr	$10, %xmm2, %xmm3
1453	palignr	$10, %xmm4, %xmm2
1454	lea	32(%edi), %edi
1455	movdqa	%xmm2, -32(%edx, %edi)
1456	movdqa	%xmm3, -16(%edx, %edi)
1457	jae	L(sh_10_no_prefetch_loop)
1458
1459L(sh_10_end_no_prefetch_loop):
1460	lea	32(%ecx), %ecx
1461	add	%ecx, %edi
1462	add	%edi, %edx
1463	lea	10(%edi, %eax), %eax
1464	POP	(%edi)
1465	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1466
1467	CFI_PUSH (%edi)
1468
1469	.p2align 4
1470L(shl_11):
1471#ifndef USE_AS_MEMMOVE
1472	movaps	-11(%eax), %xmm1
1473#else
1474	movl	DEST+4(%esp), %edi
1475	movaps	-11(%eax), %xmm1
1476	movdqu	%xmm0, (%edi)
1477#endif
1478#ifdef DATA_CACHE_SIZE_HALF
1479	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1480#else
1481# if (defined SHARED || defined __PIC__)
1482	SETUP_PIC_REG(bx)
1483	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1484	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1485# else
1486	cmp	__x86_data_cache_size_half, %ecx
1487# endif
1488#endif
1489	jb L(sh_11_no_prefetch)
1490
1491	lea	-64(%ecx), %ecx
1492
1493	.p2align 4
1494L(Shl11LoopStart):
1495	prefetcht0 0x1c0(%eax)
1496	prefetcht0 0x1c0(%edx)
1497	movaps	5(%eax), %xmm2
1498	movaps	21(%eax), %xmm3
1499	movaps	37(%eax), %xmm4
1500	movaps	53(%eax), %xmm5
1501	movaps	%xmm5, %xmm7
1502	palignr	$11, %xmm4, %xmm5
1503	palignr	$11, %xmm3, %xmm4
1504	movaps	%xmm5, 48(%edx)
1505	palignr	$11, %xmm2, %xmm3
1506	lea	64(%eax), %eax
1507	palignr	$11, %xmm1, %xmm2
1508	movaps	%xmm4, 32(%edx)
1509	movaps	%xmm3, 16(%edx)
1510	movaps	%xmm7, %xmm1
1511	movaps	%xmm2, (%edx)
1512	lea	64(%edx), %edx
1513	sub	$64, %ecx
1514	ja	L(Shl11LoopStart)
1515
1516L(Shl11LoopLeave):
1517	add	$32, %ecx
1518	jle	L(shl_end_0)
1519
1520	movaps	5(%eax), %xmm2
1521	movaps	21(%eax), %xmm3
1522	palignr	$11, %xmm2, %xmm3
1523	palignr	$11, %xmm1, %xmm2
1524
1525	movaps	%xmm2, (%edx)
1526	movaps	%xmm3, 16(%edx)
1527	lea	32(%edx, %ecx), %edx
1528	lea	32(%eax, %ecx), %eax
1529	POP (%edi)
1530	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1531
1532	CFI_PUSH (%edi)
1533
1534	.p2align 4
1535L(sh_11_no_prefetch):
1536	lea	-32(%ecx), %ecx
1537	lea	-11(%eax), %eax
1538	xor	%edi, %edi
1539
1540	.p2align 4
1541L(sh_11_no_prefetch_loop):
1542	movdqa	16(%eax, %edi), %xmm2
1543	sub	$32, %ecx
1544	movdqa	32(%eax, %edi), %xmm3
1545	movdqa	%xmm3, %xmm4
1546	palignr	$11, %xmm2, %xmm3
1547	palignr	$11, %xmm1, %xmm2
1548	lea	32(%edi), %edi
1549	movdqa	%xmm2, -32(%edx, %edi)
1550	movdqa	%xmm3, -16(%edx, %edi)
1551	jb	L(sh_11_end_no_prefetch_loop)
1552
1553	movdqa	16(%eax, %edi), %xmm2
1554	sub	$32, %ecx
1555	movdqa	32(%eax, %edi), %xmm3
1556	movdqa	%xmm3, %xmm1
1557	palignr	$11, %xmm2, %xmm3
1558	palignr	$11, %xmm4, %xmm2
1559	lea	32(%edi), %edi
1560	movdqa	%xmm2, -32(%edx, %edi)
1561	movdqa	%xmm3, -16(%edx, %edi)
1562	jae	L(sh_11_no_prefetch_loop)
1563
1564L(sh_11_end_no_prefetch_loop):
1565	lea	32(%ecx), %ecx
1566	add	%ecx, %edi
1567	add	%edi, %edx
1568	lea	11(%edi, %eax), %eax
1569	POP	(%edi)
1570	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1571
1572	CFI_PUSH (%edi)
1573
1574	.p2align 4
1575L(shl_12):
1576#ifndef USE_AS_MEMMOVE
1577	movaps	-12(%eax), %xmm1
1578#else
1579	movl	DEST+4(%esp), %edi
1580	movaps	-12(%eax), %xmm1
1581	movdqu	%xmm0, (%edi)
1582#endif
1583#ifdef DATA_CACHE_SIZE_HALF
1584	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1585#else
1586# if (defined SHARED || defined __PIC__)
1587	SETUP_PIC_REG(bx)
1588	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1589	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1590# else
1591	cmp	__x86_data_cache_size_half, %ecx
1592# endif
1593#endif
1594	jb L(sh_12_no_prefetch)
1595
1596	lea	-64(%ecx), %ecx
1597
1598	.p2align 4
1599L(Shl12LoopStart):
1600	prefetcht0 0x1c0(%eax)
1601	prefetcht0 0x1c0(%edx)
1602	movaps	4(%eax), %xmm2
1603	movaps	20(%eax), %xmm3
1604	movaps	36(%eax), %xmm4
1605	movaps	52(%eax), %xmm5
1606	movaps	%xmm5, %xmm7
1607	palignr	$12, %xmm4, %xmm5
1608	palignr	$12, %xmm3, %xmm4
1609	movaps	%xmm5, 48(%edx)
1610	palignr	$12, %xmm2, %xmm3
1611	lea	64(%eax), %eax
1612	palignr	$12, %xmm1, %xmm2
1613	movaps	%xmm4, 32(%edx)
1614	movaps	%xmm3, 16(%edx)
1615	movaps	%xmm7, %xmm1
1616	movaps	%xmm2, (%edx)
1617	lea	64(%edx), %edx
1618	sub	$64, %ecx
1619	ja	L(Shl12LoopStart)
1620
1621L(Shl12LoopLeave):
1622	add	$32, %ecx
1623	jle	L(shl_end_0)
1624
1625	movaps	4(%eax), %xmm2
1626	movaps	20(%eax), %xmm3
1627	palignr	$12, %xmm2, %xmm3
1628	palignr	$12, %xmm1, %xmm2
1629
1630	movaps	%xmm2, (%edx)
1631	movaps	%xmm3, 16(%edx)
1632	lea	32(%edx, %ecx), %edx
1633	lea	32(%eax, %ecx), %eax
1634	POP (%edi)
1635	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1636
1637	CFI_PUSH (%edi)
1638
1639	.p2align 4
1640L(sh_12_no_prefetch):
1641	lea	-32(%ecx), %ecx
1642	lea	-12(%eax), %eax
1643	xor	%edi, %edi
1644
1645	.p2align 4
1646L(sh_12_no_prefetch_loop):
1647	movdqa	16(%eax, %edi), %xmm2
1648	sub	$32, %ecx
1649	movdqa	32(%eax, %edi), %xmm3
1650	movdqa	%xmm3, %xmm4
1651	palignr	$12, %xmm2, %xmm3
1652	palignr	$12, %xmm1, %xmm2
1653	lea	32(%edi), %edi
1654	movdqa	%xmm2, -32(%edx, %edi)
1655	movdqa	%xmm3, -16(%edx, %edi)
1656	jb	L(sh_12_end_no_prefetch_loop)
1657
1658	movdqa	16(%eax, %edi), %xmm2
1659	sub	$32, %ecx
1660	movdqa	32(%eax, %edi), %xmm3
1661	movdqa	%xmm3, %xmm1
1662	palignr	$12, %xmm2, %xmm3
1663	palignr	$12, %xmm4, %xmm2
1664	lea	32(%edi), %edi
1665	movdqa	%xmm2, -32(%edx, %edi)
1666	movdqa	%xmm3, -16(%edx, %edi)
1667	jae	L(sh_12_no_prefetch_loop)
1668
1669L(sh_12_end_no_prefetch_loop):
1670	lea	32(%ecx), %ecx
1671	add	%ecx, %edi
1672	add	%edi, %edx
1673	lea	12(%edi, %eax), %eax
1674	POP	(%edi)
1675	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1676
1677	CFI_PUSH (%edi)
1678
1679	.p2align 4
1680L(shl_13):
1681#ifndef USE_AS_MEMMOVE
1682	movaps	-13(%eax), %xmm1
1683#else
1684	movl	DEST+4(%esp), %edi
1685	movaps	-13(%eax), %xmm1
1686	movdqu	%xmm0, (%edi)
1687#endif
1688#ifdef DATA_CACHE_SIZE_HALF
1689	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1690#else
1691# if (defined SHARED || defined __PIC__)
1692	SETUP_PIC_REG(bx)
1693	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1694	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1695# else
1696	cmp	__x86_data_cache_size_half, %ecx
1697# endif
1698#endif
1699	jb L(sh_13_no_prefetch)
1700
1701	lea	-64(%ecx), %ecx
1702
1703	.p2align 4
1704L(Shl13LoopStart):
1705	prefetcht0 0x1c0(%eax)
1706	prefetcht0 0x1c0(%edx)
1707	movaps	3(%eax), %xmm2
1708	movaps	19(%eax), %xmm3
1709	movaps	35(%eax), %xmm4
1710	movaps	51(%eax), %xmm5
1711	movaps	%xmm5, %xmm7
1712	palignr	$13, %xmm4, %xmm5
1713	palignr	$13, %xmm3, %xmm4
1714	movaps	%xmm5, 48(%edx)
1715	palignr	$13, %xmm2, %xmm3
1716	lea	64(%eax), %eax
1717	palignr	$13, %xmm1, %xmm2
1718	movaps	%xmm4, 32(%edx)
1719	movaps	%xmm3, 16(%edx)
1720	movaps	%xmm7, %xmm1
1721	movaps	%xmm2, (%edx)
1722	lea	64(%edx), %edx
1723	sub	$64, %ecx
1724	ja	L(Shl13LoopStart)
1725
1726L(Shl13LoopLeave):
1727	add	$32, %ecx
1728	jle	L(shl_end_0)
1729
1730	movaps	3(%eax), %xmm2
1731	movaps	19(%eax), %xmm3
1732	palignr	$13, %xmm2, %xmm3
1733	palignr	$13, %xmm1, %xmm2
1734
1735	movaps	%xmm2, (%edx)
1736	movaps	%xmm3, 16(%edx)
1737	lea	32(%edx, %ecx), %edx
1738	lea	32(%eax, %ecx), %eax
1739	POP (%edi)
1740	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1741
1742	CFI_PUSH (%edi)
1743
1744	.p2align 4
1745L(sh_13_no_prefetch):
1746	lea	-32(%ecx), %ecx
1747	lea	-13(%eax), %eax
1748	xor	%edi, %edi
1749
1750	.p2align 4
1751L(sh_13_no_prefetch_loop):
1752	movdqa	16(%eax, %edi), %xmm2
1753	sub	$32, %ecx
1754	movdqa	32(%eax, %edi), %xmm3
1755	movdqa	%xmm3, %xmm4
1756	palignr	$13, %xmm2, %xmm3
1757	palignr	$13, %xmm1, %xmm2
1758	lea	32(%edi), %edi
1759	movdqa	%xmm2, -32(%edx, %edi)
1760	movdqa	%xmm3, -16(%edx, %edi)
1761	jb	L(sh_13_end_no_prefetch_loop)
1762
1763	movdqa	16(%eax, %edi), %xmm2
1764	sub	$32, %ecx
1765	movdqa	32(%eax, %edi), %xmm3
1766	movdqa	%xmm3, %xmm1
1767	palignr	$13, %xmm2, %xmm3
1768	palignr	$13, %xmm4, %xmm2
1769	lea	32(%edi), %edi
1770	movdqa	%xmm2, -32(%edx, %edi)
1771	movdqa	%xmm3, -16(%edx, %edi)
1772	jae	L(sh_13_no_prefetch_loop)
1773
1774L(sh_13_end_no_prefetch_loop):
1775	lea	32(%ecx), %ecx
1776	add	%ecx, %edi
1777	add	%edi, %edx
1778	lea	13(%edi, %eax), %eax
1779	POP	(%edi)
1780	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1781
1782	CFI_PUSH (%edi)
1783
1784	.p2align 4
1785L(shl_14):
1786#ifndef USE_AS_MEMMOVE
1787	movaps	-14(%eax), %xmm1
1788#else
1789	movl	DEST+4(%esp), %edi
1790	movaps	-14(%eax), %xmm1
1791	movdqu	%xmm0, (%edi)
1792#endif
1793#ifdef DATA_CACHE_SIZE_HALF
1794	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1795#else
1796# if (defined SHARED || defined __PIC__)
1797	SETUP_PIC_REG(bx)
1798	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1799	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1800# else
1801	cmp	__x86_data_cache_size_half, %ecx
1802# endif
1803#endif
1804	jb L(sh_14_no_prefetch)
1805
1806	lea	-64(%ecx), %ecx
1807
1808	.p2align 4
1809L(Shl14LoopStart):
1810	prefetcht0 0x1c0(%eax)
1811	prefetcht0 0x1c0(%edx)
1812	movaps	2(%eax), %xmm2
1813	movaps	18(%eax), %xmm3
1814	movaps	34(%eax), %xmm4
1815	movaps	50(%eax), %xmm5
1816	movaps	%xmm5, %xmm7
1817	palignr	$14, %xmm4, %xmm5
1818	palignr	$14, %xmm3, %xmm4
1819	movaps	%xmm5, 48(%edx)
1820	palignr	$14, %xmm2, %xmm3
1821	lea	64(%eax), %eax
1822	palignr	$14, %xmm1, %xmm2
1823	movaps	%xmm4, 32(%edx)
1824	movaps	%xmm3, 16(%edx)
1825	movaps	%xmm7, %xmm1
1826	movaps	%xmm2, (%edx)
1827	lea	64(%edx), %edx
1828	sub	$64, %ecx
1829	ja	L(Shl14LoopStart)
1830
1831L(Shl14LoopLeave):
1832	add	$32, %ecx
1833	jle	L(shl_end_0)
1834
1835	movaps	2(%eax), %xmm2
1836	movaps	18(%eax), %xmm3
1837	palignr	$14, %xmm2, %xmm3
1838	palignr	$14, %xmm1, %xmm2
1839
1840	movaps	%xmm2, (%edx)
1841	movaps	%xmm3, 16(%edx)
1842	lea	32(%edx, %ecx), %edx
1843	lea	32(%eax, %ecx), %eax
1844	POP (%edi)
1845	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1846
1847	CFI_PUSH (%edi)
1848
1849	.p2align 4
1850L(sh_14_no_prefetch):
1851	lea	-32(%ecx), %ecx
1852	lea	-14(%eax), %eax
1853	xor	%edi, %edi
1854
1855	.p2align 4
1856L(sh_14_no_prefetch_loop):
1857	movdqa	16(%eax, %edi), %xmm2
1858	sub	$32, %ecx
1859	movdqa	32(%eax, %edi), %xmm3
1860	movdqa	%xmm3, %xmm4
1861	palignr	$14, %xmm2, %xmm3
1862	palignr	$14, %xmm1, %xmm2
1863	lea	32(%edi), %edi
1864	movdqa	%xmm2, -32(%edx, %edi)
1865	movdqa	%xmm3, -16(%edx, %edi)
1866	jb	L(sh_14_end_no_prefetch_loop)
1867
1868	movdqa	16(%eax, %edi), %xmm2
1869	sub	$32, %ecx
1870	movdqa	32(%eax, %edi), %xmm3
1871	movdqa	%xmm3, %xmm1
1872	palignr	$14, %xmm2, %xmm3
1873	palignr	$14, %xmm4, %xmm2
1874	lea	32(%edi), %edi
1875	movdqa	%xmm2, -32(%edx, %edi)
1876	movdqa	%xmm3, -16(%edx, %edi)
1877	jae	L(sh_14_no_prefetch_loop)
1878
1879L(sh_14_end_no_prefetch_loop):
1880	lea	32(%ecx), %ecx
1881	add	%ecx, %edi
1882	add	%edi, %edx
1883	lea	14(%edi, %eax), %eax
1884	POP	(%edi)
1885	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1886
1887	CFI_PUSH (%edi)
1888
1889	.p2align 4
1890L(shl_15):
1891#ifndef USE_AS_MEMMOVE
1892	movaps	-15(%eax), %xmm1
1893#else
1894	movl	DEST+4(%esp), %edi
1895	movaps	-15(%eax), %xmm1
1896	movdqu	%xmm0, (%edi)
1897#endif
1898#ifdef DATA_CACHE_SIZE_HALF
1899	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1900#else
1901# if (defined SHARED || defined __PIC__)
1902	SETUP_PIC_REG(bx)
1903	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1904	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1905# else
1906	cmp	__x86_data_cache_size_half, %ecx
1907# endif
1908#endif
1909	jb L(sh_15_no_prefetch)
1910
1911	lea	-64(%ecx), %ecx
1912
1913	.p2align 4
1914L(Shl15LoopStart):
1915	prefetcht0 0x1c0(%eax)
1916	prefetcht0 0x1c0(%edx)
1917	movaps	1(%eax), %xmm2
1918	movaps	17(%eax), %xmm3
1919	movaps	33(%eax), %xmm4
1920	movaps	49(%eax), %xmm5
1921	movaps	%xmm5, %xmm7
1922	palignr	$15, %xmm4, %xmm5
1923	palignr	$15, %xmm3, %xmm4
1924	movaps	%xmm5, 48(%edx)
1925	palignr	$15, %xmm2, %xmm3
1926	lea	64(%eax), %eax
1927	palignr	$15, %xmm1, %xmm2
1928	movaps	%xmm4, 32(%edx)
1929	movaps	%xmm3, 16(%edx)
1930	movaps	%xmm7, %xmm1
1931	movaps	%xmm2, (%edx)
1932	lea	64(%edx), %edx
1933	sub	$64, %ecx
1934	ja	L(Shl15LoopStart)
1935
1936L(Shl15LoopLeave):
1937	add	$32, %ecx
1938	jle	L(shl_end_0)
1939
1940	movaps	1(%eax), %xmm2
1941	movaps	17(%eax), %xmm3
1942	palignr	$15, %xmm2, %xmm3
1943	palignr	$15, %xmm1, %xmm2
1944
1945	movaps	%xmm2, (%edx)
1946	movaps	%xmm3, 16(%edx)
1947	lea	32(%edx, %ecx), %edx
1948	lea	32(%eax, %ecx), %eax
1949	POP (%edi)
1950	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1951
1952	CFI_PUSH (%edi)
1953
1954	.p2align 4
1955L(sh_15_no_prefetch):
1956	lea	-32(%ecx), %ecx
1957	lea	-15(%eax), %eax
1958	xor	%edi, %edi
1959
1960	.p2align 4
1961L(sh_15_no_prefetch_loop):
1962	movdqa	16(%eax, %edi), %xmm2
1963	sub	$32, %ecx
1964	movdqa	32(%eax, %edi), %xmm3
1965	movdqa	%xmm3, %xmm4
1966	palignr	$15, %xmm2, %xmm3
1967	palignr	$15, %xmm1, %xmm2
1968	lea	32(%edi), %edi
1969	movdqa	%xmm2, -32(%edx, %edi)
1970	movdqa	%xmm3, -16(%edx, %edi)
1971	jb	L(sh_15_end_no_prefetch_loop)
1972
1973	movdqa	16(%eax, %edi), %xmm2
1974	sub	$32, %ecx
1975	movdqa	32(%eax, %edi), %xmm3
1976	movdqa	%xmm3, %xmm1
1977	palignr	$15, %xmm2, %xmm3
1978	palignr	$15, %xmm4, %xmm2
1979	lea	32(%edi), %edi
1980	movdqa	%xmm2, -32(%edx, %edi)
1981	movdqa	%xmm3, -16(%edx, %edi)
1982	jae	L(sh_15_no_prefetch_loop)
1983
1984L(sh_15_end_no_prefetch_loop):
1985	lea	32(%ecx), %ecx
1986	add	%ecx, %edi
1987	add	%edi, %edx
1988	lea	15(%edi, %eax), %eax
1989	POP	(%edi)
1990	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1991
1992	CFI_PUSH (%edi)
1993
1994	.p2align 4
1995L(shl_end_0):
1996	lea	32(%ecx), %ecx
1997	lea	(%edx, %ecx), %edx
1998	lea	(%eax, %ecx), %eax
1999	POP	(%edi)
2000	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
2001
2002	.p2align 4
2003L(fwd_write_44bytes):
2004	movq	-44(%eax), %xmm0
2005	movq	%xmm0, -44(%edx)
2006L(fwd_write_36bytes):
2007	movq	-36(%eax), %xmm0
2008	movq	%xmm0, -36(%edx)
2009L(fwd_write_28bytes):
2010	movq	-28(%eax), %xmm0
2011	movq	%xmm0, -28(%edx)
2012L(fwd_write_20bytes):
2013	movq	-20(%eax), %xmm0
2014	movq	%xmm0, -20(%edx)
2015L(fwd_write_12bytes):
2016	movq	-12(%eax), %xmm0
2017	movq	%xmm0, -12(%edx)
2018L(fwd_write_4bytes):
2019	movl	-4(%eax), %ecx
2020	movl	%ecx, -4(%edx)
2021#ifndef USE_AS_BCOPY
2022# ifdef USE_AS_MEMPCPY
2023	movl	%edx, %eax
2024# else
2025	movl	DEST(%esp), %eax
2026# endif
2027#endif
2028	RETURN
2029
2030	.p2align 4
2031L(fwd_write_40bytes):
2032	movq	-40(%eax), %xmm0
2033	movq	%xmm0, -40(%edx)
2034L(fwd_write_32bytes):
2035	movq	-32(%eax), %xmm0
2036	movq	%xmm0, -32(%edx)
2037L(fwd_write_24bytes):
2038	movq	-24(%eax), %xmm0
2039	movq	%xmm0, -24(%edx)
2040L(fwd_write_16bytes):
2041	movq	-16(%eax), %xmm0
2042	movq	%xmm0, -16(%edx)
2043L(fwd_write_8bytes):
2044	movq	-8(%eax), %xmm0
2045	movq	%xmm0, -8(%edx)
2046L(fwd_write_0bytes):
2047#ifndef USE_AS_BCOPY
2048# ifdef USE_AS_MEMPCPY
2049	movl	%edx, %eax
2050# else
2051	movl	DEST(%esp), %eax
2052# endif
2053#endif
2054	RETURN
2055
2056	.p2align 4
2057L(fwd_write_5bytes):
2058	movl	-5(%eax), %ecx
2059	movl	-4(%eax), %eax
2060	movl	%ecx, -5(%edx)
2061	movl	%eax, -4(%edx)
2062#ifndef USE_AS_BCOPY
2063# ifdef USE_AS_MEMPCPY
2064	movl	%edx, %eax
2065# else
2066	movl	DEST(%esp), %eax
2067# endif
2068#endif
2069	RETURN
2070
2071	.p2align 4
2072L(fwd_write_45bytes):
2073	movq	-45(%eax), %xmm0
2074	movq	%xmm0, -45(%edx)
2075L(fwd_write_37bytes):
2076	movq	-37(%eax), %xmm0
2077	movq	%xmm0, -37(%edx)
2078L(fwd_write_29bytes):
2079	movq	-29(%eax), %xmm0
2080	movq	%xmm0, -29(%edx)
2081L(fwd_write_21bytes):
2082	movq	-21(%eax), %xmm0
2083	movq	%xmm0, -21(%edx)
2084L(fwd_write_13bytes):
2085	movq	-13(%eax), %xmm0
2086	movq	%xmm0, -13(%edx)
2087	movl	-5(%eax), %ecx
2088	movl	%ecx, -5(%edx)
2089	movzbl	-1(%eax), %ecx
2090	movb	%cl, -1(%edx)
2091#ifndef USE_AS_BCOPY
2092# ifdef USE_AS_MEMPCPY
2093	movl	%edx, %eax
2094# else
2095	movl	DEST(%esp), %eax
2096# endif
2097#endif
2098	RETURN
2099
2100	.p2align 4
2101L(fwd_write_41bytes):
2102	movq	-41(%eax), %xmm0
2103	movq	%xmm0, -41(%edx)
2104L(fwd_write_33bytes):
2105	movq	-33(%eax), %xmm0
2106	movq	%xmm0, -33(%edx)
2107L(fwd_write_25bytes):
2108	movq	-25(%eax), %xmm0
2109	movq	%xmm0, -25(%edx)
2110L(fwd_write_17bytes):
2111	movq	-17(%eax), %xmm0
2112	movq	%xmm0, -17(%edx)
2113L(fwd_write_9bytes):
2114	movq	-9(%eax), %xmm0
2115	movq	%xmm0, -9(%edx)
2116L(fwd_write_1bytes):
2117	movzbl	-1(%eax), %ecx
2118	movb	%cl, -1(%edx)
2119#ifndef USE_AS_BCOPY
2120# ifdef USE_AS_MEMPCPY
2121	movl	%edx, %eax
2122# else
2123	movl	DEST(%esp), %eax
2124# endif
2125#endif
2126	RETURN
2127
2128	.p2align 4
2129L(fwd_write_46bytes):
2130	movq	-46(%eax), %xmm0
2131	movq	%xmm0, -46(%edx)
2132L(fwd_write_38bytes):
2133	movq	-38(%eax), %xmm0
2134	movq	%xmm0, -38(%edx)
2135L(fwd_write_30bytes):
2136	movq	-30(%eax), %xmm0
2137	movq	%xmm0, -30(%edx)
2138L(fwd_write_22bytes):
2139	movq	-22(%eax), %xmm0
2140	movq	%xmm0, -22(%edx)
2141L(fwd_write_14bytes):
2142	movq	-14(%eax), %xmm0
2143	movq	%xmm0, -14(%edx)
2144L(fwd_write_6bytes):
2145	movl	-6(%eax), %ecx
2146	movl	%ecx, -6(%edx)
2147	movzwl	-2(%eax), %ecx
2148	movw	%cx, -2(%edx)
2149#ifndef USE_AS_BCOPY
2150# ifdef USE_AS_MEMPCPY
2151	movl	%edx, %eax
2152# else
2153	movl	DEST(%esp), %eax
2154# endif
2155#endif
2156	RETURN
2157
2158	.p2align 4
2159L(fwd_write_42bytes):
2160	movq	-42(%eax), %xmm0
2161	movq	%xmm0, -42(%edx)
2162L(fwd_write_34bytes):
2163	movq	-34(%eax), %xmm0
2164	movq	%xmm0, -34(%edx)
2165L(fwd_write_26bytes):
2166	movq	-26(%eax), %xmm0
2167	movq	%xmm0, -26(%edx)
2168L(fwd_write_18bytes):
2169	movq	-18(%eax), %xmm0
2170	movq	%xmm0, -18(%edx)
2171L(fwd_write_10bytes):
2172	movq	-10(%eax), %xmm0
2173	movq	%xmm0, -10(%edx)
2174L(fwd_write_2bytes):
2175	movzwl	-2(%eax), %ecx
2176	movw	%cx, -2(%edx)
2177#ifndef USE_AS_BCOPY
2178# ifdef USE_AS_MEMPCPY
2179	movl	%edx, %eax
2180# else
2181	movl	DEST(%esp), %eax
2182# endif
2183#endif
2184	RETURN
2185
2186	.p2align 4
2187L(fwd_write_47bytes):
2188	movq	-47(%eax), %xmm0
2189	movq	%xmm0, -47(%edx)
2190L(fwd_write_39bytes):
2191	movq	-39(%eax), %xmm0
2192	movq	%xmm0, -39(%edx)
2193L(fwd_write_31bytes):
2194	movq	-31(%eax), %xmm0
2195	movq	%xmm0, -31(%edx)
2196L(fwd_write_23bytes):
2197	movq	-23(%eax), %xmm0
2198	movq	%xmm0, -23(%edx)
2199L(fwd_write_15bytes):
2200	movq	-15(%eax), %xmm0
2201	movq	%xmm0, -15(%edx)
2202L(fwd_write_7bytes):
2203	movl	-7(%eax), %ecx
2204	movl	%ecx, -7(%edx)
2205	movzwl	-3(%eax), %ecx
2206	movzbl	-1(%eax), %eax
2207	movw	%cx, -3(%edx)
2208	movb	%al, -1(%edx)
2209#ifndef USE_AS_BCOPY
2210# ifdef USE_AS_MEMPCPY
2211	movl	%edx, %eax
2212# else
2213	movl	DEST(%esp), %eax
2214# endif
2215#endif
2216	RETURN
2217
2218	.p2align 4
2219L(fwd_write_43bytes):
2220	movq	-43(%eax), %xmm0
2221	movq	%xmm0, -43(%edx)
2222L(fwd_write_35bytes):
2223	movq	-35(%eax), %xmm0
2224	movq	%xmm0, -35(%edx)
2225L(fwd_write_27bytes):
2226	movq	-27(%eax), %xmm0
2227	movq	%xmm0, -27(%edx)
2228L(fwd_write_19bytes):
2229	movq	-19(%eax), %xmm0
2230	movq	%xmm0, -19(%edx)
2231L(fwd_write_11bytes):
2232	movq	-11(%eax), %xmm0
2233	movq	%xmm0, -11(%edx)
2234L(fwd_write_3bytes):
2235	movzwl	-3(%eax), %ecx
2236	movzbl	-1(%eax), %eax
2237	movw	%cx, -3(%edx)
2238	movb	%al, -1(%edx)
2239#ifndef USE_AS_BCOPY
2240# ifdef USE_AS_MEMPCPY
2241	movl	%edx, %eax
2242# else
2243	movl	DEST(%esp), %eax
2244# endif
2245#endif
2246	RETURN
2247
	.p2align 4
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes_align):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_5bytes_align):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN_END

	CFI_PUSH (%edi)

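/* Streaming copy for very large sizes: unaligned loads are paired
   with non-temporal movntdq stores so the copy does not displace the
   caches.  For memmove, the original destination pointer is reloaded
   into %edi and %xmm0 (stashed earlier on) is written to its first
   16 bytes.  An sfence at the end orders the non-temporal stores
   before the remaining tail is dispatched through the forward jump
   table.  */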
	.p2align 4
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	lea	-0x90(%ecx), %ecx
	POP (%edi)

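/* Main streaming loop: 128 bytes per iteration (eight 16-byte
   unaligned loads followed by eight non-temporal stores).  %ecx was
   biased downwards above, so the jae exits once fewer than another
   128 bytes remain; the bias is removed right after the loop.  */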
	.p2align 4
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	add	%ecx, %edx
	add	%ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)

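/* Tail cases for the backward copy: L(bk_write_Nbytes) copies the
   final N uncopied bytes, working from the highest offset down to 0
   in 8/4/2/1-byte chunks.  For memcpy the return value is reloaded
   from DEST; for mempcpy LEN is added to it as well.  */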
	.p2align 4
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN

	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END

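/* Dispatch tables.  Each entry is the offset of a tail label
   relative to the start of its table; the dispatch code turns an
   entry back into an absolute address and jumps there.
   table_48bytes_fwd and table_48bytes_fwd_align are indexed by the
   remaining byte count (0..47) for the unaligned and aligned forward
   tails respectively.  */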
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))

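/* shl_table is indexed by a 0..15 shift value (apparently the
   16-byte relative misalignment of source and destination) and
   selects the matching L(shl_N) copy loop of the main forward
   path.  */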
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

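/* table_48_bytes_bwd is indexed by the number of bytes (0..47) still
   to be copied at the end of the backward path.  */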
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
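/* Backward copy, used by memmove when the destination overlaps the
   source from above: %edx and %edi are advanced to one past the end
   of the destination and source, and the copy proceeds from high
   addresses to low.  */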
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	lea	(%ecx,%edx,1), %edx
	lea	(%ecx,%edi,1), %edi
	testl	$0x3, %edx
	jnz	L(bk_align)

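/* The destination end pointer is now 4-byte aligned.  If 64 or more
   bytes remain, fall through to the 16-byte-aligned backward loop
   below; otherwise copy one 32-byte block backwards when at least 32
   bytes remain and dispatch the rest through the backward jump
   table.  */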
L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	CFI_PUSH (%edi)

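/* The destination end pointer is not 4-byte aligned.  Unless 8 or
   fewer bytes remain, copy a trailing byte and/or word so that %edx
   becomes 4-byte aligned before continuing.  */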
	.p2align 4
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

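/* At least 64 bytes remain.  Bring the destination end pointer to a
   16-byte boundary with up to three 4-byte copies, then run the
   64-bytes-per-iteration backward loop.  */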
	.p2align 4
L(bk_write_more64bytes):
	/* Check whether the destination end pointer is 16-byte aligned.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned, but not 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

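/* Main backward loop: copy 64 bytes per iteration using unaligned
   loads (movdqu) and 16-byte-aligned stores (movdqa), moving from
   high addresses to low; once fewer than 64 bytes remain, fall back
   to the small backward tail above.  */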
	.p2align 4
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)