sse2-memset5-atom.S revision 8ff1a2759a6389bed30d7862d0beb76077032c99
1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Helper macros.  Each one guarded by #ifndef can be overridden by a
   definition made before this point.  */

/* L(label): form an assembler-local symbol name by prefixing LABEL
   with ".L".  */
31#ifndef L
32# define L(label)	.L##label
33#endif
34
/* ALIGN(n): emit a .p2align directive with argument N.  */
35#ifndef ALIGN
36# define ALIGN(n)	.p2align n
37#endif
38
/* Thin wrappers around the assembler's .cfi_* unwind directives.  */
39#ifndef cfi_startproc
40# define cfi_startproc			.cfi_startproc
41#endif
42
43#ifndef cfi_endproc
44# define cfi_endproc			.cfi_endproc
45#endif
46
47#ifndef cfi_rel_offset
48# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
49#endif
50
51#ifndef cfi_restore
52# define cfi_restore(reg)		.cfi_restore (reg)
53#endif
54
55#ifndef cfi_adjust_cfa_offset
56# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
57#endif
58
/* ENTRY(name): declare NAME as a global function symbol, align it with
   .p2align 4, define the label and open its CFI region.  */
59#ifndef ENTRY
60# define ENTRY(name)			\
61	.type name,  @function; 	\
62	.globl name;			\
63	.p2align 4;			\
64name:					\
65	cfi_startproc
66#endif
67
/* END(name): close the CFI region opened by ENTRY and record the size
   of NAME as the distance from its label to this point.  */
68#ifndef END
69# define END(name)			\
70	cfi_endproc;			\
71	.size name, .-name
72#endif
73
/* CFI_PUSH(REG): unwind bookkeeping for a 4-byte push of REG -- the CFA
   offset grows by 4 and REG is recorded as saved at offset 0.  */
74#define CFI_PUSH(REG)						\
75  cfi_adjust_cfa_offset (4);					\
76  cfi_rel_offset (REG, 0)
77
/* CFI_POP(REG): unwind bookkeeping for the matching pop -- the CFA
   offset shrinks by 4 and REG is marked as restored.  */
78#define CFI_POP(REG)						\
79  cfi_adjust_cfa_offset (-4);					\
80  cfi_restore (REG)
81
/* PUSH/POP: the pushl/popl instruction together with its CFI update.  */
82#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
83#define POP(REG)	popl REG; CFI_POP (REG)
84
/* Stack offsets of the arguments, used as OFFSET(%esp).  PARMS is the
   offset of the first argument and is defined by the SHARED /
   non-SHARED configuration that follows.
   With USE_AS_BZERO there are two arguments (DEST, LEN) and SETRTNVAL
   expands to nothing.  Otherwise there are three (DEST, CHR, LEN) and
   SETRTNVAL reloads the destination pointer into EAX before each
   return.  */
85#ifdef USE_AS_BZERO
86# define DEST		PARMS
87# define LEN		DEST+4
88# define SETRTNVAL
89#else
90# define DEST		PARMS
91# define CHR		DEST+4
92# define LEN		CHR+4
93# define SETRTNVAL	movl DEST(%esp), %eax
94#endif
95
/* Build-mode configuration.
   With SHARED defined, ENTRANCE saves EBX (it is used as the PC-relative
   base by the dispatch macro), so the first argument sits at 8(%esp) and
   jump-table entries are stored as offsets from the start of the table.
   RETURN pops EBX and returns, then re-applies CFI_PUSH so the unwind
   state for the code that follows the ret still records EBX as saved;
   RETURN_END is the same without that trailing CFI_PUSH.  */
96#ifdef SHARED
97# define ENTRANCE	PUSH (%ebx);
98# define RETURN_END	POP (%ebx); ret
99# define RETURN		RETURN_END; CFI_PUSH (%ebx)
100# define PARMS		8		/* Preserve EBX.  */
101# define JMPTBL(I, B)	I - B
102
103/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
104   jump table with relative offsets.   */
105# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
106    /* We first load PC into EBX.  */				\
107    call	__i686.get_pc_thunk.bx;				\
108    /* Get the address of the jump table.  */			\
109    add		$(TABLE - .), %ebx;				\
110    /* Get the entry and convert the relative offset to the	\
111       absolute address.  */					\
112    add		(%ebx,%ecx,4), %ebx;				\
113    add		%ecx, %edx;					\
114    /* We loaded the jump table and adjusted EDX. Go.  */	\
115    jmp		*%ebx
116
/* PC thunk: copies its own return address (the address of the
   instruction after the call) from the top of the stack into EBX.  */
117	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
118	.globl	__i686.get_pc_thunk.bx
119	.hidden	__i686.get_pc_thunk.bx
120	ALIGN (4)
121	.type	__i686.get_pc_thunk.bx,@function
122__i686.get_pc_thunk.bx:
123	movl	(%esp), %ebx
124	ret
125#else
/* Without SHARED nothing is saved on entry, the first argument sits at
   4(%esp), and jump-table entries are stored as absolute addresses.  */
126# define ENTRANCE
127# define RETURN_END	ret
128# define RETURN		RETURN_END
129# define PARMS		4
130# define JMPTBL(I, B)	I
131
132/* Branch to an entry in a jump table.  TABLE is a jump table with
133   absolute offsets.  */
134# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
135    add		%ecx, %edx;					\
136    jmp		*TABLE(,%ecx,4)
137#endif
138
/* sse2_memset5_atom: fill LEN bytes at DEST with the byte CHR, or with
   zero when built with USE_AS_BZERO.  Arguments are read from the stack
   at the DEST/CHR/LEN offsets.

   Register roles set up here and relied on by the rest of the function:
     ECX = number of bytes still to write
     EAX = fill byte replicated into all four bytes
     EDX = destination cursor  */
139	.section .text.sse2,"ax",@progbits
140	ALIGN (4)
141ENTRY (sse2_memset5_atom)
142	ENTRANCE
143
144	movl	LEN(%esp), %ecx
145#ifdef USE_AS_BZERO
146	xor	%eax, %eax
147#else
	/* Zero-extend the fill byte, then copy AL into AH so the low word
	   of EAX holds the byte twice.  */
148	movzbl	CHR(%esp), %eax
149	movb	%al, %ah
150	/* Fill the whole EAX with pattern.  */
151	movl	%eax, %edx
152	shl	$16, %eax
153	or	%edx, %eax
154#endif
155	movl	DEST(%esp), %edx
	/* 32 bytes or more take the SSE2 path.  */
156	cmp	$32, %ecx
157	jae	L(32bytesormore)
158
	/* 0..31 bytes: jump to the handler for exactly ECX bytes.  The
	   dispatch macro also adds ECX to EDX, so the handlers address
	   backwards from one past the last byte.  */
159L(write_less32bytes):
160	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
161
162
/* Jump table for lengths 0..31, indexed by the byte count.  Entry N is
   the write_Nbytes handler.  Entries are built with JMPTBL, so they are
   offsets from the table start under SHARED and absolute addresses
   otherwise.  */
163	.pushsection .rodata.sse2,"a",@progbits
164	ALIGN (2)
165L(table_less_32bytes):
166	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
167	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
168	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
169	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
170	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
171	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
172	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
173	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
174	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
175	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
176	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
177	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
178	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
179	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
180	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
181	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
182	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
183	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
184	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
185	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
186	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
187	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
188	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
189	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
190	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
191	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
192	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
193	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
194	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
195	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
196	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
197	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
198	.popsection
199
/* Handlers for lengths 0..31, reached through table_less_32bytes.
   On entry EDX points one past the last byte to write and EAX holds the
   fill pattern.  The handlers form four fall-through chains, one per
   value of N mod 4: write_Nbytes stores a dword at -N(%edx) and falls
   into write_(N-4)bytes.  The last label of each chain writes the final
   0..3 bytes, sets the return value and returns.  */

/* N mod 4 == 0.  */
200	ALIGN (4)
201L(write_28bytes):
202	movl	%eax, -28(%edx)
203L(write_24bytes):
204	movl	%eax, -24(%edx)
205L(write_20bytes):
206	movl	%eax, -20(%edx)
207L(write_16bytes):
208	movl	%eax, -16(%edx)
209L(write_12bytes):
210	movl	%eax, -12(%edx)
211L(write_8bytes):
212	movl	%eax, -8(%edx)
213L(write_4bytes):
214	movl	%eax, -4(%edx)
215L(write_0bytes):
216	SETRTNVAL
217	RETURN
218
/* N mod 4 == 1: the chain ends with a single byte store.  */
219	ALIGN (4)
220L(write_29bytes):
221	movl	%eax, -29(%edx)
222L(write_25bytes):
223	movl	%eax, -25(%edx)
224L(write_21bytes):
225	movl	%eax, -21(%edx)
226L(write_17bytes):
227	movl	%eax, -17(%edx)
228L(write_13bytes):
229	movl	%eax, -13(%edx)
230L(write_9bytes):
231	movl	%eax, -9(%edx)
232L(write_5bytes):
233	movl	%eax, -5(%edx)
234L(write_1bytes):
235	movb	%al, -1(%edx)
236	SETRTNVAL
237	RETURN
238
/* N mod 4 == 2: the chain ends with a word store.  */
239	ALIGN (4)
240L(write_30bytes):
241	movl	%eax, -30(%edx)
242L(write_26bytes):
243	movl	%eax, -26(%edx)
244L(write_22bytes):
245	movl	%eax, -22(%edx)
246L(write_18bytes):
247	movl	%eax, -18(%edx)
248L(write_14bytes):
249	movl	%eax, -14(%edx)
250L(write_10bytes):
251	movl	%eax, -10(%edx)
252L(write_6bytes):
253	movl	%eax, -6(%edx)
254L(write_2bytes):
255	movw	%ax, -2(%edx)
256	SETRTNVAL
257	RETURN
258
/* N mod 4 == 3: the chain ends with a word store and a byte store.  */
259	ALIGN (4)
260L(write_31bytes):
261	movl	%eax, -31(%edx)
262L(write_27bytes):
263	movl	%eax, -27(%edx)
264L(write_23bytes):
265	movl	%eax, -23(%edx)
266L(write_19bytes):
267	movl	%eax, -19(%edx)
268L(write_15bytes):
269	movl	%eax, -15(%edx)
270L(write_11bytes):
271	movl	%eax, -11(%edx)
272L(write_7bytes):
273	movl	%eax, -7(%edx)
274L(write_3bytes):
275	movw	%ax, -3(%edx)
276	movb	%al, -1(%edx)
277	SETRTNVAL
278	RETURN
279
/* SSE2 path for 32 bytes or more: build the 16-byte fill pattern in
   XMM0, bring EDX to a 16-byte boundary, then choose between the bulk
   loops (128 bytes or more) and the table-driven tail.  */
280	ALIGN (4)
281/* ECX >= 32; the alignment of EDX is tested below.  */
282L(32bytesormore):
283	/* Fill xmm0 with the pattern.  */
284#ifdef USE_AS_BZERO
285	pxor	%xmm0, %xmm0
286#else
	/* EAX already holds the fill byte in all four of its bytes;
	   pshufd $0 copies the low dword into every dword of XMM0.  */
287	movd	%eax, %xmm0
288	punpcklbw %xmm0, %xmm0
289	pshufd	$0, %xmm0, %xmm0
290#endif
291	testl	$0xf, %edx
292	jz	L(aligned_16)
293/* ECX >= 32 and EDX is not 16 byte aligned.  */
294L(not_aligned_16):
	/* Store one unaligned 16-byte vector at the original destination,
	   then round EDX up to the next 16-byte boundary.  EAX is borrowed
	   to hold (old EDX - new EDX), a negative value; adding it to ECX
	   removes the skipped bytes from the count.  Bytes of that first
	   vector lying at or beyond the new EDX are written again by the
	   aligned stores.  */
295	movdqu	%xmm0, (%edx)
296	movl	%edx, %eax
297	and	$-16, %edx
298	add	$16, %edx
299	sub	%edx, %eax
300	add	%eax, %ecx
	/* Put the fill pattern back into EAX, which was used as scratch.  */
301	movd	%xmm0, %eax
302
303	ALIGN (4)
	/* EDX is 16-byte aligned from here on.  */
304L(aligned_16):
305	cmp	$128, %ecx
306	jae	L(128bytesormore)
307
	/* Fewer than 128 bytes remain: jump to the handler for exactly ECX
	   bytes (the dispatch macro also adds ECX to EDX).  */
308L(aligned_16_less128bytes):
309	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
310
/* Bulk fill for 128 bytes or more.

   On entry: ECX = bytes remaining (>= 128), EDX = 16-byte-aligned write
   cursor, XMM0 = fill pattern, EAX = fill pattern.

   The strategy is chosen by comparing ECX with the cache sizes:
     ECX >= shared cache size : cached stores for the first
				shared-cache-size bytes, then
				non-temporal stores for the rest
     ECX >= data cache size   : cached stores with software prefetch
     otherwise                : plain cached stores, unrolled twice
   Every strategy finishes by dispatching the last 0..127 bytes through
   table_16_128bytes.

   EBX bookkeeping: EBX carries the shared cache size.  It is spilled
   with PUSH when the size is a compile-time constant or when the build
   is not SHARED (in a SHARED build without the constant, ENTRANCE has
   already saved EBX and it is simply reloaded through the PC thunk).
   MEMSET_EBX_SPILLED records that decision so that the push, both pops
   and the unwind annotations are all keyed on the same condition.  */
	ALIGN (4)
L(128bytesormore):
#if defined SHARED_CACHE_SIZE || !defined SHARED
	PUSH (%ebx)
# define MEMSET_EBX_SPILLED 1
#endif
#ifdef SHARED_CACHE_SIZE
	mov	$SHARED_CACHE_SIZE, %ebx
#elif defined SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
#else
	mov	__x86_shared_cache_size, %ebx
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

	/* Below the shared cache size: EBX is no longer needed as a byte
	   budget, so undo the spill before comparing with the data cache
	   size.  */
#ifdef MEMSET_EBX_SPILLED
	POP (%ebx)
#endif
#ifdef DATA_CACHE_SIZE
	cmp	$DATA_CACHE_SIZE, %ecx
#elif defined SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
#else
	cmp	__x86_data_cache_size, %ecx
#endif

	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that the borrow from each "sub $128" below
	   says whether fewer than 128 bytes will remain after the current
	   chunk.  The movdqa and lea instructions leave the flags alone, so
	   jb/jae still see the result of that sub.  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	/* Second copy of the loop body (unrolled twice).  */
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Remove the -128 bias; ECX is now the true remainder (0..127).  */
	lea	128(%ecx), %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	ALIGN (4)
	/* At least the data cache size: same 128-byte chunks, with
	   prefetches 0x380 and 0x3c0 bytes ahead of the cursor.  */
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

#ifdef MEMSET_EBX_SPILLED
	/* The code below is reached only by the jump taken before the POP
	   above, so EBX is still on the stack here.  Re-establish that in
	   the unwind info; without this the CFA offset and the EBX save
	   slot would be described wrongly for the whole non-temporal
	   path.  */
	CFI_PUSH (%ebx)
#endif
L(128bytesormore_nt_start):
	/* EBX = shared cache size = number of bytes to write with cached
	   stores; ECX = bytes beyond that, to be written non-temporally.
	   NOTE(review): the loop below consumes EBX in 128-byte steps and
	   drops any remainder, so it presumably relies on the shared cache
	   size being a non-zero multiple of 128 -- confirm where the size
	   is defined.  */
	sub	%ebx, %ecx
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)
	ALIGN (4)
	/* Non-temporal 128-byte chunks for the part beyond the shared cache
	   size, followed by an sfence once the loop is done.  */
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	sfence
L(shared_cache_loop_end):
	/* Undo the spill made at L(128bytesormore), then finish the last
	   0..127 bytes through the table.  */
#ifdef MEMSET_EBX_SPILLED
	POP (%ebx)
# undef MEMSET_EBX_SPILLED
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
436
437
/* Jump table for the final 0..127 bytes of the SSE2 path, indexed by the
   byte count.  Entry N is the aligned_16_Nbytes handler.  Entries are
   built with JMPTBL, so they are offsets from the table start under
   SHARED and absolute addresses otherwise.  */
438	.pushsection .rodata.sse2,"a",@progbits
439	ALIGN (2)
440L(table_16_128bytes):
441	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
442	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
443	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
444	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
445	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
446	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
447	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
448	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
449	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
450	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
451	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
452	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
453	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
454	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
455	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
456	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
457	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
458	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
459	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
460	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
461	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
462	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
463	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
464	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
465	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
466	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
467	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
468	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
469	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
470	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
471	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
472	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
473	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
474	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
475	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
476	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
477	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
478	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
479	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
480	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
481	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
482	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
483	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
484	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
485	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
486	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
487	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
488	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
489	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
490	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
491	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
492	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
493	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
494	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
495	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
496	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
497	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
498	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
499	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
500	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
501	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
502	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
503	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
504	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
505	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
506	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
507	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
508	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
509	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
510	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
511	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
512	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
513	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
514	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
515	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
516	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
517	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
518	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
519	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
520	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
521	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
522	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
523	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
524	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
525	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
526	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
527	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
528	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
529	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
530	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
531	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
532	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
533	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
534	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
535	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
536	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
537	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
538	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
539	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
540	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
541	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
542	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
543	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
544	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
545	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
546	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
547	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
548	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
549	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
550	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
551	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
552	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
553	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
554	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
555	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
556	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
557	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
558	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
559	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
560	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
561	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
562	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
563	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
564	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
565	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
566	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
567	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
568	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
569	.popsection
570
/* Handlers for the final 0..127 bytes, reached through
   table_16_128bytes; this group covers N mod 16 == 0..3.
   On entry EDX points one past the last byte to write, XMM0 holds the
   16-byte fill pattern and EAX the 4-byte one.  aligned_16_Nbytes stores
   16 bytes at -N(%edx) and falls into aligned_16_(N-16)bytes.  Every
   dispatch site has EDX 16-byte aligned before the macro adds the
   remaining count to it, so -N(%edx) and each later address in the
   chain is 16-byte aligned, as movdqa requires.  The last label of each
   chain writes the final 0..15 bytes with narrower stores, sets the
   return value and returns.  */

/* N mod 16 == 0.  */
571	ALIGN (4)
572L(aligned_16_112bytes):
573	movdqa	%xmm0, -112(%edx)
574L(aligned_16_96bytes):
575	movdqa	%xmm0, -96(%edx)
576L(aligned_16_80bytes):
577	movdqa	%xmm0, -80(%edx)
578L(aligned_16_64bytes):
579	movdqa	%xmm0, -64(%edx)
580L(aligned_16_48bytes):
581	movdqa	%xmm0, -48(%edx)
582L(aligned_16_32bytes):
583	movdqa	%xmm0, -32(%edx)
584L(aligned_16_16bytes):
585	movdqa	%xmm0, -16(%edx)
586L(aligned_16_0bytes):
587	SETRTNVAL
588	RETURN
589
/* N mod 16 == 1: tail is one byte.  */
590	ALIGN (4)
591L(aligned_16_113bytes):
592	movdqa	%xmm0, -113(%edx)
593L(aligned_16_97bytes):
594	movdqa	%xmm0, -97(%edx)
595L(aligned_16_81bytes):
596	movdqa	%xmm0, -81(%edx)
597L(aligned_16_65bytes):
598	movdqa	%xmm0, -65(%edx)
599L(aligned_16_49bytes):
600	movdqa	%xmm0, -49(%edx)
601L(aligned_16_33bytes):
602	movdqa	%xmm0, -33(%edx)
603L(aligned_16_17bytes):
604	movdqa	%xmm0, -17(%edx)
605L(aligned_16_1bytes):
606	movb	%al, -1(%edx)
607	SETRTNVAL
608	RETURN
609
/* N mod 16 == 2: tail is one word.  */
610	ALIGN (4)
611L(aligned_16_114bytes):
612	movdqa	%xmm0, -114(%edx)
613L(aligned_16_98bytes):
614	movdqa	%xmm0, -98(%edx)
615L(aligned_16_82bytes):
616	movdqa	%xmm0, -82(%edx)
617L(aligned_16_66bytes):
618	movdqa	%xmm0, -66(%edx)
619L(aligned_16_50bytes):
620	movdqa	%xmm0, -50(%edx)
621L(aligned_16_34bytes):
622	movdqa	%xmm0, -34(%edx)
623L(aligned_16_18bytes):
624	movdqa	%xmm0, -18(%edx)
625L(aligned_16_2bytes):
626	movw	%ax, -2(%edx)
627	SETRTNVAL
628	RETURN
629
/* N mod 16 == 3: tail is a word plus a byte.  */
630	ALIGN (4)
631L(aligned_16_115bytes):
632	movdqa	%xmm0, -115(%edx)
633L(aligned_16_99bytes):
634	movdqa	%xmm0, -99(%edx)
635L(aligned_16_83bytes):
636	movdqa	%xmm0, -83(%edx)
637L(aligned_16_67bytes):
638	movdqa	%xmm0, -67(%edx)
639L(aligned_16_51bytes):
640	movdqa	%xmm0, -51(%edx)
641L(aligned_16_35bytes):
642	movdqa	%xmm0, -35(%edx)
643L(aligned_16_19bytes):
644	movdqa	%xmm0, -19(%edx)
645L(aligned_16_3bytes):
646	movw	%ax, -3(%edx)
647	movb	%al, -1(%edx)
648	SETRTNVAL
649	RETURN
650
/* Handlers for the final bytes of the SSE2 path with N mod 16 == 4..7,
   reached through table_16_128bytes.  EDX points one past the last byte
   to write, XMM0 holds the 16-byte fill pattern and EAX the 4-byte one.
   Each label stores 16 bytes at -N(%edx) and falls into the label for
   N - 16; the last label of a chain writes the remaining 4..7 bytes
   with a dword store plus word/byte stores as needed.  */

/* N mod 16 == 4: tail is one dword.  */
651	ALIGN (4)
652L(aligned_16_116bytes):
653	movdqa	%xmm0, -116(%edx)
654L(aligned_16_100bytes):
655	movdqa	%xmm0, -100(%edx)
656L(aligned_16_84bytes):
657	movdqa	%xmm0, -84(%edx)
658L(aligned_16_68bytes):
659	movdqa	%xmm0, -68(%edx)
660L(aligned_16_52bytes):
661	movdqa	%xmm0, -52(%edx)
662L(aligned_16_36bytes):
663	movdqa	%xmm0, -36(%edx)
664L(aligned_16_20bytes):
665	movdqa	%xmm0, -20(%edx)
666L(aligned_16_4bytes):
667	movl	%eax, -4(%edx)
668	SETRTNVAL
669	RETURN
670
/* N mod 16 == 5: dword + byte.  */
671	ALIGN (4)
672L(aligned_16_117bytes):
673	movdqa	%xmm0, -117(%edx)
674L(aligned_16_101bytes):
675	movdqa	%xmm0, -101(%edx)
676L(aligned_16_85bytes):
677	movdqa	%xmm0, -85(%edx)
678L(aligned_16_69bytes):
679	movdqa	%xmm0, -69(%edx)
680L(aligned_16_53bytes):
681	movdqa	%xmm0, -53(%edx)
682L(aligned_16_37bytes):
683	movdqa	%xmm0, -37(%edx)
684L(aligned_16_21bytes):
685	movdqa	%xmm0, -21(%edx)
686L(aligned_16_5bytes):
687	movl	%eax, -5(%edx)
688	movb	%al, -1(%edx)
689	SETRTNVAL
690	RETURN
691
/* N mod 16 == 6: dword + word.  */
692	ALIGN (4)
693L(aligned_16_118bytes):
694	movdqa	%xmm0, -118(%edx)
695L(aligned_16_102bytes):
696	movdqa	%xmm0, -102(%edx)
697L(aligned_16_86bytes):
698	movdqa	%xmm0, -86(%edx)
699L(aligned_16_70bytes):
700	movdqa	%xmm0, -70(%edx)
701L(aligned_16_54bytes):
702	movdqa	%xmm0, -54(%edx)
703L(aligned_16_38bytes):
704	movdqa	%xmm0, -38(%edx)
705L(aligned_16_22bytes):
706	movdqa	%xmm0, -22(%edx)
707L(aligned_16_6bytes):
708	movl	%eax, -6(%edx)
709	movw	%ax, -2(%edx)
710	SETRTNVAL
711	RETURN
712
/* N mod 16 == 7: dword + word + byte.  */
713	ALIGN (4)
714L(aligned_16_119bytes):
715	movdqa	%xmm0, -119(%edx)
716L(aligned_16_103bytes):
717	movdqa	%xmm0, -103(%edx)
718L(aligned_16_87bytes):
719	movdqa	%xmm0, -87(%edx)
720L(aligned_16_71bytes):
721	movdqa	%xmm0, -71(%edx)
722L(aligned_16_55bytes):
723	movdqa	%xmm0, -55(%edx)
724L(aligned_16_39bytes):
725	movdqa	%xmm0, -39(%edx)
726L(aligned_16_23bytes):
727	movdqa	%xmm0, -23(%edx)
728L(aligned_16_7bytes):
729	movl	%eax, -7(%edx)
730	movw	%ax, -3(%edx)
731	movb	%al, -1(%edx)
732	SETRTNVAL
733	RETURN
734
/* Handlers for the final bytes of the SSE2 path with N mod 16 == 8..11,
   reached through table_16_128bytes.  EDX points one past the last byte
   to write, XMM0 holds the 16-byte fill pattern and EAX the 4-byte one.
   Each label stores 16 bytes at -N(%edx) and falls into the label for
   N - 16; the last label of a chain writes the remaining 8..11 bytes
   with an 8-byte movq from XMM0 plus word/byte stores as needed.  */

/* N mod 16 == 8: tail is one quadword.  */
735	ALIGN (4)
736L(aligned_16_120bytes):
737	movdqa	%xmm0, -120(%edx)
738L(aligned_16_104bytes):
739	movdqa	%xmm0, -104(%edx)
740L(aligned_16_88bytes):
741	movdqa	%xmm0, -88(%edx)
742L(aligned_16_72bytes):
743	movdqa	%xmm0, -72(%edx)
744L(aligned_16_56bytes):
745	movdqa	%xmm0, -56(%edx)
746L(aligned_16_40bytes):
747	movdqa	%xmm0, -40(%edx)
748L(aligned_16_24bytes):
749	movdqa	%xmm0, -24(%edx)
750L(aligned_16_8bytes):
751	movq	%xmm0, -8(%edx)
752	SETRTNVAL
753	RETURN
754
/* N mod 16 == 9: quadword + byte.  */
755	ALIGN (4)
756L(aligned_16_121bytes):
757	movdqa	%xmm0, -121(%edx)
758L(aligned_16_105bytes):
759	movdqa	%xmm0, -105(%edx)
760L(aligned_16_89bytes):
761	movdqa	%xmm0, -89(%edx)
762L(aligned_16_73bytes):
763	movdqa	%xmm0, -73(%edx)
764L(aligned_16_57bytes):
765	movdqa	%xmm0, -57(%edx)
766L(aligned_16_41bytes):
767	movdqa	%xmm0, -41(%edx)
768L(aligned_16_25bytes):
769	movdqa	%xmm0, -25(%edx)
770L(aligned_16_9bytes):
771	movq	%xmm0, -9(%edx)
772	movb	%al, -1(%edx)
773	SETRTNVAL
774	RETURN
775
/* N mod 16 == 10: quadword + word.  */
776	ALIGN (4)
777L(aligned_16_122bytes):
778	movdqa	%xmm0, -122(%edx)
779L(aligned_16_106bytes):
780	movdqa	%xmm0, -106(%edx)
781L(aligned_16_90bytes):
782	movdqa	%xmm0, -90(%edx)
783L(aligned_16_74bytes):
784	movdqa	%xmm0, -74(%edx)
785L(aligned_16_58bytes):
786	movdqa	%xmm0, -58(%edx)
787L(aligned_16_42bytes):
788	movdqa	%xmm0, -42(%edx)
789L(aligned_16_26bytes):
790	movdqa	%xmm0, -26(%edx)
791L(aligned_16_10bytes):
792	movq	%xmm0, -10(%edx)
793	movw	%ax, -2(%edx)
794	SETRTNVAL
795	RETURN
796
/* N mod 16 == 11: quadword + word + byte.  */
797	ALIGN (4)
798L(aligned_16_123bytes):
799	movdqa	%xmm0, -123(%edx)
800L(aligned_16_107bytes):
801	movdqa	%xmm0, -107(%edx)
802L(aligned_16_91bytes):
803	movdqa	%xmm0, -91(%edx)
804L(aligned_16_75bytes):
805	movdqa	%xmm0, -75(%edx)
806L(aligned_16_59bytes):
807	movdqa	%xmm0, -59(%edx)
808L(aligned_16_43bytes):
809	movdqa	%xmm0, -43(%edx)
810L(aligned_16_27bytes):
811	movdqa	%xmm0, -27(%edx)
812L(aligned_16_11bytes):
813	movq	%xmm0, -11(%edx)
814	movw	%ax, -3(%edx)
815	movb	%al, -1(%edx)
816	SETRTNVAL
817	RETURN
818
/* Handlers for the final bytes of the SSE2 path with N mod 16 == 12..15,
   reached through table_16_128bytes.  EDX points one past the last byte
   to write, XMM0 holds the 16-byte fill pattern and EAX the 4-byte one.
   Each label stores 16 bytes at -N(%edx) and falls into the label for
   N - 16; the last label of a chain writes the remaining 12..15 bytes
   with a movq and a dword store plus word/byte stores as needed.  The
   final chain ends with RETURN_END, the return variant without a
   trailing CFI_PUSH, and is followed by END, which closes the
   function.  */

/* N mod 16 == 12: quadword + dword.  */
819	ALIGN (4)
820L(aligned_16_124bytes):
821	movdqa	%xmm0, -124(%edx)
822L(aligned_16_108bytes):
823	movdqa	%xmm0, -108(%edx)
824L(aligned_16_92bytes):
825	movdqa	%xmm0, -92(%edx)
826L(aligned_16_76bytes):
827	movdqa	%xmm0, -76(%edx)
828L(aligned_16_60bytes):
829	movdqa	%xmm0, -60(%edx)
830L(aligned_16_44bytes):
831	movdqa	%xmm0, -44(%edx)
832L(aligned_16_28bytes):
833	movdqa	%xmm0, -28(%edx)
834L(aligned_16_12bytes):
835	movq	%xmm0, -12(%edx)
836	movl	%eax, -4(%edx)
837	SETRTNVAL
838	RETURN
839
/* N mod 16 == 13: quadword + dword + byte.  */
840	ALIGN (4)
841L(aligned_16_125bytes):
842	movdqa	%xmm0, -125(%edx)
843L(aligned_16_109bytes):
844	movdqa	%xmm0, -109(%edx)
845L(aligned_16_93bytes):
846	movdqa	%xmm0, -93(%edx)
847L(aligned_16_77bytes):
848	movdqa	%xmm0, -77(%edx)
849L(aligned_16_61bytes):
850	movdqa	%xmm0, -61(%edx)
851L(aligned_16_45bytes):
852	movdqa	%xmm0, -45(%edx)
853L(aligned_16_29bytes):
854	movdqa	%xmm0, -29(%edx)
855L(aligned_16_13bytes):
856	movq	%xmm0, -13(%edx)
857	movl	%eax, -5(%edx)
858	movb	%al, -1(%edx)
859	SETRTNVAL
860	RETURN
861
/* N mod 16 == 14: quadword + dword + word.  */
862	ALIGN (4)
863L(aligned_16_126bytes):
864	movdqa	%xmm0, -126(%edx)
865L(aligned_16_110bytes):
866	movdqa	%xmm0, -110(%edx)
867L(aligned_16_94bytes):
868	movdqa	%xmm0, -94(%edx)
869L(aligned_16_78bytes):
870	movdqa	%xmm0, -78(%edx)
871L(aligned_16_62bytes):
872	movdqa	%xmm0, -62(%edx)
873L(aligned_16_46bytes):
874	movdqa	%xmm0, -46(%edx)
875L(aligned_16_30bytes):
876	movdqa	%xmm0, -30(%edx)
877L(aligned_16_14bytes):
878	movq	%xmm0, -14(%edx)
879	movl	%eax, -6(%edx)
880	movw	%ax, -2(%edx)
881	SETRTNVAL
882	RETURN
883
/* N mod 16 == 15: quadword + dword + word + byte.  */
884	ALIGN (4)
885L(aligned_16_127bytes):
886	movdqa	%xmm0, -127(%edx)
887L(aligned_16_111bytes):
888	movdqa	%xmm0, -111(%edx)
889L(aligned_16_95bytes):
890	movdqa	%xmm0, -95(%edx)
891L(aligned_16_79bytes):
892	movdqa	%xmm0, -79(%edx)
893L(aligned_16_63bytes):
894	movdqa	%xmm0, -63(%edx)
895L(aligned_16_47bytes):
896	movdqa	%xmm0, -47(%edx)
897L(aligned_16_31bytes):
898	movdqa	%xmm0, -31(%edx)
899L(aligned_16_15bytes):
900	movq	%xmm0, -15(%edx)
901	movl	%eax, -7(%edx)
902	movw	%ax, -3(%edx)
903	movb	%al, -1(%edx)
904	SETRTNVAL
905	RETURN_END
906
907END (sse2_memset5_atom)
908