sse2-memset-atom.S revision 5a92284167ffba6d45210ef6889fa7d255c15d4f
1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include "cache.h"
32
/* L(label) builds an assembler-local label by pasting ".L" in front of the
   given name.  Defined only if the includer has not provided its own.  */
33#ifndef L
34# define L(label)	.L##label
35#endif
36
/* ALIGN(n) expands to ".p2align n"; n is the power-of-two exponent, so
   ALIGN (4) requests 16-byte alignment and ALIGN (2) 4-byte alignment.  */
37#ifndef ALIGN
38# define ALIGN(n)	.p2align n
39#endif
40
/* The cfi_* macros below are one-to-one wrappers around the assembler's
   .cfi_* unwind directives.  Each one is overridable: it is defined here
   only when no earlier definition exists.  */
41#ifndef cfi_startproc
42# define cfi_startproc			.cfi_startproc
43#endif
44
45#ifndef cfi_endproc
46# define cfi_endproc			.cfi_endproc
47#endif
48
49#ifndef cfi_rel_offset
50# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
51#endif
52
53#ifndef cfi_restore
54# define cfi_restore(reg)		.cfi_restore reg
55#endif
56
57#ifndef cfi_adjust_cfa_offset
58# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
59#endif
60
/* ENTRY(name) opens a global function: it marks NAME as a function symbol,
   exports it, aligns it to 16 bytes (.p2align 4), emits the label and
   starts the CFI region.  */
61#ifndef ENTRY
62# define ENTRY(name)			\
63	.type name,  @function; 	\
64	.globl name;			\
65	.p2align 4;			\
66name:					\
67	cfi_startproc
68#endif
69
/* END(name) closes what ENTRY opened: it ends the CFI region and records
   the symbol size as the distance from NAME to the current location.  */
70#ifndef END
71# define END(name)			\
72	cfi_endproc;			\
73	.size name, .-name
74#endif
75
/* CFI_PUSH(REG) emits only unwind information (no instruction): the CFA
   moved by 4 bytes and REG is now saved at offset 0 from the stack
   pointer.  */
76#define CFI_PUSH(REG)						\
77  cfi_adjust_cfa_offset (4);					\
78  cfi_rel_offset (REG, 0)
79
/* CFI_POP(REG) is the inverse annotation: the CFA moved back by 4 bytes and
   REG holds its original value again.  */
80#define CFI_POP(REG)						\
81  cfi_adjust_cfa_offset (-4);					\
82  cfi_restore (REG)
83
/* PUSH/POP pair the real 32-bit pushl/popl with the matching CFI
   annotation so the unwind tables track every stack adjustment.  */
84#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
85#define POP(REG)	popl REG; CFI_POP (REG)
86
/* Stack offsets (relative to ESP after ENTRANCE) of the incoming arguments.
   PARMS is the offset of the first argument and is defined further down.
   bzero variant:  (dest, len); there is no fill byte and SETRTNVAL is
                   empty, so EAX is not loaded before returning.
   memset variant: (dest, chr, len); SETRTNVAL reloads the original dest
                   pointer from the stack into EAX as the return value.  */
87#ifdef USE_AS_BZERO
88# define DEST		PARMS
89# define LEN		DEST+4
90# define SETRTNVAL
91#else
92# define DEST		PARMS
93# define CHR		DEST+4
94# define LEN		CHR+4
95# define SETRTNVAL	movl DEST(%esp), %eax
96#endif
97
/* Position-independent build.  EBX is needed as a scratch/PC register for
   the jump-table dispatch, so it is saved on entry and restored before
   every return:
     ENTRANCE    pushes EBX.
     RETURN_END  pops EBX and returns.
     RETURN      does the same, then re-emits the "EBX is pushed" CFI state
                 for the code that textually follows the ret.
     PARMS       is 8: return address (4) plus the saved EBX (4).
     JMPTBL      stores each table entry as an offset relative to the table
                 base B rather than as an absolute address.  */
98#if (defined SHARED || defined __PIC__)
99# define ENTRANCE	PUSH (%ebx);
100# define RETURN_END	POP (%ebx); ret
101# define RETURN		RETURN_END; CFI_PUSH (%ebx)
102# define PARMS		8		/* Preserve EBX.  */
103# define JMPTBL(I, B)	I - B
104
/* In addition to branching, the macro adds ECX (the table index, i.e. the
   remaining byte count) to EDX.  EBX is clobbered.  */
105/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
106   jump table with relative offsets.   */
107# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
108    /* We first load PC into EBX.  */				\
109    call	__x86.get_pc_thunk.bx;				\
110    /* Get the address of the jump table.  */			\
111    add		$(TABLE - .), %ebx;				\
112    /* Get the entry and convert the relative offset to the	\
113       absolute address.  */					\
114    add		(%ebx,%ecx,4), %ebx;				\
115    add		%ecx, %edx;					\
116    /* We loaded the jump table and adjusted EDX. Go.  */	\
117    jmp		*%ebx
118
/* PC thunk: copies its own return address (the address of the instruction
   following the call) from the top of the stack into EBX.  It is emitted
   into a link-once section with hidden visibility.  */
119	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
120	.globl	__x86.get_pc_thunk.bx
121	.hidden	__x86.get_pc_thunk.bx
122	ALIGN (4)
123	.type	__x86.get_pc_thunk.bx,@function
124__x86.get_pc_thunk.bx:
125	movl	(%esp), %ebx
126	ret
/* Non-PIC build.  Nothing is saved on entry, so the first argument sits
   right above the return address (PARMS is 4), RETURN is a plain ret and
   the jump tables hold absolute addresses.  */
127#else
128# define ENTRANCE
129# define RETURN_END	ret
130# define RETURN		RETURN_END
131# define PARMS		4
132# define JMPTBL(I, B)	I
133
/* As in the PIC variant, ECX is the table index and is added to EDX before
   the indirect jump.  */
134/* Branch to an entry in a jump table.  TABLE is a jump table with
135   absolute offsets.  */
136# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
137    add		%ecx, %edx;					\
138    jmp		*TABLE(,%ecx,4)
139#endif
140
/* The exported symbol name defaults to memset; a wrapper may define MEMSET
   (and USE_AS_BZERO) before this point to build another entry point from
   the same body.  */
141#ifndef MEMSET
142# define MEMSET memset
143#endif
144
/* Function entry.  Arguments are read from the stack (see DEST/CHR/LEN).
   Register roles established here and used by the rest of the function:
     ECX = number of bytes still to write
     EDX = destination cursor
     EAX = fill byte replicated into all four bytes (0 for the bzero build)
   Lengths below 32 are dispatched straight through table_less_32bytes;
   everything else goes to L(32bytesormore).  */
145	.section .text.sse2,"ax",@progbits
146	ALIGN (4)
147ENTRY (MEMSET)
148	ENTRANCE
149
150	movl	LEN(%esp), %ecx
151#ifdef USE_AS_BZERO
152	xor	%eax, %eax
153#else
	/* Zero-extend the fill byte, then copy AL to AH so AX = cc.  */
154	movzbl	CHR(%esp), %eax
155	movb	%al, %ah
156	/* Fill the whole EAX with pattern.  */
	/* EDX is only a temporary here; it is loaded with DEST just below.  */
157	movl	%eax, %edx
158	shl	$16, %eax
159	or	%edx, %eax
160#endif
161	movl	DEST(%esp), %edx
	/* Unsigned compare: the length is treated as an unsigned count.  */
162	cmp	$32, %ecx
163	jae	L(32bytesormore)
164
/* ECX < 32.  The dispatch macro adds ECX to EDX, so the selected
   L(write_Nbytes) entry sees EDX pointing one byte past the end of the
   region and stores at negative offsets.  */
165L(write_less32bytes):
166	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
167
168
/* Jump table for lengths 0..31, one 32-bit entry per length and indexed by
   the byte count in ECX.  Entry N targets L(write_Nbytes).  Depending on
   JMPTBL the entries are absolute addresses or offsets from the table
   base.  The table lives in .rodata.sse2 and is 4-byte aligned.  */
169	.pushsection .rodata.sse2,"a",@progbits
170	ALIGN (2)
171L(table_less_32bytes):
172	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
173	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
174	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
175	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
176	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
177	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
178	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
179	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
180	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
181	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
182	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
183	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
184	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
185	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
186	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
187	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
188	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
189	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
190	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
191	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
192	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
193	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
194	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
195	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
196	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
197	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
198	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
199	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
200	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
201	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
202	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
203	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
204	.popsection
205
/* Tails for lengths 0..31, reached only through table_less_32bytes.
   On entry EDX points one byte past the end of the region and EAX holds
   the 4-byte fill pattern.  The labels are grouped by N mod 4; each label
   stores 4 bytes at -N(%edx) and falls through to the label for N-4, so
   entering at L(write_Nbytes) writes exactly the last N bytes.  The final
   1..3 bytes of a chain are written with byte/word stores.  Every chain
   ends by loading the return value (memset only) and returning.  */
/* N mod 4 == 0.  */
206	ALIGN (4)
207L(write_28bytes):
208	movl	%eax, -28(%edx)
209L(write_24bytes):
210	movl	%eax, -24(%edx)
211L(write_20bytes):
212	movl	%eax, -20(%edx)
213L(write_16bytes):
214	movl	%eax, -16(%edx)
215L(write_12bytes):
216	movl	%eax, -12(%edx)
217L(write_8bytes):
218	movl	%eax, -8(%edx)
219L(write_4bytes):
220	movl	%eax, -4(%edx)
221L(write_0bytes):
222	SETRTNVAL
223	RETURN
224
/* N mod 4 == 1: dword stores, then one trailing byte.  */
225	ALIGN (4)
226L(write_29bytes):
227	movl	%eax, -29(%edx)
228L(write_25bytes):
229	movl	%eax, -25(%edx)
230L(write_21bytes):
231	movl	%eax, -21(%edx)
232L(write_17bytes):
233	movl	%eax, -17(%edx)
234L(write_13bytes):
235	movl	%eax, -13(%edx)
236L(write_9bytes):
237	movl	%eax, -9(%edx)
238L(write_5bytes):
239	movl	%eax, -5(%edx)
240L(write_1bytes):
241	movb	%al, -1(%edx)
242	SETRTNVAL
243	RETURN
244
/* N mod 4 == 2: dword stores, then one trailing word.  */
245	ALIGN (4)
246L(write_30bytes):
247	movl	%eax, -30(%edx)
248L(write_26bytes):
249	movl	%eax, -26(%edx)
250L(write_22bytes):
251	movl	%eax, -22(%edx)
252L(write_18bytes):
253	movl	%eax, -18(%edx)
254L(write_14bytes):
255	movl	%eax, -14(%edx)
256L(write_10bytes):
257	movl	%eax, -10(%edx)
258L(write_6bytes):
259	movl	%eax, -6(%edx)
260L(write_2bytes):
261	movw	%ax, -2(%edx)
262	SETRTNVAL
263	RETURN
264
/* N mod 4 == 3: dword stores, then a word followed by a byte.  */
265	ALIGN (4)
266L(write_31bytes):
267	movl	%eax, -31(%edx)
268L(write_27bytes):
269	movl	%eax, -27(%edx)
270L(write_23bytes):
271	movl	%eax, -23(%edx)
272L(write_19bytes):
273	movl	%eax, -19(%edx)
274L(write_15bytes):
275	movl	%eax, -15(%edx)
276L(write_11bytes):
277	movl	%eax, -11(%edx)
278L(write_7bytes):
279	movl	%eax, -7(%edx)
280L(write_3bytes):
281	movw	%ax, -3(%edx)
282	movb	%al, -1(%edx)
283	SETRTNVAL
284	RETURN
285
286	ALIGN (4)
287/* ECX >= 32 (reached via jae).  No alignment of EDX is assumed here.  */
288L(32bytesormore):
289	/* Fill xmm0 with the pattern.  */
290#ifdef USE_AS_BZERO
291	pxor	%xmm0, %xmm0
292#else
	/* Move the 4-byte pattern into the low dword of xmm0, then broadcast
	   that dword to all four dword lanes.  */
293	movd	%eax, %xmm0
294	pshufd	$0, %xmm0, %xmm0
295#endif
296	testl	$0xf, %edx
297	jz	L(aligned_16)
298/* ECX >= 32 and EDX is not 16 byte aligned.  */
299L(not_aligned_16):
	/* Cover the unaligned head with one unaligned 16-byte store, then
	   round EDX up to the next 16-byte boundary.  EAX is used as scratch
	   to compute (old EDX - new EDX), a value in -15..-1, which is added
	   to ECX so it no longer counts the skipped head bytes.  The store
	   overlaps the first aligned chunk; those bytes are written again
	   with the same value.  */
300	movdqu	%xmm0, (%edx)
301	movl	%edx, %eax
302	and	$-16, %edx
303	add	$16, %edx
304	sub	%edx, %eax
305	add	%eax, %ecx
	/* Restore the 4-byte pattern in EAX for the byte/word/dword tails.  */
306	movd	%xmm0, %eax
307
/* EDX is 16-byte aligned from here on.  Fewer than 128 remaining bytes are
   finished directly by the table_16_128bytes dispatch.  */
308	ALIGN (4)
309L(aligned_16):
310	cmp	$128, %ecx
311	jae	L(128bytesormore)
312
313L(aligned_16_less128bytes):
314	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
315
/* ECX >= 128 and EDX is 16-byte aligned.  Pick a store strategy by
   comparing the length with the cache sizes:
     ECX >= shared cache size -> L(128bytesormore_nt_start)
     ECX >= data cache size   -> L(128bytes_L2_normal) (prefetching loop)
     otherwise                -> L(128bytesormore_normal)
   The cache sizes come from the SHARED_CACHE_SIZE / DATA_CACHE_SIZE
   constants when those are defined, else from the
   __x86_shared_cache_size / __x86_data_cache_size variables.  */
316	ALIGN (4)
317L(128bytesormore):
	/* Load the shared cache size into EBX.  In the PIC variable-load path
	   EBX is not pushed again here; ENTRANCE has already saved it.  */
	/* NOTE(review): this PUSH is keyed on SHARED_CACHE_SIZE while the
	   matching POPs below and at L(shared_cache_loop_end) are keyed on
	   DATA_CACHE_SIZE.  The stack appears to balance only when both macros
	   are defined, or both undefined, together - confirm against
	   cache.h.  */
318#ifdef SHARED_CACHE_SIZE
319	PUSH (%ebx)
320	mov	$SHARED_CACHE_SIZE, %ebx
321#else
322# if (defined SHARED || defined __PIC__)
323	call	__x86.get_pc_thunk.bx
324	add	$_GLOBAL_OFFSET_TABLE_, %ebx
325	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
326# else
327	PUSH (%ebx)
328	mov	__x86_shared_cache_size, %ebx
329# endif
330#endif
331	cmp	%ebx, %ecx
332	jae	L(128bytesormore_nt_start)
333
334
	/* Below the shared cache size: EBX is no longer needed as a size.
	   RESTORE_EBX_STATE is defined here and expanded just before
	   L(128bytesormore_nt_start); it emits CFI only (or nothing), so that
	   label is again described as having EBX pushed.  */
335#ifdef DATA_CACHE_SIZE
336	POP (%ebx)
337# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
338	cmp	$DATA_CACHE_SIZE, %ecx
339#else
340# if (defined SHARED || defined __PIC__)
341#  define RESTORE_EBX_STATE
342	call	__x86.get_pc_thunk.bx
343	add	$_GLOBAL_OFFSET_TABLE_, %ebx
344	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
345# else
346	POP (%ebx)
347#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
348	cmp	__x86_data_cache_size, %ecx
349# endif
350#endif
351
352	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so the borrow from the "sub $128" inside the loop
	   means fewer than 128 bytes remain after the chunk just written.  */
353	subl	$128, %ecx
/* Loop unrolled twice; each half writes one 128-byte chunk with aligned
   stores.  EDX is advanced with lea rather than add so the flags set by
   the preceding sub are still intact at the conditional jump.  */
354L(128bytesormore_normal):
355	sub	$128, %ecx
356	movdqa	%xmm0, (%edx)
357	movdqa	%xmm0, 0x10(%edx)
358	movdqa	%xmm0, 0x20(%edx)
359	movdqa	%xmm0, 0x30(%edx)
360	movdqa	%xmm0, 0x40(%edx)
361	movdqa	%xmm0, 0x50(%edx)
362	movdqa	%xmm0, 0x60(%edx)
363	movdqa	%xmm0, 0x70(%edx)
364	lea	128(%edx), %edx
365	jb	L(128bytesless_normal)
366
367
368	sub	$128, %ecx
369	movdqa	%xmm0, (%edx)
370	movdqa	%xmm0, 0x10(%edx)
371	movdqa	%xmm0, 0x20(%edx)
372	movdqa	%xmm0, 0x30(%edx)
373	movdqa	%xmm0, 0x40(%edx)
374	movdqa	%xmm0, 0x50(%edx)
375	movdqa	%xmm0, 0x60(%edx)
376	movdqa	%xmm0, 0x70(%edx)
377	lea	128(%edx), %edx
378	jae	L(128bytesormore_normal)
379
/* Undo the last subtraction that borrowed, leaving ECX as the true
   remainder (0..127), and finish through the jump table.  */
380L(128bytesless_normal):
381	add	$128, %ecx
382	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
383
/* Length is at least the data cache size but below the shared cache size:
   same 128-byte aligned chunks, with two prefetcht0 hints at 0x380 and
   0x3c0 bytes ahead of the current chunk.  ECX is not biased here; the
   loop condition is an explicit compare against 128.  */
384	ALIGN (4)
385L(128bytes_L2_normal):
386	prefetcht0	0x380(%edx)
387	prefetcht0	0x3c0(%edx)
388	sub	$128, %ecx
389	movdqa	%xmm0, (%edx)
390	movaps	%xmm0, 0x10(%edx)
391	movaps	%xmm0, 0x20(%edx)
392	movaps	%xmm0, 0x30(%edx)
393	movaps	%xmm0, 0x40(%edx)
394	movaps	%xmm0, 0x50(%edx)
395	movaps	%xmm0, 0x60(%edx)
396	movaps	%xmm0, 0x70(%edx)
397	add	$128, %edx
398	cmp	$128, %ecx
399	jae	L(128bytes_L2_normal)
400
/* ECX < 128 remaining; finish through the jump table.  */
401L(128bytesless_L2_normal):
402	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
403
/* Very large fill: ECX >= shared cache size, which is held in EBX.
   EDX is 16-byte aligned.  The work is split in two phases:
     1. The first (EBX rounded down to a multiple of 128) bytes are written
        with ordinary aligned stores plus prefetch hints.
     2. The rest is written in 128-byte chunks with non-temporal stores
        (movntdq), followed by an sfence.
   Whatever is left (< 128 bytes) goes through table_16_128bytes.
   RESTORE_EBX_STATE emits no instruction; it only adjusts the CFI state
   for this label (see its definition).  */
404	RESTORE_EBX_STATE
405L(128bytesormore_nt_start):
	/* ECX = len - EBX + (EBX & 0x7f), i.e. the bytes left for phase 2 and
	   the tail once phase 1 has written EBX rounded down to 128.  EAX is
	   scratch here and is reloaded with the pattern from xmm0.  */
406	sub	%ebx, %ecx
407	mov	%ebx, %eax
408	and	$0x7f, %eax
409	add	%eax, %ecx
410	movd	%xmm0, %eax
411	ALIGN (4)
/* Phase 1: EBX counts down the bytes still to be written with ordinary
   stores.  The loop body runs at least once before EBX is tested.  */
412L(128bytesormore_shared_cache_loop):
413	prefetcht0	0x3c0(%edx)
414	prefetcht0	0x380(%edx)
415	sub	$0x80, %ebx
416	movdqa	%xmm0, (%edx)
417	movdqa	%xmm0, 0x10(%edx)
418	movdqa	%xmm0, 0x20(%edx)
419	movdqa	%xmm0, 0x30(%edx)
420	movdqa	%xmm0, 0x40(%edx)
421	movdqa	%xmm0, 0x50(%edx)
422	movdqa	%xmm0, 0x60(%edx)
423	movdqa	%xmm0, 0x70(%edx)
424	add	$0x80, %edx
425	cmp	$0x80, %ebx
426	jae	L(128bytesormore_shared_cache_loop)
	/* Skip phase 2 (and its sfence) when fewer than 128 bytes remain.  */
427	cmp	$0x80, %ecx
428	jb	L(shared_cache_loop_end)
429	ALIGN (4)
/* Phase 2: non-temporal 128-byte chunks while ECX >= 128.  */
430L(128bytesormore_nt):
431	sub	$0x80, %ecx
432	movntdq	%xmm0, (%edx)
433	movntdq	%xmm0, 0x10(%edx)
434	movntdq	%xmm0, 0x20(%edx)
435	movntdq	%xmm0, 0x30(%edx)
436	movntdq	%xmm0, 0x40(%edx)
437	movntdq	%xmm0, 0x50(%edx)
438	movntdq	%xmm0, 0x60(%edx)
439	movntdq	%xmm0, 0x70(%edx)
440	add	$0x80, %edx
441	cmp	$0x80, %ecx
442	jae	L(128bytesormore_nt)
	/* Store fence after the non-temporal stores, before the ordinary
	   stores of the tail.  */
443	sfence
444L(shared_cache_loop_end):
	/* Drop the EBX copy pushed at L(128bytesormore), in the configurations
	   selected by this condition.  */
445#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
446	POP (%ebx)
447#endif
	/* ECX < 128: write the remaining bytes through the jump table.  */
448	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
449
450
/* Jump table for remainders 0..127, used once EDX is 16-byte aligned.
   It is indexed by the remaining byte count in ECX; entry N targets
   L(aligned_16_Nbytes).  Depending on JMPTBL the entries are absolute
   addresses or offsets from the table base.  The table lives in
   .rodata.sse2 and is 4-byte aligned.  */
451	.pushsection .rodata.sse2,"a",@progbits
452	ALIGN (2)
453L(table_16_128bytes):
454	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
455	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
456	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
457	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
458	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
459	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
460	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
461	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
462	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
463	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
464	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
465	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
466	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
467	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
468	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
469	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
470	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
471	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
472	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
473	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
474	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
475	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
476	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
477	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
478	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
479	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
480	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
481	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
482	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
483	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
484	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
485	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
486	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
487	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
488	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
489	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
490	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
491	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
492	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
493	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
494	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
495	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
496	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
497	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
498	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
499	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
500	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
501	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
502	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
503	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
504	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
505	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
506	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
507	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
508	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
509	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
510	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
511	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
512	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
513	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
514	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
515	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
516	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
517	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
518	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
519	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
520	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
521	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
522	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
523	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
524	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
525	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
526	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
527	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
528	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
529	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
530	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
531	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
532	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
533	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
534	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
535	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
536	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
537	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
538	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
539	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
540	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
541	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
542	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
543	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
544	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
545	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
546	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
547	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
548	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
549	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
550	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
551	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
552	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
553	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
554	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
555	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
556	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
557	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
558	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
559	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
560	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
561	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
562	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
563	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
564	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
565	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
566	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
567	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
568	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
569	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
570	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
571	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
572	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
573	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
574	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
575	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
576	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
577	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
578	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
579	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
580	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
581	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
582	.popsection
583
/* Tails for remainders N = 0..127 with N mod 16 in 0..3, reached only
   through table_16_128bytes.  On entry EDX points one byte past the end of
   the region (the dispatch added N to a 16-byte aligned cursor), xmm0
   holds the 16-byte pattern and EAX the 4-byte pattern.  Each label stores
   16 bytes at -N(%edx) and falls through to the label for N-16.  Because
   EDX - N is the aligned cursor, every movdqa address is 16-byte aligned.
   The last N mod 16 bytes are written with scalar stores.  */
/* N mod 16 == 0.  */
584	ALIGN (4)
585L(aligned_16_112bytes):
586	movdqa	%xmm0, -112(%edx)
587L(aligned_16_96bytes):
588	movdqa	%xmm0, -96(%edx)
589L(aligned_16_80bytes):
590	movdqa	%xmm0, -80(%edx)
591L(aligned_16_64bytes):
592	movdqa	%xmm0, -64(%edx)
593L(aligned_16_48bytes):
594	movdqa	%xmm0, -48(%edx)
595L(aligned_16_32bytes):
596	movdqa	%xmm0, -32(%edx)
597L(aligned_16_16bytes):
598	movdqa	%xmm0, -16(%edx)
599L(aligned_16_0bytes):
600	SETRTNVAL
601	RETURN
602
/* N mod 16 == 1: trailing byte.  */
603	ALIGN (4)
604L(aligned_16_113bytes):
605	movdqa	%xmm0, -113(%edx)
606L(aligned_16_97bytes):
607	movdqa	%xmm0, -97(%edx)
608L(aligned_16_81bytes):
609	movdqa	%xmm0, -81(%edx)
610L(aligned_16_65bytes):
611	movdqa	%xmm0, -65(%edx)
612L(aligned_16_49bytes):
613	movdqa	%xmm0, -49(%edx)
614L(aligned_16_33bytes):
615	movdqa	%xmm0, -33(%edx)
616L(aligned_16_17bytes):
617	movdqa	%xmm0, -17(%edx)
618L(aligned_16_1bytes):
619	movb	%al, -1(%edx)
620	SETRTNVAL
621	RETURN
622
/* N mod 16 == 2: trailing word.  */
623	ALIGN (4)
624L(aligned_16_114bytes):
625	movdqa	%xmm0, -114(%edx)
626L(aligned_16_98bytes):
627	movdqa	%xmm0, -98(%edx)
628L(aligned_16_82bytes):
629	movdqa	%xmm0, -82(%edx)
630L(aligned_16_66bytes):
631	movdqa	%xmm0, -66(%edx)
632L(aligned_16_50bytes):
633	movdqa	%xmm0, -50(%edx)
634L(aligned_16_34bytes):
635	movdqa	%xmm0, -34(%edx)
636L(aligned_16_18bytes):
637	movdqa	%xmm0, -18(%edx)
638L(aligned_16_2bytes):
639	movw	%ax, -2(%edx)
640	SETRTNVAL
641	RETURN
642
/* N mod 16 == 3: trailing word + byte.  */
643	ALIGN (4)
644L(aligned_16_115bytes):
645	movdqa	%xmm0, -115(%edx)
646L(aligned_16_99bytes):
647	movdqa	%xmm0, -99(%edx)
648L(aligned_16_83bytes):
649	movdqa	%xmm0, -83(%edx)
650L(aligned_16_67bytes):
651	movdqa	%xmm0, -67(%edx)
652L(aligned_16_51bytes):
653	movdqa	%xmm0, -51(%edx)
654L(aligned_16_35bytes):
655	movdqa	%xmm0, -35(%edx)
656L(aligned_16_19bytes):
657	movdqa	%xmm0, -19(%edx)
658L(aligned_16_3bytes):
659	movw	%ax, -3(%edx)
660	movb	%al, -1(%edx)
661	SETRTNVAL
662	RETURN
663
/* Tails for remainders N with N mod 16 in 4..7, reached only through
   table_16_128bytes.  On entry EDX points one byte past the end of the
   region, xmm0 holds the 16-byte pattern and EAX the 4-byte pattern.  Each
   label stores 16 bytes at -N(%edx) (a 16-byte aligned address, since the
   dispatch added N to an aligned cursor) and falls through to the label
   for N-16.  The last 4..7 bytes are written with a dword store followed
   by word/byte stores as needed.  */
/* N mod 16 == 4: trailing dword.  */
664	ALIGN (4)
665L(aligned_16_116bytes):
666	movdqa	%xmm0, -116(%edx)
667L(aligned_16_100bytes):
668	movdqa	%xmm0, -100(%edx)
669L(aligned_16_84bytes):
670	movdqa	%xmm0, -84(%edx)
671L(aligned_16_68bytes):
672	movdqa	%xmm0, -68(%edx)
673L(aligned_16_52bytes):
674	movdqa	%xmm0, -52(%edx)
675L(aligned_16_36bytes):
676	movdqa	%xmm0, -36(%edx)
677L(aligned_16_20bytes):
678	movdqa	%xmm0, -20(%edx)
679L(aligned_16_4bytes):
680	movl	%eax, -4(%edx)
681	SETRTNVAL
682	RETURN
683
/* N mod 16 == 5: trailing dword + byte.  */
684	ALIGN (4)
685L(aligned_16_117bytes):
686	movdqa	%xmm0, -117(%edx)
687L(aligned_16_101bytes):
688	movdqa	%xmm0, -101(%edx)
689L(aligned_16_85bytes):
690	movdqa	%xmm0, -85(%edx)
691L(aligned_16_69bytes):
692	movdqa	%xmm0, -69(%edx)
693L(aligned_16_53bytes):
694	movdqa	%xmm0, -53(%edx)
695L(aligned_16_37bytes):
696	movdqa	%xmm0, -37(%edx)
697L(aligned_16_21bytes):
698	movdqa	%xmm0, -21(%edx)
699L(aligned_16_5bytes):
700	movl	%eax, -5(%edx)
701	movb	%al, -1(%edx)
702	SETRTNVAL
703	RETURN
704
/* N mod 16 == 6: trailing dword + word.  */
705	ALIGN (4)
706L(aligned_16_118bytes):
707	movdqa	%xmm0, -118(%edx)
708L(aligned_16_102bytes):
709	movdqa	%xmm0, -102(%edx)
710L(aligned_16_86bytes):
711	movdqa	%xmm0, -86(%edx)
712L(aligned_16_70bytes):
713	movdqa	%xmm0, -70(%edx)
714L(aligned_16_54bytes):
715	movdqa	%xmm0, -54(%edx)
716L(aligned_16_38bytes):
717	movdqa	%xmm0, -38(%edx)
718L(aligned_16_22bytes):
719	movdqa	%xmm0, -22(%edx)
720L(aligned_16_6bytes):
721	movl	%eax, -6(%edx)
722	movw	%ax, -2(%edx)
723	SETRTNVAL
724	RETURN
725
/* N mod 16 == 7: trailing dword + word + byte.  */
726	ALIGN (4)
727L(aligned_16_119bytes):
728	movdqa	%xmm0, -119(%edx)
729L(aligned_16_103bytes):
730	movdqa	%xmm0, -103(%edx)
731L(aligned_16_87bytes):
732	movdqa	%xmm0, -87(%edx)
733L(aligned_16_71bytes):
734	movdqa	%xmm0, -71(%edx)
735L(aligned_16_55bytes):
736	movdqa	%xmm0, -55(%edx)
737L(aligned_16_39bytes):
738	movdqa	%xmm0, -39(%edx)
739L(aligned_16_23bytes):
740	movdqa	%xmm0, -23(%edx)
741L(aligned_16_7bytes):
742	movl	%eax, -7(%edx)
743	movw	%ax, -3(%edx)
744	movb	%al, -1(%edx)
745	SETRTNVAL
746	RETURN
747
/* Tails for remainders N with N mod 16 in 8..11, reached only through
   table_16_128bytes.  On entry EDX points one byte past the end of the
   region, xmm0 holds the 16-byte pattern and EAX the 4-byte pattern.  Each
   label stores 16 bytes at -N(%edx) (a 16-byte aligned address, since the
   dispatch added N to an aligned cursor) and falls through to the label
   for N-16.  The last 8..11 bytes are written with a movq of the low
   8 bytes of xmm0 followed by word/byte stores as needed.  */
/* N mod 16 == 8: trailing qword.  */
748	ALIGN (4)
749L(aligned_16_120bytes):
750	movdqa	%xmm0, -120(%edx)
751L(aligned_16_104bytes):
752	movdqa	%xmm0, -104(%edx)
753L(aligned_16_88bytes):
754	movdqa	%xmm0, -88(%edx)
755L(aligned_16_72bytes):
756	movdqa	%xmm0, -72(%edx)
757L(aligned_16_56bytes):
758	movdqa	%xmm0, -56(%edx)
759L(aligned_16_40bytes):
760	movdqa	%xmm0, -40(%edx)
761L(aligned_16_24bytes):
762	movdqa	%xmm0, -24(%edx)
763L(aligned_16_8bytes):
764	movq	%xmm0, -8(%edx)
765	SETRTNVAL
766	RETURN
767
/* N mod 16 == 9: trailing qword + byte.  */
768	ALIGN (4)
769L(aligned_16_121bytes):
770	movdqa	%xmm0, -121(%edx)
771L(aligned_16_105bytes):
772	movdqa	%xmm0, -105(%edx)
773L(aligned_16_89bytes):
774	movdqa	%xmm0, -89(%edx)
775L(aligned_16_73bytes):
776	movdqa	%xmm0, -73(%edx)
777L(aligned_16_57bytes):
778	movdqa	%xmm0, -57(%edx)
779L(aligned_16_41bytes):
780	movdqa	%xmm0, -41(%edx)
781L(aligned_16_25bytes):
782	movdqa	%xmm0, -25(%edx)
783L(aligned_16_9bytes):
784	movq	%xmm0, -9(%edx)
785	movb	%al, -1(%edx)
786	SETRTNVAL
787	RETURN
788
/* N mod 16 == 10: trailing qword + word.  */
789	ALIGN (4)
790L(aligned_16_122bytes):
791	movdqa	%xmm0, -122(%edx)
792L(aligned_16_106bytes):
793	movdqa	%xmm0, -106(%edx)
794L(aligned_16_90bytes):
795	movdqa	%xmm0, -90(%edx)
796L(aligned_16_74bytes):
797	movdqa	%xmm0, -74(%edx)
798L(aligned_16_58bytes):
799	movdqa	%xmm0, -58(%edx)
800L(aligned_16_42bytes):
801	movdqa	%xmm0, -42(%edx)
802L(aligned_16_26bytes):
803	movdqa	%xmm0, -26(%edx)
804L(aligned_16_10bytes):
805	movq	%xmm0, -10(%edx)
806	movw	%ax, -2(%edx)
807	SETRTNVAL
808	RETURN
809
/* N mod 16 == 11: trailing qword + word + byte.  */
810	ALIGN (4)
811L(aligned_16_123bytes):
812	movdqa	%xmm0, -123(%edx)
813L(aligned_16_107bytes):
814	movdqa	%xmm0, -107(%edx)
815L(aligned_16_91bytes):
816	movdqa	%xmm0, -91(%edx)
817L(aligned_16_75bytes):
818	movdqa	%xmm0, -75(%edx)
819L(aligned_16_59bytes):
820	movdqa	%xmm0, -59(%edx)
821L(aligned_16_43bytes):
822	movdqa	%xmm0, -43(%edx)
823L(aligned_16_27bytes):
824	movdqa	%xmm0, -27(%edx)
825L(aligned_16_11bytes):
826	movq	%xmm0, -11(%edx)
827	movw	%ax, -3(%edx)
828	movb	%al, -1(%edx)
829	SETRTNVAL
830	RETURN
831
/* Tails for remainders N with N mod 16 in 12..15, reached only through
   table_16_128bytes.  On entry EDX points one byte past the end of the
   region, xmm0 holds the 16-byte pattern and EAX the 4-byte pattern.  Each
   label stores 16 bytes at -N(%edx) (a 16-byte aligned address, since the
   dispatch added N to an aligned cursor) and falls through to the label
   for N-16.  The last 12..15 bytes are written with a movq of the low
   8 bytes of xmm0, a dword store, and word/byte stores as needed.  */
/* N mod 16 == 12: trailing qword + dword.  */
832	ALIGN (4)
833L(aligned_16_124bytes):
834	movdqa	%xmm0, -124(%edx)
835L(aligned_16_108bytes):
836	movdqa	%xmm0, -108(%edx)
837L(aligned_16_92bytes):
838	movdqa	%xmm0, -92(%edx)
839L(aligned_16_76bytes):
840	movdqa	%xmm0, -76(%edx)
841L(aligned_16_60bytes):
842	movdqa	%xmm0, -60(%edx)
843L(aligned_16_44bytes):
844	movdqa	%xmm0, -44(%edx)
845L(aligned_16_28bytes):
846	movdqa	%xmm0, -28(%edx)
847L(aligned_16_12bytes):
848	movq	%xmm0, -12(%edx)
849	movl	%eax, -4(%edx)
850	SETRTNVAL
851	RETURN
852
/* N mod 16 == 13: trailing qword + dword + byte.  */
853	ALIGN (4)
854L(aligned_16_125bytes):
855	movdqa	%xmm0, -125(%edx)
856L(aligned_16_109bytes):
857	movdqa	%xmm0, -109(%edx)
858L(aligned_16_93bytes):
859	movdqa	%xmm0, -93(%edx)
860L(aligned_16_77bytes):
861	movdqa	%xmm0, -77(%edx)
862L(aligned_16_61bytes):
863	movdqa	%xmm0, -61(%edx)
864L(aligned_16_45bytes):
865	movdqa	%xmm0, -45(%edx)
866L(aligned_16_29bytes):
867	movdqa	%xmm0, -29(%edx)
868L(aligned_16_13bytes):
869	movq	%xmm0, -13(%edx)
870	movl	%eax, -5(%edx)
871	movb	%al, -1(%edx)
872	SETRTNVAL
873	RETURN
874
/* N mod 16 == 14: trailing qword + dword + word.  */
875	ALIGN (4)
876L(aligned_16_126bytes):
877	movdqa	%xmm0, -126(%edx)
878L(aligned_16_110bytes):
879	movdqa	%xmm0, -110(%edx)
880L(aligned_16_94bytes):
881	movdqa	%xmm0, -94(%edx)
882L(aligned_16_78bytes):
883	movdqa	%xmm0, -78(%edx)
884L(aligned_16_62bytes):
885	movdqa	%xmm0, -62(%edx)
886L(aligned_16_46bytes):
887	movdqa	%xmm0, -46(%edx)
888L(aligned_16_30bytes):
889	movdqa	%xmm0, -30(%edx)
890L(aligned_16_14bytes):
891	movq	%xmm0, -14(%edx)
892	movl	%eax, -6(%edx)
893	movw	%ax, -2(%edx)
894	SETRTNVAL
895	RETURN
896
/* N mod 16 == 15: trailing qword + dword + word + byte.  */
897	ALIGN (4)
898L(aligned_16_127bytes):
899	movdqa	%xmm0, -127(%edx)
900L(aligned_16_111bytes):
901	movdqa	%xmm0, -111(%edx)
902L(aligned_16_95bytes):
903	movdqa	%xmm0, -95(%edx)
904L(aligned_16_79bytes):
905	movdqa	%xmm0, -79(%edx)
906L(aligned_16_63bytes):
907	movdqa	%xmm0, -63(%edx)
908L(aligned_16_47bytes):
909	movdqa	%xmm0, -47(%edx)
910L(aligned_16_31bytes):
911	movdqa	%xmm0, -31(%edx)
912L(aligned_16_15bytes):
913	movq	%xmm0, -15(%edx)
914	movl	%eax, -7(%edx)
915	movw	%ax, -3(%edx)
916	movb	%al, -1(%edx)
917	SETRTNVAL
	/* Last return of the function: RETURN_END is used instead of RETURN
	   because no code follows that would need the CFI state re-pushed.  */
918	RETURN_END
919
920END (MEMSET)
921