sse2-memset-atom.S revision 81d6a18c69b71288c0ab0f65e0ee594f752febc8
1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include <private/bionic_asm.h>
32
33#include "cache.h"
34
/* Small helper macros.  Each #ifndef-guarded one is only a fallback: it is
   defined here only when nothing included earlier has defined it.  */

/* L(label): spell LABEL with a ".L" prefix (assembler-local label).  */
35#ifndef L
36# define L(label)	.L##label
37#endif
38
/* ALIGN(n): align the location counter to a 2^n-byte boundary.  */
39#ifndef ALIGN
40# define ALIGN(n)	.p2align n
41#endif
42
/* Thin wrappers that emit the assembler directive of the same name.  */
43#ifndef cfi_rel_offset
44# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
45#endif
46
47#ifndef cfi_restore
48# define cfi_restore(reg)		.cfi_restore reg
49#endif
50
51#ifndef cfi_adjust_cfa_offset
52# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
53#endif
54
/* CFI_PUSH(REG): unwind annotation for a 4-byte push of REG -- the CFA
   offset grows by 4 and REG is recorded as saved at relative offset 0.
   Emits no instruction.  */
55#define CFI_PUSH(REG)						\
56  cfi_adjust_cfa_offset (4);					\
57  cfi_rel_offset (REG, 0)
58
/* CFI_POP(REG): unwind annotation for the matching pop -- the CFA offset
   shrinks by 4 and REG is marked as restored.  Emits no instruction.  */
59#define CFI_POP(REG)						\
60  cfi_adjust_cfa_offset (-4);					\
61  cfi_restore (REG)
62
/* PUSH/POP: the real pushl/popl followed by the matching annotation.  */
63#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
64#define POP(REG)	popl REG; CFI_POP (REG)
/* Byte offsets of the stack arguments relative to %esp.  Everything is
   built on PARMS, which is defined further down (8 in SHARED/PIC builds,
   4 otherwise) and therefore describes the stack as it looks once
   ENTRANCE has run.  DST, CHR and LEN are consecutive 4-byte slots;
   CHK_DST_LEN is the slot after LEN.
   SETRTNVAL loads the DST argument into EAX just before returning.  */
66#define DST		PARMS
67#define CHR		DST+4
68#define LEN		CHR+4
69#define CHK_DST_LEN (LEN+4)
70#define SETRTNVAL	movl DST(%esp), %eax
71
/* Build-mode dependent pieces.

   SHARED / __PIC__ build:
     ENTRANCE    pushes EBX (it is used as a scratch/base register below).
     RETURN_END  pops EBX and returns.
     RETURN      same instructions as RETURN_END, followed by a CFI_PUSH
                 annotation (no instruction) so the unwind state says EBX
                 is on the stack again for whatever code comes next.
     PARMS       8: return address plus the saved EBX.
     JMPTBL      table entries are stored as (target - table base).  */
72#if (defined SHARED || defined __PIC__)
73# define ENTRANCE	PUSH (%ebx);
74# define RETURN_END	POP (%ebx); ret
75# define RETURN		RETURN_END; CFI_PUSH (%ebx)
76# define PARMS		8		/* Preserve EBX.  */
77# define JMPTBL(I, B)	I - B
78
79/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
80   jump table with relative offsets.   */
/* ECX is the table index.  ECX is also added to EDX before the jump, so
   the branch target sees EDX advanced by ECX bytes.  Clobbers EBX.  */
81# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
82    /* We first load PC into EBX.  */				\
83    call	__x86.get_pc_thunk.bx;				\
84    /* Get the address of the jump table.  */			\
85    add		$(TABLE - .), %ebx;				\
86    /* Get the entry and convert the relative offset to the	\
87       absolute address.  */					\
88    add		(%ebx,%ecx,4), %ebx;				\
89    add		%ecx, %edx;					\
90    /* We loaded the jump table and adjusted EDX. Go.  */	\
91    jmp		*%ebx
92
/* __x86.get_pc_thunk.bx: copies the word at the top of the stack -- the
   return address of the call that got here -- into EBX, then returns.
   Declared global but hidden, in its own linkonce section.  */
93	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
94	.globl	__x86.get_pc_thunk.bx
95	.hidden	__x86.get_pc_thunk.bx
96	ALIGN (4)
97	.type	__x86.get_pc_thunk.bx,@function
98__x86.get_pc_thunk.bx:
99	movl	(%esp), %ebx
100	ret
/* Non-PIC build: nothing is pushed on entry, PARMS covers only the return
   address, and table entries are absolute addresses.  */
101#else
102# define ENTRANCE
103# define RETURN_END	ret
104# define RETURN		RETURN_END
105# define PARMS		4
106# define JMPTBL(I, B)	I
107
108/* Branch to an entry in a jump table.  TABLE is a jump table with
109   absolute offsets.  */
/* ECX is the table index; ECX is added to EDX before the jump, exactly as
   in the PIC variant.  */
110# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
111    add		%ecx, %edx;					\
112    jmp		*TABLE(,%ecx,4)
113#endif
114
/* __memset_chk: length-checked front end for memset.

   Stack arguments at entry (above the return address): the three memset
   arguments followed by one more word, addressed here as CHK_DST_LEN,
   which is used as the capacity of the destination buffer.

   If LEN <= CHK_DST_LEN (unsigned) control tail-jumps to memset with the
   stack untouched, so memset sees the caller's frame exactly as if it had
   been called directly.  Otherwise control tail-jumps to
   __memset_chk_fail.

   Clobbers: ECX, flags.  */
ENTRY(__memset_chk)
  /* LEN and CHK_DST_LEN are built on PARMS, which describes the stack
     *after* ENTRANCE (return address plus, in SHARED/PIC builds, the saved
     EBX).  ENTRANCE is deliberately not executed here -- memset runs its
     own -- so only the return address sits below the arguments.  Rebase
     both offsets from PARMS to 4 so they are right in every build mode.  */
  movl (LEN - PARMS + 4)(%esp), %ecx
  /* AT&T operand order: flags are set from ECX - capacity, i.e.
     LEN - CHK_DST_LEN, so the unsigned "below or equal" test that follows
     reads as LEN <= capacity.  */
  cmpl (CHK_DST_LEN - PARMS + 4)(%esp), %ecx
  jbe memset			/* Fits: run the real memset.  */

  jmp __memset_chk_fail		/* LEN > capacity: failure handler.  */
END(__memset_chk)
122
/* memset entry point.

   Reads DST, CHR and LEN from the stack (offsets valid after ENTRANCE),
   stores the low byte of CHR into LEN bytes starting at DST, and returns
   DST in EAX (every exit path ends with SETRTNVAL).

   Register roles established here and kept for the rest of the routine:
     ECX = number of bytes still to store
     EAX = the fill byte replicated into all four bytes
     EDX = destination cursor  */
123	.section .text.sse2,"ax",@progbits
124	ALIGN (4)
125ENTRY (memset)
126	ENTRANCE
127
128	movl	LEN(%esp), %ecx
	/* Only the low byte of the CHR argument is used.  */
129	movzbl	CHR(%esp), %eax
	/* AX = byte:byte.  */
130	movb	%al, %ah
131	/* Fill the whole EAX with pattern.  */
	/* EDX is only a temporary here; it receives DST right after.  */
132	movl	%eax, %edx
133	shl	$16, %eax
134	or	%edx, %eax
135	movl	DST(%esp), %edx
	/* 32 bytes or more (unsigned compare) go to the SSE2 path.  */
136	cmp	$32, %ecx
137	jae	L(32bytesormore)
138
	/* 0..31 bytes: jump to write_<ECX>bytes through the table.  The macro
	   also adds ECX to EDX, so those tails address bytes at negative
	   offsets from the end of the buffer.  */
139L(write_less32bytes):
140	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
141
142
/* Jump table for lengths 0..31: entry N is the address of write_Nbytes
   (stored relative to the table base when JMPTBL is the PIC variant).
   Entries are 4 bytes each and are indexed by ECX in
   BRANCH_TO_JMPTBL_ENTRY, so they must stay in ascending order.  */
143	.pushsection .rodata.sse2,"a",@progbits
144	ALIGN (2)
145L(table_less_32bytes):
146	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
147	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
148	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
149	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
150	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
151	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
152	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
153	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
154	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
155	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
156	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
157	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
158	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
159	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
160	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
161	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
162	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
163	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
164	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
165	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
166	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
167	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
168	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
169	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
170	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
171	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
172	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
173	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
174	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
175	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
176	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
177	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
178	.popsection
179
/* Tails for lengths 0..31, entered only through table_less_32bytes.

   On entry EDX points one byte past the end of the region (the dispatch
   macro added ECX to it) and EAX holds the fill byte in all four bytes.
   write_Nbytes stores exactly the last N bytes: it falls through a ladder
   of 4-byte stores, each label covering 4 bytes fewer than the one above,
   and the bottom of each ladder writes the remaining N mod 4 bytes.
   The four ladders below are grouped by N mod 4 (0, 1, 2, 3).  Every
   ladder finishes by loading the return value and returning.  */

	/* N mod 4 == 0: dword stores only.  */
180	ALIGN (4)
181L(write_28bytes):
182	movl	%eax, -28(%edx)
183L(write_24bytes):
184	movl	%eax, -24(%edx)
185L(write_20bytes):
186	movl	%eax, -20(%edx)
187L(write_16bytes):
188	movl	%eax, -16(%edx)
189L(write_12bytes):
190	movl	%eax, -12(%edx)
191L(write_8bytes):
192	movl	%eax, -8(%edx)
193L(write_4bytes):
194	movl	%eax, -4(%edx)
195L(write_0bytes):
196	SETRTNVAL
197	RETURN
198
	/* N mod 4 == 1: dword stores, then one trailing byte.  */
199	ALIGN (4)
200L(write_29bytes):
201	movl	%eax, -29(%edx)
202L(write_25bytes):
203	movl	%eax, -25(%edx)
204L(write_21bytes):
205	movl	%eax, -21(%edx)
206L(write_17bytes):
207	movl	%eax, -17(%edx)
208L(write_13bytes):
209	movl	%eax, -13(%edx)
210L(write_9bytes):
211	movl	%eax, -9(%edx)
212L(write_5bytes):
213	movl	%eax, -5(%edx)
214L(write_1bytes):
215	movb	%al, -1(%edx)
216	SETRTNVAL
217	RETURN
218
	/* N mod 4 == 2: dword stores, then one trailing word.  */
219	ALIGN (4)
220L(write_30bytes):
221	movl	%eax, -30(%edx)
222L(write_26bytes):
223	movl	%eax, -26(%edx)
224L(write_22bytes):
225	movl	%eax, -22(%edx)
226L(write_18bytes):
227	movl	%eax, -18(%edx)
228L(write_14bytes):
229	movl	%eax, -14(%edx)
230L(write_10bytes):
231	movl	%eax, -10(%edx)
232L(write_6bytes):
233	movl	%eax, -6(%edx)
234L(write_2bytes):
235	movw	%ax, -2(%edx)
236	SETRTNVAL
237	RETURN
238
	/* N mod 4 == 3: dword stores, then a word and a byte.  */
239	ALIGN (4)
240L(write_31bytes):
241	movl	%eax, -31(%edx)
242L(write_27bytes):
243	movl	%eax, -27(%edx)
244L(write_23bytes):
245	movl	%eax, -23(%edx)
246L(write_19bytes):
247	movl	%eax, -19(%edx)
248L(write_15bytes):
249	movl	%eax, -15(%edx)
250L(write_11bytes):
251	movl	%eax, -11(%edx)
252L(write_7bytes):
253	movl	%eax, -7(%edx)
254L(write_3bytes):
255	movw	%ax, -3(%edx)
256	movb	%al, -1(%edx)
257	SETRTNVAL
258	RETURN
259
260	ALIGN (4)
261/* ECX >= 32; the alignment of EDX has not been checked yet.  */
262L(32bytesormore):
263	/* Fill xmm0 with the pattern.  */
264	movd	%eax, %xmm0
265	pshufd	$0, %xmm0, %xmm0
	/* Already on a 16-byte boundary?  Then skip the head fix-up.  */
266	testl	$0xf, %edx
267	jz	L(aligned_16)
268/* ECX >= 32 and EDX is not 16 byte aligned.  */
269L(not_aligned_16):
	/* One unaligned 16-byte store covers the head, then EDX is rounded up
	   to the next 16-byte boundary.  EAX temporarily holds
	   (old EDX - new EDX), a negative count of the bytes skipped, which
	   is added to ECX to shrink the remaining length.  Bytes between the
	   new EDX and old EDX + 16 get written a second time later with the
	   same pattern.  */
270	movdqu	%xmm0, (%edx)
271	movl	%edx, %eax
272	and	$-16, %edx
273	add	$16, %edx
274	sub	%edx, %eax
275	add	%eax, %ecx
	/* EAX was used as scratch: reload the 4-byte pattern.  */
276	movd	%xmm0, %eax
277
278	ALIGN (4)
	/* From here on EDX is 16-byte aligned and xmm0 holds the pattern in
	   all 16 bytes.  */
279L(aligned_16):
280	cmp	$128, %ecx
281	jae	L(128bytesormore)
282
	/* Fewer than 128 bytes left: finish through the aligned tail table.  */
283L(aligned_16_less128bytes):
284	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
285
/* ECX >= 128, EDX 16-byte aligned.  Choose a store strategy by comparing
   the length against two cache sizes: compile-time constants when
   SHARED_CACHE_SIZE / DATA_CACHE_SIZE are defined, otherwise the
   variables __x86_shared_cache_size / __x86_data_cache_size.  */
286	ALIGN (4)
287L(128bytesormore):
	/* EBX = shared cache size.  EBX is pushed first unless this is a
	   PIC build reading the variable; in PIC builds ENTRANCE has already
	   saved EBX, and the GOT-relative load just overwrites it.  */
288#ifdef SHARED_CACHE_SIZE
289	PUSH (%ebx)
290	mov	$SHARED_CACHE_SIZE, %ebx
291#else
292# if (defined SHARED || defined __PIC__)
293	call	__x86.get_pc_thunk.bx
294	add	$_GLOBAL_OFFSET_TABLE_, %ebx
295	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
296# else
297	PUSH (%ebx)
298	mov	__x86_shared_cache_size, %ebx
299# endif
300#endif
	/* Length >= shared cache size: take the path that ends with
	   non-temporal stores.  That path still needs EBX as loaded here.  */
301	cmp	%ebx, %ecx
302	jae	L(128bytesormore_nt_start)
303
304
	/* Compare against the data cache size.  Where EBX was pushed above it
	   is popped here.  RESTORE_EBX_STATE is defined in the same branch:
	   it is the CFI-only annotation used in front of
	   128bytesormore_nt_start, which is reached with EBX still pushed.  */
305#ifdef DATA_CACHE_SIZE
306	POP (%ebx)
307# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
308	cmp	$DATA_CACHE_SIZE, %ecx
309#else
310# if (defined SHARED || defined __PIC__)
311#  define RESTORE_EBX_STATE
312	call	__x86.get_pc_thunk.bx
313	add	$_GLOBAL_OFFSET_TABLE_, %ebx
314	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
315# else
316	POP (%ebx)
317#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
318	cmp	__x86_data_cache_size, %ecx
319# endif
320#endif
321
	/* Length >= data cache size: use the loop with prefetches.  */
322	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that the borrow from each "sub $128" below
	   means "fewer than 128 bytes remain after this chunk".  */
323	subl	$128, %ecx
	/* Loop unrolled twice, 128 bytes per half.  The movdqa stores and the
	   lea leave the flags alone, so jb/jae still test the preceding sub.  */
324L(128bytesormore_normal):
325	sub	$128, %ecx
326	movdqa	%xmm0, (%edx)
327	movdqa	%xmm0, 0x10(%edx)
328	movdqa	%xmm0, 0x20(%edx)
329	movdqa	%xmm0, 0x30(%edx)
330	movdqa	%xmm0, 0x40(%edx)
331	movdqa	%xmm0, 0x50(%edx)
332	movdqa	%xmm0, 0x60(%edx)
333	movdqa	%xmm0, 0x70(%edx)
334	lea	128(%edx), %edx
335	jb	L(128bytesless_normal)
336
337
338	sub	$128, %ecx
339	movdqa	%xmm0, (%edx)
340	movdqa	%xmm0, 0x10(%edx)
341	movdqa	%xmm0, 0x20(%edx)
342	movdqa	%xmm0, 0x30(%edx)
343	movdqa	%xmm0, 0x40(%edx)
344	movdqa	%xmm0, 0x50(%edx)
345	movdqa	%xmm0, 0x60(%edx)
346	movdqa	%xmm0, 0x70(%edx)
347	lea	128(%edx), %edx
348	jae	L(128bytesormore_normal)
349
	/* Undo the last subtraction: ECX = bytes left (0..127); finish
	   through the aligned tail table.  */
350L(128bytesless_normal):
351	add	$128, %ecx
352	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
353
/* Loop used when the length is at least the data cache size but below the
   shared cache size.  Each iteration prefetches 0x380 and 0x3c0 bytes
   ahead of EDX and stores 128 bytes with aligned 16-byte stores; it
   repeats while at least 128 bytes remain.  Entered with ECX >= 128 and
   EDX 16-byte aligned.  */
354	ALIGN (4)
355L(128bytes_L2_normal):
356	prefetcht0	0x380(%edx)
357	prefetcht0	0x3c0(%edx)
358	sub	$128, %ecx
359	movdqa	%xmm0, (%edx)
360	movaps	%xmm0, 0x10(%edx)
361	movaps	%xmm0, 0x20(%edx)
362	movaps	%xmm0, 0x30(%edx)
363	movaps	%xmm0, 0x40(%edx)
364	movaps	%xmm0, 0x50(%edx)
365	movaps	%xmm0, 0x60(%edx)
366	movaps	%xmm0, 0x70(%edx)
367	add	$128, %edx
368	cmp	$128, %ecx
369	jae	L(128bytes_L2_normal)
370
	/* ECX = bytes left (0..127); finish through the aligned tail table.  */
371L(128bytesless_L2_normal):
372	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
373
/* Path for lengths >= the shared cache size.  Entered by jump only, with
   EBX = shared cache size, ECX = total bytes left and EDX 16-byte aligned.
   The code above ends in an unconditional jump, so RESTORE_EBX_STATE is
   never executed: where it is non-empty it only re-declares, for the
   unwinder, that EBX is still saved on the stack at this point.  */
374	RESTORE_EBX_STATE
375L(128bytesormore_nt_start):
	/* Split the work: the first loop is driven by EBX (the cache size),
	   everything after it by ECX.  ECX = length - cache size, plus
	   (cache size mod 128), the part of the cache-size budget the
	   128-byte-granular first loop will not consume.  */
376	sub	%ebx, %ecx
377	mov	%ebx, %eax
378	and	$0x7f, %eax
379	add	%eax, %ecx
	/* EAX was used as scratch: reload the 4-byte pattern for the tails.  */
380	movd	%xmm0, %eax
381	ALIGN (4)
	/* Ordinary (cached) aligned stores with prefetch, 128 bytes per
	   iteration, while EBX still has at least 128 bytes of budget.  */
382L(128bytesormore_shared_cache_loop):
383	prefetcht0	0x3c0(%edx)
384	prefetcht0	0x380(%edx)
385	sub	$0x80, %ebx
386	movdqa	%xmm0, (%edx)
387	movdqa	%xmm0, 0x10(%edx)
388	movdqa	%xmm0, 0x20(%edx)
389	movdqa	%xmm0, 0x30(%edx)
390	movdqa	%xmm0, 0x40(%edx)
391	movdqa	%xmm0, 0x50(%edx)
392	movdqa	%xmm0, 0x60(%edx)
393	movdqa	%xmm0, 0x70(%edx)
394	add	$0x80, %edx
395	cmp	$0x80, %ebx
396	jae	L(128bytesormore_shared_cache_loop)
	/* Skip the non-temporal loop if less than one 128-byte chunk is left.  */
397	cmp	$0x80, %ecx
398	jb	L(shared_cache_loop_end)
399	ALIGN (4)
	/* Remaining full 128-byte chunks use non-temporal stores (movntdq).  */
400L(128bytesormore_nt):
401	sub	$0x80, %ecx
402	movntdq	%xmm0, (%edx)
403	movntdq	%xmm0, 0x10(%edx)
404	movntdq	%xmm0, 0x20(%edx)
405	movntdq	%xmm0, 0x30(%edx)
406	movntdq	%xmm0, 0x40(%edx)
407	movntdq	%xmm0, 0x50(%edx)
408	movntdq	%xmm0, 0x60(%edx)
409	movntdq	%xmm0, 0x70(%edx)
410	add	$0x80, %edx
411	cmp	$0x80, %ecx
412	jae	L(128bytesormore_nt)
	/* Store fence after the non-temporal stores.  */
413	sfence
414L(shared_cache_loop_end):
	/* Drop the EBX pushed at 128bytesormore.  The condition mirrors the
	   one that selects a POP on the data-cache-size comparison there.  */
415#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
416	POP (%ebx)
417#endif
	/* ECX = bytes left (0..127); finish through the aligned tail table.  */
418	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
419
420
/* Jump table for a remainder of 0..127 bytes at a 16-byte aligned cursor:
   entry N is the address of aligned_16_Nbytes (stored relative to the
   table base when JMPTBL is the PIC variant).  Entries are 4 bytes each
   and are indexed by ECX in BRANCH_TO_JMPTBL_ENTRY, so they must stay in
   ascending order.  */
421	.pushsection .rodata.sse2,"a",@progbits
422	ALIGN (2)
423L(table_16_128bytes):
424	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
425	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
426	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
427	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
428	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
429	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
430	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
431	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
432	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
433	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
434	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
435	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
436	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
437	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
438	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
439	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
440	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
441	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
442	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
443	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
444	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
445	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
446	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
447	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
448	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
449	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
450	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
451	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
452	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
453	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
454	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
455	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
456	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
457	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
458	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
459	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
460	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
461	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
462	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
463	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
464	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
465	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
466	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
467	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
468	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
469	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
470	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
471	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
472	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
473	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
474	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
475	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
476	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
477	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
478	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
479	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
480	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
481	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
482	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
483	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
484	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
485	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
486	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
487	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
488	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
489	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
490	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
491	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
492	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
493	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
494	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
495	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
496	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
497	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
498	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
499	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
500	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
501	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
502	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
503	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
504	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
505	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
506	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
507	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
508	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
509	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
510	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
511	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
512	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
513	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
514	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
515	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
516	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
517	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
518	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
519	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
520	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
521	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
522	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
523	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
524	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
525	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
526	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
527	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
528	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
529	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
530	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
531	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
532	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
533	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
534	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
535	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
536	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
537	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
538	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
539	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
540	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
541	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
542	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
543	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
544	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
545	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
546	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
547	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
548	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
549	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
550	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
551	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
552	.popsection
553
/* Tails for a remainder N of 0..127 bytes, entered only through
   table_16_128bytes.  This group handles N mod 16 = 0, 1, 2, 3.

   On entry EDX points one byte past the end of the region: the dispatch
   macro added N to a cursor that was 16-byte aligned, so EDX - N is that
   aligned cursor.  xmm0 holds the fill byte in all 16 bytes and EAX in
   all 4.  aligned_16_Nbytes falls through a ladder of 16-byte stores;
   entering at label N makes the first one hit EDX - N and each following
   one lands 16 bytes higher, so every movdqa address is 16-byte aligned.
   The bottom of each ladder writes the last N mod 16 bytes with narrower
   stores, then loads the return value and returns.  */

	/* N mod 16 == 0.  */
554	ALIGN (4)
555L(aligned_16_112bytes):
556	movdqa	%xmm0, -112(%edx)
557L(aligned_16_96bytes):
558	movdqa	%xmm0, -96(%edx)
559L(aligned_16_80bytes):
560	movdqa	%xmm0, -80(%edx)
561L(aligned_16_64bytes):
562	movdqa	%xmm0, -64(%edx)
563L(aligned_16_48bytes):
564	movdqa	%xmm0, -48(%edx)
565L(aligned_16_32bytes):
566	movdqa	%xmm0, -32(%edx)
567L(aligned_16_16bytes):
568	movdqa	%xmm0, -16(%edx)
569L(aligned_16_0bytes):
570	SETRTNVAL
571	RETURN
572
	/* N mod 16 == 1: trailing byte.  */
573	ALIGN (4)
574L(aligned_16_113bytes):
575	movdqa	%xmm0, -113(%edx)
576L(aligned_16_97bytes):
577	movdqa	%xmm0, -97(%edx)
578L(aligned_16_81bytes):
579	movdqa	%xmm0, -81(%edx)
580L(aligned_16_65bytes):
581	movdqa	%xmm0, -65(%edx)
582L(aligned_16_49bytes):
583	movdqa	%xmm0, -49(%edx)
584L(aligned_16_33bytes):
585	movdqa	%xmm0, -33(%edx)
586L(aligned_16_17bytes):
587	movdqa	%xmm0, -17(%edx)
588L(aligned_16_1bytes):
589	movb	%al, -1(%edx)
590	SETRTNVAL
591	RETURN
592
	/* N mod 16 == 2: trailing word.  */
593	ALIGN (4)
594L(aligned_16_114bytes):
595	movdqa	%xmm0, -114(%edx)
596L(aligned_16_98bytes):
597	movdqa	%xmm0, -98(%edx)
598L(aligned_16_82bytes):
599	movdqa	%xmm0, -82(%edx)
600L(aligned_16_66bytes):
601	movdqa	%xmm0, -66(%edx)
602L(aligned_16_50bytes):
603	movdqa	%xmm0, -50(%edx)
604L(aligned_16_34bytes):
605	movdqa	%xmm0, -34(%edx)
606L(aligned_16_18bytes):
607	movdqa	%xmm0, -18(%edx)
608L(aligned_16_2bytes):
609	movw	%ax, -2(%edx)
610	SETRTNVAL
611	RETURN
612
	/* N mod 16 == 3: trailing word + byte.  */
613	ALIGN (4)
614L(aligned_16_115bytes):
615	movdqa	%xmm0, -115(%edx)
616L(aligned_16_99bytes):
617	movdqa	%xmm0, -99(%edx)
618L(aligned_16_83bytes):
619	movdqa	%xmm0, -83(%edx)
620L(aligned_16_67bytes):
621	movdqa	%xmm0, -67(%edx)
622L(aligned_16_51bytes):
623	movdqa	%xmm0, -51(%edx)
624L(aligned_16_35bytes):
625	movdqa	%xmm0, -35(%edx)
626L(aligned_16_19bytes):
627	movdqa	%xmm0, -19(%edx)
628L(aligned_16_3bytes):
629	movw	%ax, -3(%edx)
630	movb	%al, -1(%edx)
631	SETRTNVAL
632	RETURN
633
/* Aligned tails for N mod 16 = 4, 5, 6, 7 (entered through
   table_16_128bytes with EDX one past the end of the region, xmm0/EAX
   holding the pattern).  Each ladder does 16-byte aligned stores from
   EDX - N upward, then finishes the last 4..7 bytes with a dword store
   plus, as needed, a word and/or a byte, and returns DST.  */

	/* N mod 16 == 4: trailing dword.  */
634	ALIGN (4)
635L(aligned_16_116bytes):
636	movdqa	%xmm0, -116(%edx)
637L(aligned_16_100bytes):
638	movdqa	%xmm0, -100(%edx)
639L(aligned_16_84bytes):
640	movdqa	%xmm0, -84(%edx)
641L(aligned_16_68bytes):
642	movdqa	%xmm0, -68(%edx)
643L(aligned_16_52bytes):
644	movdqa	%xmm0, -52(%edx)
645L(aligned_16_36bytes):
646	movdqa	%xmm0, -36(%edx)
647L(aligned_16_20bytes):
648	movdqa	%xmm0, -20(%edx)
649L(aligned_16_4bytes):
650	movl	%eax, -4(%edx)
651	SETRTNVAL
652	RETURN
653
	/* N mod 16 == 5: trailing dword + byte.  */
654	ALIGN (4)
655L(aligned_16_117bytes):
656	movdqa	%xmm0, -117(%edx)
657L(aligned_16_101bytes):
658	movdqa	%xmm0, -101(%edx)
659L(aligned_16_85bytes):
660	movdqa	%xmm0, -85(%edx)
661L(aligned_16_69bytes):
662	movdqa	%xmm0, -69(%edx)
663L(aligned_16_53bytes):
664	movdqa	%xmm0, -53(%edx)
665L(aligned_16_37bytes):
666	movdqa	%xmm0, -37(%edx)
667L(aligned_16_21bytes):
668	movdqa	%xmm0, -21(%edx)
669L(aligned_16_5bytes):
670	movl	%eax, -5(%edx)
671	movb	%al, -1(%edx)
672	SETRTNVAL
673	RETURN
674
	/* N mod 16 == 6: trailing dword + word.  */
675	ALIGN (4)
676L(aligned_16_118bytes):
677	movdqa	%xmm0, -118(%edx)
678L(aligned_16_102bytes):
679	movdqa	%xmm0, -102(%edx)
680L(aligned_16_86bytes):
681	movdqa	%xmm0, -86(%edx)
682L(aligned_16_70bytes):
683	movdqa	%xmm0, -70(%edx)
684L(aligned_16_54bytes):
685	movdqa	%xmm0, -54(%edx)
686L(aligned_16_38bytes):
687	movdqa	%xmm0, -38(%edx)
688L(aligned_16_22bytes):
689	movdqa	%xmm0, -22(%edx)
690L(aligned_16_6bytes):
691	movl	%eax, -6(%edx)
692	movw	%ax, -2(%edx)
693	SETRTNVAL
694	RETURN
695
	/* N mod 16 == 7: trailing dword + word + byte.  */
696	ALIGN (4)
697L(aligned_16_119bytes):
698	movdqa	%xmm0, -119(%edx)
699L(aligned_16_103bytes):
700	movdqa	%xmm0, -103(%edx)
701L(aligned_16_87bytes):
702	movdqa	%xmm0, -87(%edx)
703L(aligned_16_71bytes):
704	movdqa	%xmm0, -71(%edx)
705L(aligned_16_55bytes):
706	movdqa	%xmm0, -55(%edx)
707L(aligned_16_39bytes):
708	movdqa	%xmm0, -39(%edx)
709L(aligned_16_23bytes):
710	movdqa	%xmm0, -23(%edx)
711L(aligned_16_7bytes):
712	movl	%eax, -7(%edx)
713	movw	%ax, -3(%edx)
714	movb	%al, -1(%edx)
715	SETRTNVAL
716	RETURN
717
/* Aligned tails for N mod 16 = 8, 9, 10, 11 (entered through
   table_16_128bytes with EDX one past the end of the region, xmm0/EAX
   holding the pattern).  Each ladder does 16-byte aligned stores from
   EDX - N upward, then finishes the last 8..11 bytes with an 8-byte store
   of the low half of xmm0 (movq) plus, as needed, a word and/or a byte,
   and returns DST.  */

	/* N mod 16 == 8: trailing qword.  */
718	ALIGN (4)
719L(aligned_16_120bytes):
720	movdqa	%xmm0, -120(%edx)
721L(aligned_16_104bytes):
722	movdqa	%xmm0, -104(%edx)
723L(aligned_16_88bytes):
724	movdqa	%xmm0, -88(%edx)
725L(aligned_16_72bytes):
726	movdqa	%xmm0, -72(%edx)
727L(aligned_16_56bytes):
728	movdqa	%xmm0, -56(%edx)
729L(aligned_16_40bytes):
730	movdqa	%xmm0, -40(%edx)
731L(aligned_16_24bytes):
732	movdqa	%xmm0, -24(%edx)
733L(aligned_16_8bytes):
734	movq	%xmm0, -8(%edx)
735	SETRTNVAL
736	RETURN
737
	/* N mod 16 == 9: trailing qword + byte.  */
738	ALIGN (4)
739L(aligned_16_121bytes):
740	movdqa	%xmm0, -121(%edx)
741L(aligned_16_105bytes):
742	movdqa	%xmm0, -105(%edx)
743L(aligned_16_89bytes):
744	movdqa	%xmm0, -89(%edx)
745L(aligned_16_73bytes):
746	movdqa	%xmm0, -73(%edx)
747L(aligned_16_57bytes):
748	movdqa	%xmm0, -57(%edx)
749L(aligned_16_41bytes):
750	movdqa	%xmm0, -41(%edx)
751L(aligned_16_25bytes):
752	movdqa	%xmm0, -25(%edx)
753L(aligned_16_9bytes):
754	movq	%xmm0, -9(%edx)
755	movb	%al, -1(%edx)
756	SETRTNVAL
757	RETURN
758
	/* N mod 16 == 10: trailing qword + word.  */
759	ALIGN (4)
760L(aligned_16_122bytes):
761	movdqa	%xmm0, -122(%edx)
762L(aligned_16_106bytes):
763	movdqa	%xmm0, -106(%edx)
764L(aligned_16_90bytes):
765	movdqa	%xmm0, -90(%edx)
766L(aligned_16_74bytes):
767	movdqa	%xmm0, -74(%edx)
768L(aligned_16_58bytes):
769	movdqa	%xmm0, -58(%edx)
770L(aligned_16_42bytes):
771	movdqa	%xmm0, -42(%edx)
772L(aligned_16_26bytes):
773	movdqa	%xmm0, -26(%edx)
774L(aligned_16_10bytes):
775	movq	%xmm0, -10(%edx)
776	movw	%ax, -2(%edx)
777	SETRTNVAL
778	RETURN
779
	/* N mod 16 == 11: trailing qword + word + byte.  */
780	ALIGN (4)
781L(aligned_16_123bytes):
782	movdqa	%xmm0, -123(%edx)
783L(aligned_16_107bytes):
784	movdqa	%xmm0, -107(%edx)
785L(aligned_16_91bytes):
786	movdqa	%xmm0, -91(%edx)
787L(aligned_16_75bytes):
788	movdqa	%xmm0, -75(%edx)
789L(aligned_16_59bytes):
790	movdqa	%xmm0, -59(%edx)
791L(aligned_16_43bytes):
792	movdqa	%xmm0, -43(%edx)
793L(aligned_16_27bytes):
794	movdqa	%xmm0, -27(%edx)
795L(aligned_16_11bytes):
796	movq	%xmm0, -11(%edx)
797	movw	%ax, -3(%edx)
798	movb	%al, -1(%edx)
799	SETRTNVAL
800	RETURN
801
/* Aligned tails for N mod 16 = 12, 13, 14, 15 (entered through
   table_16_128bytes with EDX one past the end of the region, xmm0/EAX
   holding the pattern).  Each ladder does 16-byte aligned stores from
   EDX - N upward, then finishes the last 12..15 bytes with a qword store
   (movq), a dword store and, as needed, a word and/or a byte, and returns
   DST.  The last ladder uses RETURN_END instead of RETURN because no code
   follows it before END (memset).  */

	/* N mod 16 == 12: trailing qword + dword.  */
802	ALIGN (4)
803L(aligned_16_124bytes):
804	movdqa	%xmm0, -124(%edx)
805L(aligned_16_108bytes):
806	movdqa	%xmm0, -108(%edx)
807L(aligned_16_92bytes):
808	movdqa	%xmm0, -92(%edx)
809L(aligned_16_76bytes):
810	movdqa	%xmm0, -76(%edx)
811L(aligned_16_60bytes):
812	movdqa	%xmm0, -60(%edx)
813L(aligned_16_44bytes):
814	movdqa	%xmm0, -44(%edx)
815L(aligned_16_28bytes):
816	movdqa	%xmm0, -28(%edx)
817L(aligned_16_12bytes):
818	movq	%xmm0, -12(%edx)
819	movl	%eax, -4(%edx)
820	SETRTNVAL
821	RETURN
822
	/* N mod 16 == 13: trailing qword + dword + byte.  */
823	ALIGN (4)
824L(aligned_16_125bytes):
825	movdqa	%xmm0, -125(%edx)
826L(aligned_16_109bytes):
827	movdqa	%xmm0, -109(%edx)
828L(aligned_16_93bytes):
829	movdqa	%xmm0, -93(%edx)
830L(aligned_16_77bytes):
831	movdqa	%xmm0, -77(%edx)
832L(aligned_16_61bytes):
833	movdqa	%xmm0, -61(%edx)
834L(aligned_16_45bytes):
835	movdqa	%xmm0, -45(%edx)
836L(aligned_16_29bytes):
837	movdqa	%xmm0, -29(%edx)
838L(aligned_16_13bytes):
839	movq	%xmm0, -13(%edx)
840	movl	%eax, -5(%edx)
841	movb	%al, -1(%edx)
842	SETRTNVAL
843	RETURN
844
	/* N mod 16 == 14: trailing qword + dword + word.  */
845	ALIGN (4)
846L(aligned_16_126bytes):
847	movdqa	%xmm0, -126(%edx)
848L(aligned_16_110bytes):
849	movdqa	%xmm0, -110(%edx)
850L(aligned_16_94bytes):
851	movdqa	%xmm0, -94(%edx)
852L(aligned_16_78bytes):
853	movdqa	%xmm0, -78(%edx)
854L(aligned_16_62bytes):
855	movdqa	%xmm0, -62(%edx)
856L(aligned_16_46bytes):
857	movdqa	%xmm0, -46(%edx)
858L(aligned_16_30bytes):
859	movdqa	%xmm0, -30(%edx)
860L(aligned_16_14bytes):
861	movq	%xmm0, -14(%edx)
862	movl	%eax, -6(%edx)
863	movw	%ax, -2(%edx)
864	SETRTNVAL
865	RETURN
866
	/* N mod 16 == 15: trailing qword + dword + word + byte.  */
867	ALIGN (4)
868L(aligned_16_127bytes):
869	movdqa	%xmm0, -127(%edx)
870L(aligned_16_111bytes):
871	movdqa	%xmm0, -111(%edx)
872L(aligned_16_95bytes):
873	movdqa	%xmm0, -95(%edx)
874L(aligned_16_79bytes):
875	movdqa	%xmm0, -79(%edx)
876L(aligned_16_63bytes):
877	movdqa	%xmm0, -63(%edx)
878L(aligned_16_47bytes):
879	movdqa	%xmm0, -47(%edx)
880L(aligned_16_31bytes):
881	movdqa	%xmm0, -31(%edx)
882L(aligned_16_15bytes):
883	movq	%xmm0, -15(%edx)
884	movl	%eax, -7(%edx)
885	movw	%ax, -3(%edx)
886	movb	%al, -1(%edx)
887	SETRTNVAL
888	RETURN_END
889
890END (memset)
891