sse2-memset16-atom.S revision bb12ac9b85adae96cbd38b2220c5da9a9d80bc54
1/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16/*
17 * Contributed by: Intel Corporation
18 */
19
/* L(label): token-paste ".L" in front of the label name.  In GAS the ".L"
   prefix marks a non-exported local label.  Guarded by #ifndef, so a
   definition made before this point takes precedence.  */
20#ifndef L
21# define L(label)	.L##label
22#endif
23
/* ALIGN(n): expands to ".p2align n" (the argument is a power-of-two
   exponent, not a byte count).  Also overridable.  */
24#ifndef ALIGN
25# define ALIGN(n)	.p2align n
26#endif
27
/* Thin wrappers around the GAS .cfi_* unwind directives of the same name.
   Each one is defined only if not already defined, so a build may supply
   its own versions before this point.  */
28#ifndef cfi_startproc
29# define cfi_startproc			.cfi_startproc
30#endif
31
32#ifndef cfi_endproc
33# define cfi_endproc			.cfi_endproc
34#endif
35
36#ifndef cfi_rel_offset
37# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
38#endif
39
40#ifndef cfi_restore
41# define cfi_restore(reg)		.cfi_restore reg
42#endif
43
44#ifndef cfi_adjust_cfa_offset
45# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
46#endif
47
/* ENTRY(name): declare NAME as a global symbol of type @function, align
   it with .p2align 4, place the label and open a CFI procedure.  */
48#ifndef ENTRY
49# define ENTRY(name)			\
50	.type name,  @function; 	\
51	.globl name;			\
52	.p2align 4;			\
53name:					\
54	cfi_startproc
55#endif
56
/* END(name): close the CFI procedure opened by ENTRY and record the
   symbol size as the distance from NAME to the current location.  */
57#ifndef END
58# define END(name)			\
59	cfi_endproc;			\
60	.size name, .-name
61#endif
62
/* CFI_PUSH(REG): unwind annotations for a 4-byte push of REG -- the CFA
   offset grows by 4 and REG is recorded as saved at offset 0.  Emits no
   machine instruction.  */
63#define CFI_PUSH(REG)						\
64  cfi_adjust_cfa_offset (4);					\
65  cfi_rel_offset (REG, 0)
66
/* CFI_POP(REG): the inverse annotations -- the CFA offset shrinks by 4 and
   REG is marked as restored.  Emits no machine instruction.  */
67#define CFI_POP(REG)						\
68  cfi_adjust_cfa_offset (-4);					\
69  cfi_restore (REG)
70
/* PUSH/POP: the real pushl/popl immediately followed by the matching
   unwind annotations.  */
71#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
72#define POP(REG)	popl REG; CFI_POP (REG)
73
/* Stack offsets of the arguments, used as DEST(%esp) and so on.  PARMS is
   the offset of the first argument and is defined further down (4, or 8
   when SHARED pushes %ebx on entry).  With USE_AS_BZERO16 there is no
   fill-value argument, so LEN directly follows DEST; otherwise the order
   is DEST, CHR, LEN in consecutive 4-byte slots.  */
74#ifdef USE_AS_BZERO16
75# define DEST		PARMS
76# define LEN		DEST+4
77#else
78# define DEST		PARMS
79# define CHR		DEST+4
80# define LEN		CHR+4
81#endif
82
/* SETRTNVAL is placed in front of every RETURN.  The "#if 1" selects the
   empty definition, so no return value is loaded; the disabled branch
   would load DEST into %eax.  */
83#if 1
84# define SETRTNVAL
85#else
86# define SETRTNVAL	movl DEST(%esp), %eax
87#endif
88
/* Configuration that depends on SHARED.
   With SHARED: %ebx is pushed on entry and popped before every return,
   the first argument is at offset 8, and jump tables hold offsets relative
   to the table base.  Without SHARED: nothing is pushed on entry, the
   first argument is at offset 4, and jump tables hold absolute
   addresses.  */
89#ifdef SHARED
/* ENTRANCE saves %ebx; RETURN_END restores it and returns.  RETURN also
   re-issues CFI_PUSH (%ebx) after the ret, so the unwind annotations for
   whatever code follows a RETURN again describe %ebx as pushed.  */
90# define ENTRANCE	PUSH (%ebx);
91# define RETURN_END	POP (%ebx); ret
92# define RETURN		RETURN_END; CFI_PUSH (%ebx)
93# define PARMS		8		/* Preserve EBX.  */
/* Table entry for target I in the table whose base label is B.  */
94# define JMPTBL(I, B)	I - B
95
96/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
97   jump table with relative offsets.  The index is taken from ECX and
   each entry is 4 bytes wide.  */
98# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
99    /* We first load PC into EBX.  */				\
100    call	__i686.get_pc_thunk.bx;				\
101    /* Get the address of the jump table.  */			\
102    add		$(TABLE - .), %ebx;				\
103    /* Get the entry and convert the relative offset to the	\
104       absolute address.  */					\
105    add		(%ebx,%ecx,4), %ebx;				\
106    /* We loaded the jump table and adjusted EDX. Go.  */	\
107    jmp		*%ebx
108
/* PC thunk: copies the word at the top of the stack -- the return address
   pushed by the calling "call" -- into %ebx and returns.  Emitted in its
   own link-once section as a hidden global symbol.  */
109	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
110	.globl	__i686.get_pc_thunk.bx
111	.hidden	__i686.get_pc_thunk.bx
112	ALIGN (4)
113	.type	__i686.get_pc_thunk.bx,@function
114__i686.get_pc_thunk.bx:
115	movl	(%esp), %ebx
116	ret
117#else
118# define ENTRANCE
119# define RETURN_END	ret
120# define RETURN		RETURN_END
121# define PARMS		4
/* Table entry is simply the absolute address of target I; B is unused.  */
122# define JMPTBL(I, B)	I
123
124/* Branch to an entry in a jump table.  TABLE is a jump table with
125   absolute offsets.  The index is taken from ECX and each entry is
   4 bytes wide.  */
126# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
127    jmp		*TABLE(,%ecx,4)
128#endif
129
/* sse2_memset16_atom: fill a buffer with a repeated 16-bit value.
   Stack arguments are read at DEST(%esp), CHR(%esp) (absent when
   USE_AS_BZERO16 is defined, in which case the fill value is zero) and
   LEN(%esp).  No return value is set (SETRTNVAL expands to nothing).
   Register roles in the code below:
     %edx   destination pointer; turned into an end pointer just before
            every jump-table dispatch
     %ecx   remaining count -- 16-bit elements on the short path, bytes
            from L(32wordsormore) onwards
     %eax   fill value replicated into both 16-bit halves
     %xmm0  fill pattern replicated across 16 bytes (long path only)
   NOTE(review): the C prototype is not visible in this file -- confirm the
   argument types and the unit of LEN with the callers.  */
130	.section .text.sse2,"ax",@progbits
131	ALIGN (4)
132ENTRY (sse2_memset16_atom)
133	ENTRANCE
134
135	movl	LEN(%esp), %ecx
	/* With USE_AS_ANDROID the incoming LEN is halved before use; the code
	   after this point treats %ecx as a count of 16-bit elements.  */
136#ifdef USE_AS_ANDROID
137	shr	$1, %ecx
138#endif
139#ifdef USE_AS_BZERO16
140	xor	%eax, %eax
141#else
	/* Zero-extend the 16-bit fill value and copy it into the upper half,
	   so %eax holds the value twice.  */
142	movzwl	CHR(%esp), %eax
143	mov	%eax, %edx
144	shl	$16, %eax
145	or	%edx, %eax
146#endif
147	movl	DEST(%esp), %edx
	/* 32 elements or more take the SSE2 path.  */
148	cmp	$32, %ecx
149	jae	L(32wordsormore)
150
	/* Fewer than 32 elements: point %edx just past the end of the buffer
	   (DEST + 2 * count) and dispatch on the element count.  */
151L(write_less32words):
152	lea	(%edx, %ecx, 2), %edx
153	BRANCH_TO_JMPTBL_ENTRY (L(table_less32words))
154
155
/* Jump table for the short path: entry N (0..31) leads to
   L(write_Nwords).  Stored in .rodata.sse2 with 4-byte alignment; the
   previous section is restored by .popsection.  */
156	.pushsection .rodata.sse2,"a",@progbits
157	ALIGN (2)
158L(table_less32words):
159	.int	JMPTBL (L(write_0words), L(table_less32words))
160	.int	JMPTBL (L(write_1words), L(table_less32words))
161	.int	JMPTBL (L(write_2words), L(table_less32words))
162	.int	JMPTBL (L(write_3words), L(table_less32words))
163	.int	JMPTBL (L(write_4words), L(table_less32words))
164	.int	JMPTBL (L(write_5words), L(table_less32words))
165	.int	JMPTBL (L(write_6words), L(table_less32words))
166	.int	JMPTBL (L(write_7words), L(table_less32words))
167	.int	JMPTBL (L(write_8words), L(table_less32words))
168	.int	JMPTBL (L(write_9words), L(table_less32words))
169	.int	JMPTBL (L(write_10words), L(table_less32words))
170	.int	JMPTBL (L(write_11words), L(table_less32words))
171	.int	JMPTBL (L(write_12words), L(table_less32words))
172	.int	JMPTBL (L(write_13words), L(table_less32words))
173	.int	JMPTBL (L(write_14words), L(table_less32words))
174	.int	JMPTBL (L(write_15words), L(table_less32words))
175	.int	JMPTBL (L(write_16words), L(table_less32words))
176	.int	JMPTBL (L(write_17words), L(table_less32words))
177	.int	JMPTBL (L(write_18words), L(table_less32words))
178	.int	JMPTBL (L(write_19words), L(table_less32words))
179	.int	JMPTBL (L(write_20words), L(table_less32words))
180	.int	JMPTBL (L(write_21words), L(table_less32words))
181	.int	JMPTBL (L(write_22words), L(table_less32words))
182	.int	JMPTBL (L(write_23words), L(table_less32words))
183	.int	JMPTBL (L(write_24words), L(table_less32words))
184	.int	JMPTBL (L(write_25words), L(table_less32words))
185	.int	JMPTBL (L(write_26words), L(table_less32words))
186	.int	JMPTBL (L(write_27words), L(table_less32words))
187	.int	JMPTBL (L(write_28words), L(table_less32words))
188	.int	JMPTBL (L(write_29words), L(table_less32words))
189	.int	JMPTBL (L(write_30words), L(table_less32words))
190	.int	JMPTBL (L(write_31words), L(table_less32words))
191	.popsection
192
/* Short-path store ladders.  %edx points just past the end of the buffer
   and %eax holds the fill value in both halves.  Entering at
   L(write_Nwords) and falling through to the RETURN stores exactly N
   16-bit elements, i.e. the 2*N bytes ending at %edx.  There is one
   ladder per value of N mod 4; each rung stores 8 bytes.  */

/* N = 0, 4, 8, ..., 28: only 4-byte stores.  */
193	ALIGN (4)
194L(write_28words):
195	movl	%eax, -56(%edx)
196	movl	%eax, -52(%edx)
197L(write_24words):
198	movl	%eax, -48(%edx)
199	movl	%eax, -44(%edx)
200L(write_20words):
201	movl	%eax, -40(%edx)
202	movl	%eax, -36(%edx)
203L(write_16words):
204	movl	%eax, -32(%edx)
205	movl	%eax, -28(%edx)
206L(write_12words):
207	movl	%eax, -24(%edx)
208	movl	%eax, -20(%edx)
209L(write_8words):
210	movl	%eax, -16(%edx)
211	movl	%eax, -12(%edx)
212L(write_4words):
213	movl	%eax, -8(%edx)
214	movl	%eax, -4(%edx)
215L(write_0words):
216	SETRTNVAL
217	RETURN
218
/* N = 1, 5, 9, ..., 29: 4-byte stores followed by one 2-byte store.  */
219	ALIGN (4)
220L(write_29words):
221	movl	%eax, -58(%edx)
222	movl	%eax, -54(%edx)
223L(write_25words):
224	movl	%eax, -50(%edx)
225	movl	%eax, -46(%edx)
226L(write_21words):
227	movl	%eax, -42(%edx)
228	movl	%eax, -38(%edx)
229L(write_17words):
230	movl	%eax, -34(%edx)
231	movl	%eax, -30(%edx)
232L(write_13words):
233	movl	%eax, -26(%edx)
234	movl	%eax, -22(%edx)
235L(write_9words):
236	movl	%eax, -18(%edx)
237	movl	%eax, -14(%edx)
238L(write_5words):
239	movl	%eax, -10(%edx)
240	movl	%eax, -6(%edx)
241L(write_1words):
242	mov	%ax, -2(%edx)
243	SETRTNVAL
244	RETURN
245
/* N = 2, 6, 10, ..., 30: 4-byte stores, the last one covering the final
   two elements.  */
246	ALIGN (4)
247L(write_30words):
248	movl	%eax, -60(%edx)
249	movl	%eax, -56(%edx)
250L(write_26words):
251	movl	%eax, -52(%edx)
252	movl	%eax, -48(%edx)
253L(write_22words):
254	movl	%eax, -44(%edx)
255	movl	%eax, -40(%edx)
256L(write_18words):
257	movl	%eax, -36(%edx)
258	movl	%eax, -32(%edx)
259L(write_14words):
260	movl	%eax, -28(%edx)
261	movl	%eax, -24(%edx)
262L(write_10words):
263	movl	%eax, -20(%edx)
264	movl	%eax, -16(%edx)
265L(write_6words):
266	movl	%eax, -12(%edx)
267	movl	%eax, -8(%edx)
268L(write_2words):
269	movl	%eax, -4(%edx)
270	SETRTNVAL
271	RETURN
272
/* N = 3, 7, 11, ..., 31: 4-byte stores, then one 4-byte and one 2-byte
   store for the final three elements.  */
273	ALIGN (4)
274L(write_31words):
275	movl	%eax, -62(%edx)
276	movl	%eax, -58(%edx)
277L(write_27words):
278	movl	%eax, -54(%edx)
279	movl	%eax, -50(%edx)
280L(write_23words):
281	movl	%eax, -46(%edx)
282	movl	%eax, -42(%edx)
283L(write_19words):
284	movl	%eax, -38(%edx)
285	movl	%eax, -34(%edx)
286L(write_15words):
287	movl	%eax, -30(%edx)
288	movl	%eax, -26(%edx)
289L(write_11words):
290	movl	%eax, -22(%edx)
291	movl	%eax, -18(%edx)
292L(write_7words):
293	movl	%eax, -14(%edx)
294	movl	%eax, -10(%edx)
295L(write_3words):
296	movl	%eax, -6(%edx)
297	movw	%ax, -2(%edx)
298	SETRTNVAL
299	RETURN
300
/* Long path (at least 32 elements).  Converts the count to bytes, deals
   with an odd destination address, broadcasts the pattern into %xmm0 and
   brings %edx up to a 16-byte boundary.  */
301	ALIGN (4)
302
303L(32wordsormore):
	/* From here on %ecx is a byte count.  */
304	shl	$1, %ecx
305	test	$0x01, %edx
306	jz	L(aligned2bytes)
	/* Odd destination address: store the first four and the last four
	   bytes of the buffer with the unrotated pattern, then drop one byte
	   at each end (%edx += 1, %ecx -= 2).  The pattern is rotated by 8
	   bits so that the byte order stays in phase with the now even
	   address.  */
307	mov	%eax, (%edx)
308	mov	%eax, -4(%edx, %ecx)
309	sub	$2, %ecx
310	add	$1, %edx
311	rol	$8, %eax
312L(aligned2bytes):
313#ifdef USE_AS_BZERO16
314	pxor	%xmm0, %xmm0
315#else
	/* Replicate the 32-bit pattern in %eax into all four dwords.  */
316	movd	%eax, %xmm0
317	pshufd	$0, %xmm0, %xmm0
318#endif
319	testl	$0xf, %edx
320	jz	L(aligned_16)
321/* ECX > 32 and EDX is not 16 byte aligned.  */
322L(not_aligned_16):
	/* Cover the unaligned head with one unaligned 16-byte store, round
	   %edx up to the next multiple of 16, and shrink %ecx by the number
	   of bytes skipped (%eax temporarily holds old - new pointer, which
	   is negative).  %eax is then reloaded with the pattern from %xmm0
	   because the tail code stores from %eax/%ax.  */
323	movdqu	%xmm0, (%edx)
324	movl	%edx, %eax
325	and	$-16, %edx
326	add	$16, %edx
327	sub	%edx, %eax
328	add	%eax, %ecx
329	movd	%xmm0, %eax
330
/* %edx is 16-byte aligned here and %ecx is the remaining byte count.
   128 bytes or more go to the block loops.  Otherwise %edx becomes the
   end pointer and the tail is dispatched on bytes / 2.  */
331	ALIGN (4)
332L(aligned_16):
333	cmp	$128, %ecx
334	jae	L(128bytesormore)
335
336L(aligned_16_less128bytes):
337	add	%ecx, %edx
338	shr	$1, %ecx
339	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
340
/* At least 128 bytes remain and %edx is 16-byte aligned.  Selects one of
   three 128-byte block loops by comparing %ecx with the shared-cache size
   and then with the data-cache size.  Each size is either a compile-time
   constant (SHARED_CACHE_SIZE / DATA_CACHE_SIZE) or read from
   __x86_shared_cache_size / __x86_data_cache_size.
   NOTE(review): in a SHARED build the PUSH/POP of %ebx below balances only
   when SHARED_CACHE_SIZE and DATA_CACHE_SIZE are both defined or both
   undefined.  With SHARED and only SHARED_CACHE_SIZE defined, the PUSH
   has no matching POP; with SHARED and only DATA_CACHE_SIZE defined, the
   POP has no matching PUSH.  Confirm the build never uses those
   combinations.  */
341	ALIGN (4)
342L(128bytesormore):
343#ifdef SHARED_CACHE_SIZE
344	PUSH (%ebx)
345	mov	$SHARED_CACHE_SIZE, %ebx
346#else
347# ifdef SHARED
	/* %ebx was already saved by ENTRANCE, so it is reused as the GOT
	   pointer without another push.  */
348	call	__i686.get_pc_thunk.bx
349	add	$_GLOBAL_OFFSET_TABLE_, %ebx
350	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
351# else
352	PUSH (%ebx)
353	mov	__x86_shared_cache_size, %ebx
354# endif
355#endif
	/* Byte count >= shared-cache size: take the non-temporal path, which
	   still needs the size in %ebx.  */
356	cmp	%ebx, %ecx
357	jae	L(128bytesormore_nt_start)
358
359
	/* RESTORE_EBX_STATE is expanded in front of
	   L(128bytesormore_nt_start).  It re-declares %ebx as pushed in the
	   unwind info in the variants that POP it here, and is empty in the
	   variant that does not.  */
360#ifdef DATA_CACHE_SIZE
361	POP (%ebx)
362# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
363	cmp	$DATA_CACHE_SIZE, %ecx
364#else
365# ifdef SHARED
366#  define RESTORE_EBX_STATE
367	call	__i686.get_pc_thunk.bx
368	add	$_GLOBAL_OFFSET_TABLE_, %ebx
369	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
370# else
371	POP (%ebx)
372#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
373	cmp	__x86_data_cache_size, %ecx
374# endif
375#endif
376
	/* Byte count >= data-cache size: use the loop with prefetches.  */
377	jae	L(128bytes_L2_normal)
	/* Plain loop, unrolled to two 128-byte groups per iteration.  %ecx is
	   kept 128 below the true remaining byte count, so the "sub" that
	   precedes each group borrows exactly when fewer than 128 bytes will
	   remain after that group.  The "lea" that advances %edx leaves the
	   flags untouched, so jb/jae still test that "sub".  */
378	subl	$128, %ecx
379L(128bytesormore_normal):
380	sub	$128, %ecx
381	movdqa	%xmm0, (%edx)
382	movdqa	%xmm0, 0x10(%edx)
383	movdqa	%xmm0, 0x20(%edx)
384	movdqa	%xmm0, 0x30(%edx)
385	movdqa	%xmm0, 0x40(%edx)
386	movdqa	%xmm0, 0x50(%edx)
387	movdqa	%xmm0, 0x60(%edx)
388	movdqa	%xmm0, 0x70(%edx)
389	lea	128(%edx), %edx
390	jb	L(128bytesless_normal)
391
392
393	sub	$128, %ecx
394	movdqa	%xmm0, (%edx)
395	movdqa	%xmm0, 0x10(%edx)
396	movdqa	%xmm0, 0x20(%edx)
397	movdqa	%xmm0, 0x30(%edx)
398	movdqa	%xmm0, 0x40(%edx)
399	movdqa	%xmm0, 0x50(%edx)
400	movdqa	%xmm0, 0x60(%edx)
401	movdqa	%xmm0, 0x70(%edx)
402	lea	128(%edx), %edx
403	jae	L(128bytesormore_normal)
404
	/* Undo the 128-byte bias to get the true remainder (0..127 bytes),
	   form the end pointer and dispatch on bytes / 2.  */
405L(128bytesless_normal):
406	lea	128(%ecx), %ecx
407	add	%ecx, %edx
408	shr	$1, %ecx
409	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
410
/* Block loop used when the byte count is at least the data-cache size but
   below the shared-cache size.  Each iteration prefetches the lines at
   +0x380 and +0x3c0 from the current position and stores 128 bytes with
   aligned 16-byte stores.  Here %ecx is the true remaining byte count
   (no bias), and the loop repeats while at least 128 bytes remain.  */
411	ALIGN (4)
412L(128bytes_L2_normal):
413	prefetcht0	0x380(%edx)
414	prefetcht0	0x3c0(%edx)
415	sub	$128, %ecx
416	movdqa	%xmm0, (%edx)
417	movaps	%xmm0, 0x10(%edx)
418	movaps	%xmm0, 0x20(%edx)
419	movaps	%xmm0, 0x30(%edx)
420	movaps	%xmm0, 0x40(%edx)
421	movaps	%xmm0, 0x50(%edx)
422	movaps	%xmm0, 0x60(%edx)
423	movaps	%xmm0, 0x70(%edx)
424	add	$128, %edx
425	cmp	$128, %ecx
426	jae	L(128bytes_L2_normal)
427
	/* 0..127 bytes left: form the end pointer and dispatch on bytes / 2.  */
428L(128bytesless_L2_normal):
429	add	%ecx, %edx
430	shr	$1, %ecx
431	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
432
/* Path for byte counts at or above the shared-cache size, reached only by
   the "jae" in L(128bytesormore) with the shared-cache size in %ebx.
   RESTORE_EBX_STATE sits behind an unconditional jump and emits at most
   unwind directives; it makes the CFI state at the label match the state
   at that "jae".  */
433	RESTORE_EBX_STATE
434L(128bytesormore_nt_start):
	/* The first loop below writes the cache size rounded down to a
	   multiple of 128.  Set %ecx to what remains after it:
	   count - size + (size & 0x7f).  %eax is used as scratch and then
	   reloaded with the pattern from %xmm0 for the tail stores.  */
435	sub	%ebx, %ecx
436	mov	%ebx, %eax
437	and	$0x7f, %eax
438	add	%eax, %ecx
439	movd	%xmm0, %eax
440	ALIGN (4)
	/* Ordinary aligned stores with prefetch, 128 bytes per iteration,
	   while %ebx still counts at least 0x80 bytes.  */
441L(128bytesormore_shared_cache_loop):
442	prefetcht0	0x3c0(%edx)
443	prefetcht0	0x380(%edx)
444	sub	$0x80, %ebx
445	movdqa	%xmm0, (%edx)
446	movdqa	%xmm0, 0x10(%edx)
447	movdqa	%xmm0, 0x20(%edx)
448	movdqa	%xmm0, 0x30(%edx)
449	movdqa	%xmm0, 0x40(%edx)
450	movdqa	%xmm0, 0x50(%edx)
451	movdqa	%xmm0, 0x60(%edx)
452	movdqa	%xmm0, 0x70(%edx)
453	add	$0x80, %edx
454	cmp	$0x80, %ebx
455	jae	L(128bytesormore_shared_cache_loop)
	/* Fewer than 128 bytes left: skip the non-temporal loop and its
	   sfence.  */
456	cmp	$0x80, %ecx
457	jb	L(shared_cache_loop_end)
458	ALIGN (4)
	/* Non-temporal 16-byte stores, 128 bytes per iteration, followed by
	   one sfence once the loop is done.  */
459L(128bytesormore_nt):
460	sub	$0x80, %ecx
461	movntdq	%xmm0, (%edx)
462	movntdq	%xmm0, 0x10(%edx)
463	movntdq	%xmm0, 0x20(%edx)
464	movntdq	%xmm0, 0x30(%edx)
465	movntdq	%xmm0, 0x40(%edx)
466	movntdq	%xmm0, 0x50(%edx)
467	movntdq	%xmm0, 0x60(%edx)
468	movntdq	%xmm0, 0x70(%edx)
469	add	$0x80, %edx
470	cmp	$0x80, %ecx
471	jae	L(128bytesormore_nt)
472	sfence
473L(shared_cache_loop_end):
	/* Undo the PUSH (%ebx) from L(128bytesormore) in the variants
	   selected by this condition.  */
474#if defined DATA_CACHE_SIZE || !defined SHARED
475	POP (%ebx)
476#endif
	/* 0..127 bytes left: form the end pointer and dispatch on bytes / 2.  */
477	add	%ecx, %edx
478	shr	$1, %ecx
479	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
480
481
/* Jump table for the long-path tail: entry K (0..63) leads to
   L(aligned_16_<2K>bytes), so it is indexed with remaining bytes / 2.
   Stored in .rodata.sse2 with 4-byte alignment; the previous section is
   restored by .popsection.  */
482	.pushsection .rodata.sse2,"a",@progbits
483	ALIGN (2)
484L(table_16_128bytes):
485	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
486	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
487	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
488	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
489	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
490	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
491	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
492	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
493	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
494	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
495	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
496	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
497	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
498	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
499	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
500	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
501	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
502	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
503	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
504	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
505	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
506	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
507	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
508	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
509	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
510	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
511	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
512	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
513	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
514	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
515	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
516	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
517	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
518	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
519	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
520	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
521	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
522	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
523	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
524	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
525	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
526	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
527	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
528	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
529	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
530	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
531	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
532	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
533	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
534	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
535	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
536	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
537	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
538	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
539	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
540	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
541	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
542	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
543	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
544	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
545	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
546	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
547	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
548	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
549	.popsection
550
551
/* Long-path tail ladders.  %edx points just past the end of the buffer,
   %xmm0 holds the 16-byte pattern and %eax its low 32 bits.  Entering at
   L(aligned_16_Nbytes) and falling through to the RETURN stores exactly
   the N bytes ending at %edx.  There is one ladder per value of N mod 16:
   the rungs are aligned 16-byte stores, and the last N mod 16 bytes are
   written with movq (8), movl (4) and movw (2) as needed.
   Every dispatch site adds the remaining byte count N to a 16-byte
   aligned %edx before jumping, so -N(%edx) and each following rung, 16
   bytes further on, are aligned as movdqa requires.  */

/* N mod 16 == 0.  */
552	ALIGN (4)
553L(aligned_16_112bytes):
554	movdqa	%xmm0, -112(%edx)
555L(aligned_16_96bytes):
556	movdqa	%xmm0, -96(%edx)
557L(aligned_16_80bytes):
558	movdqa	%xmm0, -80(%edx)
559L(aligned_16_64bytes):
560	movdqa	%xmm0, -64(%edx)
561L(aligned_16_48bytes):
562	movdqa	%xmm0, -48(%edx)
563L(aligned_16_32bytes):
564	movdqa	%xmm0, -32(%edx)
565L(aligned_16_16bytes):
566	movdqa	%xmm0, -16(%edx)
567L(aligned_16_0bytes):
568	SETRTNVAL
569	RETURN
570
571
/* N mod 16 == 2: final 2 bytes from %ax.  */
572	ALIGN (4)
573L(aligned_16_114bytes):
574	movdqa	%xmm0, -114(%edx)
575L(aligned_16_98bytes):
576	movdqa	%xmm0, -98(%edx)
577L(aligned_16_82bytes):
578	movdqa	%xmm0, -82(%edx)
579L(aligned_16_66bytes):
580	movdqa	%xmm0, -66(%edx)
581L(aligned_16_50bytes):
582	movdqa	%xmm0, -50(%edx)
583L(aligned_16_34bytes):
584	movdqa	%xmm0, -34(%edx)
585L(aligned_16_18bytes):
586	movdqa	%xmm0, -18(%edx)
587L(aligned_16_2bytes):
588	movw	%ax, -2(%edx)
589	SETRTNVAL
590	RETURN
591
/* N mod 16 == 4: final 4 bytes from %eax.  */
592	ALIGN (4)
593L(aligned_16_116bytes):
594	movdqa	%xmm0, -116(%edx)
595L(aligned_16_100bytes):
596	movdqa	%xmm0, -100(%edx)
597L(aligned_16_84bytes):
598	movdqa	%xmm0, -84(%edx)
599L(aligned_16_68bytes):
600	movdqa	%xmm0, -68(%edx)
601L(aligned_16_52bytes):
602	movdqa	%xmm0, -52(%edx)
603L(aligned_16_36bytes):
604	movdqa	%xmm0, -36(%edx)
605L(aligned_16_20bytes):
606	movdqa	%xmm0, -20(%edx)
607L(aligned_16_4bytes):
608	movl	%eax, -4(%edx)
609	SETRTNVAL
610	RETURN
611
612
/* N mod 16 == 6: final 4 + 2 bytes.  */
613	ALIGN (4)
614L(aligned_16_118bytes):
615	movdqa	%xmm0, -118(%edx)
616L(aligned_16_102bytes):
617	movdqa	%xmm0, -102(%edx)
618L(aligned_16_86bytes):
619	movdqa	%xmm0, -86(%edx)
620L(aligned_16_70bytes):
621	movdqa	%xmm0, -70(%edx)
622L(aligned_16_54bytes):
623	movdqa	%xmm0, -54(%edx)
624L(aligned_16_38bytes):
625	movdqa	%xmm0, -38(%edx)
626L(aligned_16_22bytes):
627	movdqa	%xmm0, -22(%edx)
628L(aligned_16_6bytes):
629	movl	%eax, -6(%edx)
630	movw	%ax, -2(%edx)
631	SETRTNVAL
632	RETURN
633
634
/* N mod 16 == 8: final 8 bytes from the low half of %xmm0.  */
635	ALIGN (4)
636L(aligned_16_120bytes):
637	movdqa	%xmm0, -120(%edx)
638L(aligned_16_104bytes):
639	movdqa	%xmm0, -104(%edx)
640L(aligned_16_88bytes):
641	movdqa	%xmm0, -88(%edx)
642L(aligned_16_72bytes):
643	movdqa	%xmm0, -72(%edx)
644L(aligned_16_56bytes):
645	movdqa	%xmm0, -56(%edx)
646L(aligned_16_40bytes):
647	movdqa	%xmm0, -40(%edx)
648L(aligned_16_24bytes):
649	movdqa	%xmm0, -24(%edx)
650L(aligned_16_8bytes):
651	movq	%xmm0, -8(%edx)
652	SETRTNVAL
653	RETURN
654
655
/* N mod 16 == 10: final 8 + 2 bytes.  */
656	ALIGN (4)
657L(aligned_16_122bytes):
658	movdqa	%xmm0, -122(%edx)
659L(aligned_16_106bytes):
660	movdqa	%xmm0, -106(%edx)
661L(aligned_16_90bytes):
662	movdqa	%xmm0, -90(%edx)
663L(aligned_16_74bytes):
664	movdqa	%xmm0, -74(%edx)
665L(aligned_16_58bytes):
666	movdqa	%xmm0, -58(%edx)
667L(aligned_16_42bytes):
668	movdqa	%xmm0, -42(%edx)
669L(aligned_16_26bytes):
670	movdqa	%xmm0, -26(%edx)
671L(aligned_16_10bytes):
672	movq	%xmm0, -10(%edx)
673	movw	%ax, -2(%edx)
674	SETRTNVAL
675	RETURN
676
677
/* N mod 16 == 12: final 8 + 4 bytes.  */
678	ALIGN (4)
679L(aligned_16_124bytes):
680	movdqa	%xmm0, -124(%edx)
681L(aligned_16_108bytes):
682	movdqa	%xmm0, -108(%edx)
683L(aligned_16_92bytes):
684	movdqa	%xmm0, -92(%edx)
685L(aligned_16_76bytes):
686	movdqa	%xmm0, -76(%edx)
687L(aligned_16_60bytes):
688	movdqa	%xmm0, -60(%edx)
689L(aligned_16_44bytes):
690	movdqa	%xmm0, -44(%edx)
691L(aligned_16_28bytes):
692	movdqa	%xmm0, -28(%edx)
693L(aligned_16_12bytes):
694	movq	%xmm0, -12(%edx)
695	movl	%eax, -4(%edx)
696	SETRTNVAL
697	RETURN
698
699
/* N mod 16 == 14: final 8 + 4 + 2 bytes.  */
700	ALIGN (4)
701L(aligned_16_126bytes):
702	movdqa	%xmm0, -126(%edx)
703L(aligned_16_110bytes):
704	movdqa	%xmm0, -110(%edx)
705L(aligned_16_94bytes):
706	movdqa	%xmm0, -94(%edx)
707L(aligned_16_78bytes):
708	movdqa	%xmm0, -78(%edx)
709L(aligned_16_62bytes):
710	movdqa	%xmm0, -62(%edx)
711L(aligned_16_46bytes):
712	movdqa	%xmm0, -46(%edx)
713L(aligned_16_30bytes):
714	movdqa	%xmm0, -30(%edx)
715L(aligned_16_14bytes):
716	movq	%xmm0, -14(%edx)
717	movl	%eax, -6(%edx)
718	movw	%ax, -2(%edx)
719	SETRTNVAL
720	RETURN
721
722END (sse2_memset16_atom)
723