1/*
2Copyright (c) 2014, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include "cache.h"
32
33#ifndef MEMSET
34# define MEMSET memset
35#endif
36
37#ifndef L
38# define L(label)	.L##label
39#endif
40
41#ifndef ALIGN
42# define ALIGN(n)	.p2align n
43#endif
44
45#ifndef cfi_startproc
46# define cfi_startproc			.cfi_startproc
47#endif
48
49#ifndef cfi_endproc
50# define cfi_endproc			.cfi_endproc
51#endif
52
53#ifndef cfi_rel_offset
54# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
55#endif
56
57#ifndef cfi_restore
58# define cfi_restore(reg)		.cfi_restore reg
59#endif
60
61#ifndef cfi_adjust_cfa_offset
62# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
63#endif
64
65#ifndef ENTRY
66# define ENTRY(name)			\
67	.type name,  @function;		\
68	.globl name;			\
69	.p2align 4;			\
70name:					\
71	cfi_startproc
72#endif
73
74#ifndef END
75# define END(name)			\
76	cfi_endproc;			\
77	.size name, .-name
78#endif
79
80#define CFI_PUSH(REG)						\
81  cfi_adjust_cfa_offset (4);					\
82  cfi_rel_offset (REG, 0)
83
84#define CFI_POP(REG)						\
85  cfi_adjust_cfa_offset (-4);					\
86  cfi_restore (REG)
87
88#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
89#define POP(REG)	popl REG; CFI_POP (REG)
90
91#ifdef USE_AS_BZERO
92# define DEST		PARMS
93# define LEN		DEST+4
94# define SETRTNVAL
95#else
96# define DEST		PARMS
97# define CHR		DEST+4
98# define LEN		CHR+4
99# define SETRTNVAL	movl DEST(%esp), %eax
100#endif
101
102#if (defined SHARED || defined __PIC__)
103# define ENTRANCE	PUSH (%ebx);
104# define RETURN_END	POP (%ebx); ret
105# define RETURN		RETURN_END; CFI_PUSH (%ebx)
106# define PARMS		8		/* Preserve EBX.  */
107# define JMPTBL(I, B)	I - B
108
109/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
110   jump table with relative offsets.   */
111# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
112    /* We first load PC into EBX.  */				\
113    call	__x86.get_pc_thunk.bx;				\
114    /* Get the address of the jump table.  */			\
115    add		$(TABLE - .), %ebx;				\
116    /* Get the entry and convert the relative offset to the	\
117       absolute address.  */					\
118    add		(%ebx,%ecx,4), %ebx;				\
119    add		%ecx, %edx;					\
120    /* We loaded the jump table and adjuested EDX. Go.  */	\
121    jmp		*%ebx
122
123	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
124	.globl	__x86.get_pc_thunk.bx
125	.hidden	__x86.get_pc_thunk.bx
126	ALIGN (4)
127	.type	__x86.get_pc_thunk.bx,@function
128__x86.get_pc_thunk.bx:
129	movl	(%esp), %ebx
130	ret
131#else
132# define ENTRANCE
133# define RETURN_END	ret
134# define RETURN		RETURN_END
135# define PARMS		4
136# define JMPTBL(I, B)	I
137
138/* Branch to an entry in a jump table.  TABLE is a jump table with
139   absolute offsets.  */
140# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
141    add		%ecx, %edx;					\
142    jmp		*TABLE(,%ecx,4)
143#endif
144
145	.section .text.sse2,"ax",@progbits
146	ALIGN (4)
147ENTRY (MEMSET)
148	ENTRANCE
149
150	movl	LEN(%esp), %ecx
151	cmp	$0, %ecx
152	ja	L(1byteormore)
153	SETRTNVAL
154	RETURN
155
156L(1byteormore):
157#ifdef USE_AS_BZERO
158	xor	%eax, %eax
159#else
160	movzbl	CHR(%esp), %eax
161	movb	%al, %ah
162	/* Fill the whole EAX with pattern.  */
163	movl	%eax, %edx
164	shl	 $16, %eax
165	or	%edx, %eax
166#endif
167	movl	DEST(%esp), %edx
168	cmp	$1, %ecx
169	je	L(1byte)
170	cmp	$16, %ecx
171	jae	L(16bytesormore)
172
173	cmp	$4, %ecx
174	jb	L(4bytesless)
175	movl	%eax, (%edx)
176	movl	%eax, -4(%edx, %ecx)
177	cmp	$8, %ecx
178	jb	L(8bytesless)
179	movl	%eax, 4(%edx)
180	movl	%eax, -8(%edx, %ecx)
181L(8bytesless):
182	SETRTNVAL
183	RETURN
184
185L(4bytesless):
186	movw	%ax, (%edx)
187	movw	%ax, -2(%edx, %ecx)
188	SETRTNVAL
189	RETURN
190
191L(1byte):
192	movb	%al, (%edx)
193	SETRTNVAL
194	RETURN
195
196	ALIGN (4)
197L(16bytesormore):
198#ifdef USE_AS_BZERO
199	pxor	%xmm0, %xmm0
200#else
201	movd	%eax, %xmm0
202	pshufd	$0, %xmm0, %xmm0
203#endif
204
205	cmp	$64, %ecx
206	ja	L(64bytesmore)
207	movdqu	%xmm0, (%edx)
208	movdqu	%xmm0, -16(%edx, %ecx)
209	cmp	$32, %ecx
210	jbe	L(32bytesless)
211	movdqu	%xmm0, 16(%edx)
212	movdqu	%xmm0, -32(%edx, %ecx)
213L(32bytesless):
214	SETRTNVAL
215	RETURN
216
217L(64bytesmore):
218	testl	$0xf, %edx
219	jz	L(aligned_16)
220L(not_aligned_16):
221	movdqu	%xmm0, (%edx)
222	movl	%edx, %eax
223	and	$-16, %edx
224	add	$16, %edx
225	sub	%edx, %eax
226	add	%eax, %ecx
227	movd	%xmm0, %eax
228
229	ALIGN (4)
230L(aligned_16):
231	cmp	$128, %ecx
232	jae	L(128bytesormore)
233
234L(aligned_16_less128bytes):
235	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
236
237	ALIGN (4)
238L(128bytesormore):
239#ifdef SHARED_CACHE_SIZE
240	PUSH (%ebx)
241	mov	$SHARED_CACHE_SIZE, %ebx
242#else
243# if (defined SHARED || defined __PIC__)
244	call	__x86.get_pc_thunk.bx
245	add	$_GLOBAL_OFFSET_TABLE_, %ebx
246	mov	$__x86_shared_cache_size@GOTOFF(%ebx), %ebx
247# else
248	PUSH (%ebx)
249	mov	$__x86_shared_cache_size, %ebx
250# endif
251#endif
252	cmp	%ebx, %ecx
253	jae	L(128bytesormore_nt_start)
254
255	POP (%ebx)
256
257#ifdef DATA_CACHE_SIZE
258	PUSH (%ebx)
259	mov	$DATA_CACHE_SIZE, %ebx
260#else
261# if (defined SHARED || defined __PIC__)
262	call	__x86.get_pc_thunk.bx
263	add	$_GLOBAL_OFFSET_TABLE_, %ebx
264	mov	$__x86_data_cache_size@GOTOFF(%ebx), %ebx
265# else
266	PUSH (%ebx)
267	mov	$__x86_data_cache_size, %ebx
268# endif
269#endif
270
271	cmp	%ebx, %ecx
272	jae	L(128bytes_L2_normal)
273	subl	$128, %ecx
274L(128bytesormore_normal):
275	sub	$128, %ecx
276	movdqa	%xmm0, (%edx)
277	movaps	%xmm0, 0x10(%edx)
278	movaps	%xmm0, 0x20(%edx)
279	movaps	%xmm0, 0x30(%edx)
280	movaps	%xmm0, 0x40(%edx)
281	movaps	%xmm0, 0x50(%edx)
282	movaps	%xmm0, 0x60(%edx)
283	movaps	%xmm0, 0x70(%edx)
284	lea	128(%edx), %edx
285	jb	L(128bytesless_normal)
286
287
288	sub	$128, %ecx
289	movdqa	%xmm0, (%edx)
290	movaps	%xmm0, 0x10(%edx)
291	movaps	%xmm0, 0x20(%edx)
292	movaps	%xmm0, 0x30(%edx)
293	movaps	%xmm0, 0x40(%edx)
294	movaps	%xmm0, 0x50(%edx)
295	movaps	%xmm0, 0x60(%edx)
296	movaps	%xmm0, 0x70(%edx)
297	lea	128(%edx), %edx
298	jae	L(128bytesormore_normal)
299
300L(128bytesless_normal):
301	lea	128(%ecx), %ecx
302#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
303	POP (%ebx)
304#endif
305	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
306
307	ALIGN (4)
308L(128bytes_L2_normal):
309	prefetchnta	0x380(%edx)
310	prefetchnta	0x3c0(%edx)
311	sub	$128, %ecx
312	movdqa	%xmm0, (%edx)
313	movaps	%xmm0, 0x10(%edx)
314	movaps	%xmm0, 0x20(%edx)
315	movaps	%xmm0, 0x30(%edx)
316	movaps	%xmm0, 0x40(%edx)
317	movaps	%xmm0, 0x50(%edx)
318	movaps	%xmm0, 0x60(%edx)
319	movaps	%xmm0, 0x70(%edx)
320	add	$128, %edx
321	cmp	$128, %ecx
322	jae	L(128bytes_L2_normal)
323
324L(128bytesless_L2_normal):
325#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
326	POP (%ebx)
327#endif
328	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
329
330L(128bytesormore_nt_start):
331	sub	%ebx, %ecx
332	ALIGN (4)
333L(128bytesormore_shared_cache_loop):
334	prefetchnta	0x3c0(%edx)
335	prefetchnta	0x380(%edx)
336	sub	$0x80, %ebx
337	movdqa	%xmm0, (%edx)
338	movaps	%xmm0, 0x10(%edx)
339	movaps	%xmm0, 0x20(%edx)
340	movaps	%xmm0, 0x30(%edx)
341	movaps	%xmm0, 0x40(%edx)
342	movaps	%xmm0, 0x50(%edx)
343	movaps	%xmm0, 0x60(%edx)
344	movaps	%xmm0, 0x70(%edx)
345	add	$0x80, %edx
346	cmp	$0x80, %ebx
347	jae	L(128bytesormore_shared_cache_loop)
348	cmp	$0x80, %ecx
349	jb	L(shared_cache_loop_end)
350	ALIGN (4)
351L(128bytesormore_nt):
352	sub	$0x80, %ecx
353	movntdq	%xmm0, (%edx)
354	movntdq	%xmm0, 0x10(%edx)
355	movntdq	%xmm0, 0x20(%edx)
356	movntdq	%xmm0, 0x30(%edx)
357	movntdq	%xmm0, 0x40(%edx)
358	movntdq	%xmm0, 0x50(%edx)
359	movntdq	%xmm0, 0x60(%edx)
360	movntdq	%xmm0, 0x70(%edx)
361	add	$0x80, %edx
362	cmp	$0x80, %ecx
363	jae	L(128bytesormore_nt)
364	sfence
365L(shared_cache_loop_end):
366#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
367	POP (%ebx)
368#endif
369	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
370
371
372	.pushsection .rodata.sse2,"a",@progbits
373	ALIGN (2)
374L(table_16_128bytes):
375	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
376	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
377	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
378	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
379	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
380	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
381	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
382	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
383	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
384	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
385	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
386	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
387	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
388	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
389	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
390	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
391	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
392	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
393	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
394	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
395	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
396	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
397	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
398	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
399	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
400	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
401	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
402	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
403	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
404	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
405	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
406	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
407	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
408	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
409	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
410	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
411	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
412	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
413	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
414	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
415	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
416	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
417	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
418	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
419	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
420	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
421	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
422	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
423	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
424	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
425	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
426	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
427	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
428	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
429	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
430	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
431	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
432	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
433	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
434	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
435	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
436	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
437	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
438	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
439	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
440	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
441	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
442	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
443	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
444	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
445	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
446	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
447	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
448	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
449	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
450	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
451	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
452	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
453	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
454	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
455	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
456	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
457	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
458	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
459	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
460	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
461	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
462	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
463	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
464	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
465	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
466	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
467	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
468	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
469	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
470	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
471	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
472	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
473	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
474	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
475	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
476	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
477	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
478	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
479	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
480	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
481	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
482	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
483	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
484	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
485	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
486	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
487	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
488	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
489	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
490	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
491	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
492	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
493	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
494	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
495	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
496	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
497	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
498	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
499	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
500	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
501	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
502	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
503	.popsection
504
505	ALIGN (4)
506L(aligned_16_112bytes):
507	movdqa	%xmm0, -112(%edx)
508L(aligned_16_96bytes):
509	movdqa	%xmm0, -96(%edx)
510L(aligned_16_80bytes):
511	movdqa	%xmm0, -80(%edx)
512L(aligned_16_64bytes):
513	movdqa	%xmm0, -64(%edx)
514L(aligned_16_48bytes):
515	movdqa	%xmm0, -48(%edx)
516L(aligned_16_32bytes):
517	movdqa	%xmm0, -32(%edx)
518L(aligned_16_16bytes):
519	movdqa	%xmm0, -16(%edx)
520L(aligned_16_0bytes):
521	SETRTNVAL
522	RETURN
523
524	ALIGN (4)
525L(aligned_16_113bytes):
526	movdqa	%xmm0, -113(%edx)
527L(aligned_16_97bytes):
528	movdqa	%xmm0, -97(%edx)
529L(aligned_16_81bytes):
530	movdqa	%xmm0, -81(%edx)
531L(aligned_16_65bytes):
532	movdqa	%xmm0, -65(%edx)
533L(aligned_16_49bytes):
534	movdqa	%xmm0, -49(%edx)
535L(aligned_16_33bytes):
536	movdqa	%xmm0, -33(%edx)
537L(aligned_16_17bytes):
538	movdqa	%xmm0, -17(%edx)
539L(aligned_16_1bytes):
540	movb	%al, -1(%edx)
541	SETRTNVAL
542	RETURN
543
544	ALIGN (4)
545L(aligned_16_114bytes):
546	movdqa	%xmm0, -114(%edx)
547L(aligned_16_98bytes):
548	movdqa	%xmm0, -98(%edx)
549L(aligned_16_82bytes):
550	movdqa	%xmm0, -82(%edx)
551L(aligned_16_66bytes):
552	movdqa	%xmm0, -66(%edx)
553L(aligned_16_50bytes):
554	movdqa	%xmm0, -50(%edx)
555L(aligned_16_34bytes):
556	movdqa	%xmm0, -34(%edx)
557L(aligned_16_18bytes):
558	movdqa	%xmm0, -18(%edx)
559L(aligned_16_2bytes):
560	movw	%ax, -2(%edx)
561	SETRTNVAL
562	RETURN
563
564	ALIGN (4)
565L(aligned_16_115bytes):
566	movdqa	%xmm0, -115(%edx)
567L(aligned_16_99bytes):
568	movdqa	%xmm0, -99(%edx)
569L(aligned_16_83bytes):
570	movdqa	%xmm0, -83(%edx)
571L(aligned_16_67bytes):
572	movdqa	%xmm0, -67(%edx)
573L(aligned_16_51bytes):
574	movdqa	%xmm0, -51(%edx)
575L(aligned_16_35bytes):
576	movdqa	%xmm0, -35(%edx)
577L(aligned_16_19bytes):
578	movdqa	%xmm0, -19(%edx)
579L(aligned_16_3bytes):
580	movw	%ax, -3(%edx)
581	movb	%al, -1(%edx)
582	SETRTNVAL
583	RETURN
584
585	ALIGN (4)
586L(aligned_16_116bytes):
587	movdqa	%xmm0, -116(%edx)
588L(aligned_16_100bytes):
589	movdqa	%xmm0, -100(%edx)
590L(aligned_16_84bytes):
591	movdqa	%xmm0, -84(%edx)
592L(aligned_16_68bytes):
593	movdqa	%xmm0, -68(%edx)
594L(aligned_16_52bytes):
595	movdqa	%xmm0, -52(%edx)
596L(aligned_16_36bytes):
597	movdqa	%xmm0, -36(%edx)
598L(aligned_16_20bytes):
599	movdqa	%xmm0, -20(%edx)
600L(aligned_16_4bytes):
601	movl	%eax, -4(%edx)
602	SETRTNVAL
603	RETURN
604
605	ALIGN (4)
606L(aligned_16_117bytes):
607	movdqa	%xmm0, -117(%edx)
608L(aligned_16_101bytes):
609	movdqa	%xmm0, -101(%edx)
610L(aligned_16_85bytes):
611	movdqa	%xmm0, -85(%edx)
612L(aligned_16_69bytes):
613	movdqa	%xmm0, -69(%edx)
614L(aligned_16_53bytes):
615	movdqa	%xmm0, -53(%edx)
616L(aligned_16_37bytes):
617	movdqa	%xmm0, -37(%edx)
618L(aligned_16_21bytes):
619	movdqa	%xmm0, -21(%edx)
620L(aligned_16_5bytes):
621	movl	%eax, -5(%edx)
622	movb	%al, -1(%edx)
623	SETRTNVAL
624	RETURN
625
626	ALIGN (4)
627L(aligned_16_118bytes):
628	movdqa	%xmm0, -118(%edx)
629L(aligned_16_102bytes):
630	movdqa	%xmm0, -102(%edx)
631L(aligned_16_86bytes):
632	movdqa	%xmm0, -86(%edx)
633L(aligned_16_70bytes):
634	movdqa	%xmm0, -70(%edx)
635L(aligned_16_54bytes):
636	movdqa	%xmm0, -54(%edx)
637L(aligned_16_38bytes):
638	movdqa	%xmm0, -38(%edx)
639L(aligned_16_22bytes):
640	movdqa	%xmm0, -22(%edx)
641L(aligned_16_6bytes):
642	movl	%eax, -6(%edx)
643	movw	%ax, -2(%edx)
644	SETRTNVAL
645	RETURN
646
647	ALIGN (4)
648L(aligned_16_119bytes):
649	movdqa	%xmm0, -119(%edx)
650L(aligned_16_103bytes):
651	movdqa	%xmm0, -103(%edx)
652L(aligned_16_87bytes):
653	movdqa	%xmm0, -87(%edx)
654L(aligned_16_71bytes):
655	movdqa	%xmm0, -71(%edx)
656L(aligned_16_55bytes):
657	movdqa	%xmm0, -55(%edx)
658L(aligned_16_39bytes):
659	movdqa	%xmm0, -39(%edx)
660L(aligned_16_23bytes):
661	movdqa	%xmm0, -23(%edx)
662L(aligned_16_7bytes):
663	movl	%eax, -7(%edx)
664	movw	%ax, -3(%edx)
665	movb	%al, -1(%edx)
666	SETRTNVAL
667	RETURN
668
669	ALIGN (4)
670L(aligned_16_120bytes):
671	movdqa	%xmm0, -120(%edx)
672L(aligned_16_104bytes):
673	movdqa	%xmm0, -104(%edx)
674L(aligned_16_88bytes):
675	movdqa	%xmm0, -88(%edx)
676L(aligned_16_72bytes):
677	movdqa	%xmm0, -72(%edx)
678L(aligned_16_56bytes):
679	movdqa	%xmm0, -56(%edx)
680L(aligned_16_40bytes):
681	movdqa	%xmm0, -40(%edx)
682L(aligned_16_24bytes):
683	movdqa	%xmm0, -24(%edx)
684L(aligned_16_8bytes):
685	movq	%xmm0, -8(%edx)
686	SETRTNVAL
687	RETURN
688
689	ALIGN (4)
690L(aligned_16_121bytes):
691	movdqa	%xmm0, -121(%edx)
692L(aligned_16_105bytes):
693	movdqa	%xmm0, -105(%edx)
694L(aligned_16_89bytes):
695	movdqa	%xmm0, -89(%edx)
696L(aligned_16_73bytes):
697	movdqa	%xmm0, -73(%edx)
698L(aligned_16_57bytes):
699	movdqa	%xmm0, -57(%edx)
700L(aligned_16_41bytes):
701	movdqa	%xmm0, -41(%edx)
702L(aligned_16_25bytes):
703	movdqa	%xmm0, -25(%edx)
704L(aligned_16_9bytes):
705	movq	%xmm0, -9(%edx)
706	movb	%al, -1(%edx)
707	SETRTNVAL
708	RETURN
709
710	ALIGN (4)
711L(aligned_16_122bytes):
712	movdqa	%xmm0, -122(%edx)
713L(aligned_16_106bytes):
714	movdqa	%xmm0, -106(%edx)
715L(aligned_16_90bytes):
716	movdqa	%xmm0, -90(%edx)
717L(aligned_16_74bytes):
718	movdqa	%xmm0, -74(%edx)
719L(aligned_16_58bytes):
720	movdqa	%xmm0, -58(%edx)
721L(aligned_16_42bytes):
722	movdqa	%xmm0, -42(%edx)
723L(aligned_16_26bytes):
724	movdqa	%xmm0, -26(%edx)
725L(aligned_16_10bytes):
726	movq	%xmm0, -10(%edx)
727	movw	%ax, -2(%edx)
728	SETRTNVAL
729	RETURN
730
731	ALIGN (4)
732L(aligned_16_123bytes):
733	movdqa	%xmm0, -123(%edx)
734L(aligned_16_107bytes):
735	movdqa	%xmm0, -107(%edx)
736L(aligned_16_91bytes):
737	movdqa	%xmm0, -91(%edx)
738L(aligned_16_75bytes):
739	movdqa	%xmm0, -75(%edx)
740L(aligned_16_59bytes):
741	movdqa	%xmm0, -59(%edx)
742L(aligned_16_43bytes):
743	movdqa	%xmm0, -43(%edx)
744L(aligned_16_27bytes):
745	movdqa	%xmm0, -27(%edx)
746L(aligned_16_11bytes):
747	movq	%xmm0, -11(%edx)
748	movw	%ax, -3(%edx)
749	movb	%al, -1(%edx)
750	SETRTNVAL
751	RETURN
752
753	ALIGN (4)
754L(aligned_16_124bytes):
755	movdqa	%xmm0, -124(%edx)
756L(aligned_16_108bytes):
757	movdqa	%xmm0, -108(%edx)
758L(aligned_16_92bytes):
759	movdqa	%xmm0, -92(%edx)
760L(aligned_16_76bytes):
761	movdqa	%xmm0, -76(%edx)
762L(aligned_16_60bytes):
763	movdqa	%xmm0, -60(%edx)
764L(aligned_16_44bytes):
765	movdqa	%xmm0, -44(%edx)
766L(aligned_16_28bytes):
767	movdqa	%xmm0, -28(%edx)
768L(aligned_16_12bytes):
769	movq	%xmm0, -12(%edx)
770	movl	%eax, -4(%edx)
771	SETRTNVAL
772	RETURN
773
774	ALIGN (4)
775L(aligned_16_125bytes):
776	movdqa	%xmm0, -125(%edx)
777L(aligned_16_109bytes):
778	movdqa	%xmm0, -109(%edx)
779L(aligned_16_93bytes):
780	movdqa	%xmm0, -93(%edx)
781L(aligned_16_77bytes):
782	movdqa	%xmm0, -77(%edx)
783L(aligned_16_61bytes):
784	movdqa	%xmm0, -61(%edx)
785L(aligned_16_45bytes):
786	movdqa	%xmm0, -45(%edx)
787L(aligned_16_29bytes):
788	movdqa	%xmm0, -29(%edx)
789L(aligned_16_13bytes):
790	movq	%xmm0, -13(%edx)
791	movl	%eax, -5(%edx)
792	movb	%al, -1(%edx)
793	SETRTNVAL
794	RETURN
795
796	ALIGN (4)
797L(aligned_16_126bytes):
798	movdqa	%xmm0, -126(%edx)
799L(aligned_16_110bytes):
800	movdqa	%xmm0, -110(%edx)
801L(aligned_16_94bytes):
802	movdqa	%xmm0, -94(%edx)
803L(aligned_16_78bytes):
804	movdqa	%xmm0, -78(%edx)
805L(aligned_16_62bytes):
806	movdqa	%xmm0, -62(%edx)
807L(aligned_16_46bytes):
808	movdqa	%xmm0, -46(%edx)
809L(aligned_16_30bytes):
810	movdqa	%xmm0, -30(%edx)
811L(aligned_16_14bytes):
812	movq	%xmm0, -14(%edx)
813	movl	%eax, -6(%edx)
814	movw	%ax, -2(%edx)
815	SETRTNVAL
816	RETURN
817
818	ALIGN (4)
819L(aligned_16_127bytes):
820	movdqa	%xmm0, -127(%edx)
821L(aligned_16_111bytes):
822	movdqa	%xmm0, -111(%edx)
823L(aligned_16_95bytes):
824	movdqa	%xmm0, -95(%edx)
825L(aligned_16_79bytes):
826	movdqa	%xmm0, -79(%edx)
827L(aligned_16_63bytes):
828	movdqa	%xmm0, -63(%edx)
829L(aligned_16_47bytes):
830	movdqa	%xmm0, -47(%edx)
831L(aligned_16_31bytes):
832	movdqa	%xmm0, -31(%edx)
833L(aligned_16_15bytes):
834	movq	%xmm0, -15(%edx)
835	movl	%eax, -7(%edx)
836	movw	%ax, -3(%edx)
837	movb	%al, -1(%edx)
838	SETRTNVAL
839	RETURN_END
840
841END (MEMSET)
842