1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include <private/bionic_asm.h>
32
33#include "cache.h"
34
/* L(foo) names the assembler-local symbol .Lfoo.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) pads to a 2^n-byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Unwind annotations matching a 4-byte push of REG ...  */
#define CFI_PUSH(REG)						\
  .cfi_adjust_cfa_offset 4;					\
  .cfi_rel_offset REG, 0

/* ... and the matching pop.  */
#define CFI_POP(REG)						\
  .cfi_adjust_cfa_offset -4;					\
  .cfi_restore REG

/* Push/pop a register and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH(REG)
#define POP(REG)	popl REG; CFI_POP(REG)

/* Argument offsets from ESP once ENTRANCE has pushed EBX: the first
   argument sits 8 bytes up, and each following one 4 bytes further.  */
#define PARMS		8
#define DST		PARMS
#define CHR		(DST+4)
#define LEN		(CHR+4)
#define CHK_DST_LEN	(LEN+4)

/* Load the destination pointer into EAX as the return value.  */
#define SETRTNVAL	movl DST(%esp), %eax

/* Function prologue: save EBX.  */
#define ENTRANCE	PUSH(%ebx);

/* Final epilogue: restore EBX and return.  */
#define RETURN_END	POP(%ebx); ret

/* Epilogue used in the middle of the function: after the ret, put the
   unwind state back to "EBX is pushed" for the code that follows.  */
#define RETURN		RETURN_END; CFI_PUSH(%ebx)

/* One jump-table entry: the offset of label I from table base B.  */
#define JMPTBL(I, B)	I - B
65
/* Dispatch through TABLE, a table of 4-byte offsets relative to TABLE.
   In:       ECX = entry index.
   Out:      EDX = EDX + ECX; control continues at the selected entry.
   Clobbers: EBX, flags.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* EBX = return address of the call, which is the address	\
       of the addl that follows.  */				\
    call	__x86.get_pc_thunk.bx;				\
    /* The dot symbol is that same address, so EBX = TABLE.  */	\
    addl	$(TABLE - .), %ebx;				\
    /* Entry ECX holds (target - TABLE); adding it to TABLE	\
       yields the absolute target address.  */			\
    addl	(%ebx,%ecx,4), %ebx;				\
    /* Advance EDX by ECX before jumping.  */			\
    addl	%ecx, %edx;					\
    jmp		*%ebx
79
80	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
81	.globl	__x86.get_pc_thunk.bx
82	.hidden	__x86.get_pc_thunk.bx
83	ALIGN(4)
84	.type	__x86.get_pc_thunk.bx,@function
85__x86.get_pc_thunk.bx:
86	movl	(%esp), %ebx
87	ret
88
/* __memset_chk: memset with a destination-size check.
   Stack arguments (after ENTRANCE): DST, CHR, LEN, CHK_DST_LEN.
   If LEN <= CHK_DST_LEN (unsigned) the fill is performed by joining
   memset at L(memset_length_loaded); otherwise control is handed to
   __memset_chk_fail.  */
ENTRY(__memset_chk)
  ENTRANCE

  /* ECX = LEN; the code at L(memset_length_loaded) expects it there.  */
  movl LEN(%esp), %ecx
  cmpl CHK_DST_LEN(%esp), %ecx
  jbe L(memset_length_loaded)

  /* LEN is too large: drop the saved EBX so the stack looks exactly as
     it did on entry, then jump (not call) to the failure routine.  */
  POP(%ebx)
  jmp __memset_chk_fail
END(__memset_chk)
99
100	.section .text.sse2,"ax",@progbits
101	ALIGN(4)
102ENTRY(memset)
103	ENTRANCE
104
105	movl	LEN(%esp), %ecx
106L(memset_length_loaded):
107	movzbl	CHR(%esp), %eax
108	movb	%al, %ah
109	/* Fill the whole EAX with pattern.  */
110	movl	%eax, %edx
111	shl	$16, %eax
112	or	%edx, %eax
113	movl	DST(%esp), %edx
114	cmp	$32, %ecx
115	jae	L(32bytesormore)
116
117L(write_less32bytes):
118	BRANCH_TO_JMPTBL_ENTRY(L(table_less_32bytes))
119
120
121	.pushsection .rodata.sse2,"a",@progbits
122	ALIGN(2)
123L(table_less_32bytes):
124	.int	JMPTBL(L(write_0bytes), L(table_less_32bytes))
125	.int	JMPTBL(L(write_1bytes), L(table_less_32bytes))
126	.int	JMPTBL(L(write_2bytes), L(table_less_32bytes))
127	.int	JMPTBL(L(write_3bytes), L(table_less_32bytes))
128	.int	JMPTBL(L(write_4bytes), L(table_less_32bytes))
129	.int	JMPTBL(L(write_5bytes), L(table_less_32bytes))
130	.int	JMPTBL(L(write_6bytes), L(table_less_32bytes))
131	.int	JMPTBL(L(write_7bytes), L(table_less_32bytes))
132	.int	JMPTBL(L(write_8bytes), L(table_less_32bytes))
133	.int	JMPTBL(L(write_9bytes), L(table_less_32bytes))
134	.int	JMPTBL(L(write_10bytes), L(table_less_32bytes))
135	.int	JMPTBL(L(write_11bytes), L(table_less_32bytes))
136	.int	JMPTBL(L(write_12bytes), L(table_less_32bytes))
137	.int	JMPTBL(L(write_13bytes), L(table_less_32bytes))
138	.int	JMPTBL(L(write_14bytes), L(table_less_32bytes))
139	.int	JMPTBL(L(write_15bytes), L(table_less_32bytes))
140	.int	JMPTBL(L(write_16bytes), L(table_less_32bytes))
141	.int	JMPTBL(L(write_17bytes), L(table_less_32bytes))
142	.int	JMPTBL(L(write_18bytes), L(table_less_32bytes))
143	.int	JMPTBL(L(write_19bytes), L(table_less_32bytes))
144	.int	JMPTBL(L(write_20bytes), L(table_less_32bytes))
145	.int	JMPTBL(L(write_21bytes), L(table_less_32bytes))
146	.int	JMPTBL(L(write_22bytes), L(table_less_32bytes))
147	.int	JMPTBL(L(write_23bytes), L(table_less_32bytes))
148	.int	JMPTBL(L(write_24bytes), L(table_less_32bytes))
149	.int	JMPTBL(L(write_25bytes), L(table_less_32bytes))
150	.int	JMPTBL(L(write_26bytes), L(table_less_32bytes))
151	.int	JMPTBL(L(write_27bytes), L(table_less_32bytes))
152	.int	JMPTBL(L(write_28bytes), L(table_less_32bytes))
153	.int	JMPTBL(L(write_29bytes), L(table_less_32bytes))
154	.int	JMPTBL(L(write_30bytes), L(table_less_32bytes))
155	.int	JMPTBL(L(write_31bytes), L(table_less_32bytes))
156	.popsection
157
158	ALIGN(4)
159L(write_28bytes):
160	movl	%eax, -28(%edx)
161L(write_24bytes):
162	movl	%eax, -24(%edx)
163L(write_20bytes):
164	movl	%eax, -20(%edx)
165L(write_16bytes):
166	movl	%eax, -16(%edx)
167L(write_12bytes):
168	movl	%eax, -12(%edx)
169L(write_8bytes):
170	movl	%eax, -8(%edx)
171L(write_4bytes):
172	movl	%eax, -4(%edx)
173L(write_0bytes):
174	SETRTNVAL
175	RETURN
176
177	ALIGN(4)
178L(write_29bytes):
179	movl	%eax, -29(%edx)
180L(write_25bytes):
181	movl	%eax, -25(%edx)
182L(write_21bytes):
183	movl	%eax, -21(%edx)
184L(write_17bytes):
185	movl	%eax, -17(%edx)
186L(write_13bytes):
187	movl	%eax, -13(%edx)
188L(write_9bytes):
189	movl	%eax, -9(%edx)
190L(write_5bytes):
191	movl	%eax, -5(%edx)
192L(write_1bytes):
193	movb	%al, -1(%edx)
194	SETRTNVAL
195	RETURN
196
197	ALIGN(4)
198L(write_30bytes):
199	movl	%eax, -30(%edx)
200L(write_26bytes):
201	movl	%eax, -26(%edx)
202L(write_22bytes):
203	movl	%eax, -22(%edx)
204L(write_18bytes):
205	movl	%eax, -18(%edx)
206L(write_14bytes):
207	movl	%eax, -14(%edx)
208L(write_10bytes):
209	movl	%eax, -10(%edx)
210L(write_6bytes):
211	movl	%eax, -6(%edx)
212L(write_2bytes):
213	movw	%ax, -2(%edx)
214	SETRTNVAL
215	RETURN
216
217	ALIGN(4)
218L(write_31bytes):
219	movl	%eax, -31(%edx)
220L(write_27bytes):
221	movl	%eax, -27(%edx)
222L(write_23bytes):
223	movl	%eax, -23(%edx)
224L(write_19bytes):
225	movl	%eax, -19(%edx)
226L(write_15bytes):
227	movl	%eax, -15(%edx)
228L(write_11bytes):
229	movl	%eax, -11(%edx)
230L(write_7bytes):
231	movl	%eax, -7(%edx)
232L(write_3bytes):
233	movw	%ax, -3(%edx)
234	movb	%al, -1(%edx)
235	SETRTNVAL
236	RETURN
237
238	ALIGN(4)
239/* ECX > 32 and EDX is 4 byte aligned.  */
240L(32bytesormore):
241	/* Fill xmm0 with the pattern.  */
242	movd	%eax, %xmm0
243	pshufd	$0, %xmm0, %xmm0
244	testl	$0xf, %edx
245	jz	L(aligned_16)
246/* ECX > 32 and EDX is not 16 byte aligned.  */
247L(not_aligned_16):
248	movdqu	%xmm0, (%edx)
249	movl	%edx, %eax
250	and	$-16, %edx
251	add	$16, %edx
252	sub	%edx, %eax
253	add	%eax, %ecx
254	movd	%xmm0, %eax
255
256	ALIGN(4)
257L(aligned_16):
258	cmp	$128, %ecx
259	jae	L(128bytesormore)
260
261L(aligned_16_less128bytes):
262	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
263
264	ALIGN(4)
265L(128bytesormore):
266	PUSH(%ebx)
267	mov	$SHARED_CACHE_SIZE, %ebx
268	cmp	%ebx, %ecx
269	jae	L(128bytesormore_nt_start)
270
271
272	POP(%ebx)
273# define RESTORE_EBX_STATE CFI_PUSH(%ebx)
274	cmp	$DATA_CACHE_SIZE, %ecx
275
276	jae	L(128bytes_L2_normal)
277	subl	$128, %ecx
278L(128bytesormore_normal):
279	sub	$128, %ecx
280	movdqa	%xmm0, (%edx)
281	movdqa	%xmm0, 0x10(%edx)
282	movdqa	%xmm0, 0x20(%edx)
283	movdqa	%xmm0, 0x30(%edx)
284	movdqa	%xmm0, 0x40(%edx)
285	movdqa	%xmm0, 0x50(%edx)
286	movdqa	%xmm0, 0x60(%edx)
287	movdqa	%xmm0, 0x70(%edx)
288	lea	128(%edx), %edx
289	jb	L(128bytesless_normal)
290
291
292	sub	$128, %ecx
293	movdqa	%xmm0, (%edx)
294	movdqa	%xmm0, 0x10(%edx)
295	movdqa	%xmm0, 0x20(%edx)
296	movdqa	%xmm0, 0x30(%edx)
297	movdqa	%xmm0, 0x40(%edx)
298	movdqa	%xmm0, 0x50(%edx)
299	movdqa	%xmm0, 0x60(%edx)
300	movdqa	%xmm0, 0x70(%edx)
301	lea	128(%edx), %edx
302	jae	L(128bytesormore_normal)
303
304L(128bytesless_normal):
305	add	$128, %ecx
306	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
307
308	ALIGN(4)
309L(128bytes_L2_normal):
310	prefetcht0	0x380(%edx)
311	prefetcht0	0x3c0(%edx)
312	sub	$128, %ecx
313	movdqa	%xmm0, (%edx)
314	movaps	%xmm0, 0x10(%edx)
315	movaps	%xmm0, 0x20(%edx)
316	movaps	%xmm0, 0x30(%edx)
317	movaps	%xmm0, 0x40(%edx)
318	movaps	%xmm0, 0x50(%edx)
319	movaps	%xmm0, 0x60(%edx)
320	movaps	%xmm0, 0x70(%edx)
321	add	$128, %edx
322	cmp	$128, %ecx
323	jae	L(128bytes_L2_normal)
324
325L(128bytesless_L2_normal):
326	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
327
328	RESTORE_EBX_STATE
329L(128bytesormore_nt_start):
330	sub	%ebx, %ecx
331	mov	%ebx, %eax
332	and	$0x7f, %eax
333	add	%eax, %ecx
334	movd	%xmm0, %eax
335	ALIGN(4)
336L(128bytesormore_shared_cache_loop):
337	prefetcht0	0x3c0(%edx)
338	prefetcht0	0x380(%edx)
339	sub	$0x80, %ebx
340	movdqa	%xmm0, (%edx)
341	movdqa	%xmm0, 0x10(%edx)
342	movdqa	%xmm0, 0x20(%edx)
343	movdqa	%xmm0, 0x30(%edx)
344	movdqa	%xmm0, 0x40(%edx)
345	movdqa	%xmm0, 0x50(%edx)
346	movdqa	%xmm0, 0x60(%edx)
347	movdqa	%xmm0, 0x70(%edx)
348	add	$0x80, %edx
349	cmp	$0x80, %ebx
350	jae	L(128bytesormore_shared_cache_loop)
351	cmp	$0x80, %ecx
352	jb	L(shared_cache_loop_end)
353	ALIGN(4)
354L(128bytesormore_nt):
355	sub	$0x80, %ecx
356	movntdq	%xmm0, (%edx)
357	movntdq	%xmm0, 0x10(%edx)
358	movntdq	%xmm0, 0x20(%edx)
359	movntdq	%xmm0, 0x30(%edx)
360	movntdq	%xmm0, 0x40(%edx)
361	movntdq	%xmm0, 0x50(%edx)
362	movntdq	%xmm0, 0x60(%edx)
363	movntdq	%xmm0, 0x70(%edx)
364	add	$0x80, %edx
365	cmp	$0x80, %ecx
366	jae	L(128bytesormore_nt)
367	sfence
368L(shared_cache_loop_end):
369	POP(%ebx)
370	BRANCH_TO_JMPTBL_ENTRY(L(table_16_128bytes))
371
372
373	.pushsection .rodata.sse2,"a",@progbits
374	ALIGN(2)
375L(table_16_128bytes):
376	.int	JMPTBL(L(aligned_16_0bytes), L(table_16_128bytes))
377	.int	JMPTBL(L(aligned_16_1bytes), L(table_16_128bytes))
378	.int	JMPTBL(L(aligned_16_2bytes), L(table_16_128bytes))
379	.int	JMPTBL(L(aligned_16_3bytes), L(table_16_128bytes))
380	.int	JMPTBL(L(aligned_16_4bytes), L(table_16_128bytes))
381	.int	JMPTBL(L(aligned_16_5bytes), L(table_16_128bytes))
382	.int	JMPTBL(L(aligned_16_6bytes), L(table_16_128bytes))
383	.int	JMPTBL(L(aligned_16_7bytes), L(table_16_128bytes))
384	.int	JMPTBL(L(aligned_16_8bytes), L(table_16_128bytes))
385	.int	JMPTBL(L(aligned_16_9bytes), L(table_16_128bytes))
386	.int	JMPTBL(L(aligned_16_10bytes), L(table_16_128bytes))
387	.int	JMPTBL(L(aligned_16_11bytes), L(table_16_128bytes))
388	.int	JMPTBL(L(aligned_16_12bytes), L(table_16_128bytes))
389	.int	JMPTBL(L(aligned_16_13bytes), L(table_16_128bytes))
390	.int	JMPTBL(L(aligned_16_14bytes), L(table_16_128bytes))
391	.int	JMPTBL(L(aligned_16_15bytes), L(table_16_128bytes))
392	.int	JMPTBL(L(aligned_16_16bytes), L(table_16_128bytes))
393	.int	JMPTBL(L(aligned_16_17bytes), L(table_16_128bytes))
394	.int	JMPTBL(L(aligned_16_18bytes), L(table_16_128bytes))
395	.int	JMPTBL(L(aligned_16_19bytes), L(table_16_128bytes))
396	.int	JMPTBL(L(aligned_16_20bytes), L(table_16_128bytes))
397	.int	JMPTBL(L(aligned_16_21bytes), L(table_16_128bytes))
398	.int	JMPTBL(L(aligned_16_22bytes), L(table_16_128bytes))
399	.int	JMPTBL(L(aligned_16_23bytes), L(table_16_128bytes))
400	.int	JMPTBL(L(aligned_16_24bytes), L(table_16_128bytes))
401	.int	JMPTBL(L(aligned_16_25bytes), L(table_16_128bytes))
402	.int	JMPTBL(L(aligned_16_26bytes), L(table_16_128bytes))
403	.int	JMPTBL(L(aligned_16_27bytes), L(table_16_128bytes))
404	.int	JMPTBL(L(aligned_16_28bytes), L(table_16_128bytes))
405	.int	JMPTBL(L(aligned_16_29bytes), L(table_16_128bytes))
406	.int	JMPTBL(L(aligned_16_30bytes), L(table_16_128bytes))
407	.int	JMPTBL(L(aligned_16_31bytes), L(table_16_128bytes))
408	.int	JMPTBL(L(aligned_16_32bytes), L(table_16_128bytes))
409	.int	JMPTBL(L(aligned_16_33bytes), L(table_16_128bytes))
410	.int	JMPTBL(L(aligned_16_34bytes), L(table_16_128bytes))
411	.int	JMPTBL(L(aligned_16_35bytes), L(table_16_128bytes))
412	.int	JMPTBL(L(aligned_16_36bytes), L(table_16_128bytes))
413	.int	JMPTBL(L(aligned_16_37bytes), L(table_16_128bytes))
414	.int	JMPTBL(L(aligned_16_38bytes), L(table_16_128bytes))
415	.int	JMPTBL(L(aligned_16_39bytes), L(table_16_128bytes))
416	.int	JMPTBL(L(aligned_16_40bytes), L(table_16_128bytes))
417	.int	JMPTBL(L(aligned_16_41bytes), L(table_16_128bytes))
418	.int	JMPTBL(L(aligned_16_42bytes), L(table_16_128bytes))
419	.int	JMPTBL(L(aligned_16_43bytes), L(table_16_128bytes))
420	.int	JMPTBL(L(aligned_16_44bytes), L(table_16_128bytes))
421	.int	JMPTBL(L(aligned_16_45bytes), L(table_16_128bytes))
422	.int	JMPTBL(L(aligned_16_46bytes), L(table_16_128bytes))
423	.int	JMPTBL(L(aligned_16_47bytes), L(table_16_128bytes))
424	.int	JMPTBL(L(aligned_16_48bytes), L(table_16_128bytes))
425	.int	JMPTBL(L(aligned_16_49bytes), L(table_16_128bytes))
426	.int	JMPTBL(L(aligned_16_50bytes), L(table_16_128bytes))
427	.int	JMPTBL(L(aligned_16_51bytes), L(table_16_128bytes))
428	.int	JMPTBL(L(aligned_16_52bytes), L(table_16_128bytes))
429	.int	JMPTBL(L(aligned_16_53bytes), L(table_16_128bytes))
430	.int	JMPTBL(L(aligned_16_54bytes), L(table_16_128bytes))
431	.int	JMPTBL(L(aligned_16_55bytes), L(table_16_128bytes))
432	.int	JMPTBL(L(aligned_16_56bytes), L(table_16_128bytes))
433	.int	JMPTBL(L(aligned_16_57bytes), L(table_16_128bytes))
434	.int	JMPTBL(L(aligned_16_58bytes), L(table_16_128bytes))
435	.int	JMPTBL(L(aligned_16_59bytes), L(table_16_128bytes))
436	.int	JMPTBL(L(aligned_16_60bytes), L(table_16_128bytes))
437	.int	JMPTBL(L(aligned_16_61bytes), L(table_16_128bytes))
438	.int	JMPTBL(L(aligned_16_62bytes), L(table_16_128bytes))
439	.int	JMPTBL(L(aligned_16_63bytes), L(table_16_128bytes))
440	.int	JMPTBL(L(aligned_16_64bytes), L(table_16_128bytes))
441	.int	JMPTBL(L(aligned_16_65bytes), L(table_16_128bytes))
442	.int	JMPTBL(L(aligned_16_66bytes), L(table_16_128bytes))
443	.int	JMPTBL(L(aligned_16_67bytes), L(table_16_128bytes))
444	.int	JMPTBL(L(aligned_16_68bytes), L(table_16_128bytes))
445	.int	JMPTBL(L(aligned_16_69bytes), L(table_16_128bytes))
446	.int	JMPTBL(L(aligned_16_70bytes), L(table_16_128bytes))
447	.int	JMPTBL(L(aligned_16_71bytes), L(table_16_128bytes))
448	.int	JMPTBL(L(aligned_16_72bytes), L(table_16_128bytes))
449	.int	JMPTBL(L(aligned_16_73bytes), L(table_16_128bytes))
450	.int	JMPTBL(L(aligned_16_74bytes), L(table_16_128bytes))
451	.int	JMPTBL(L(aligned_16_75bytes), L(table_16_128bytes))
452	.int	JMPTBL(L(aligned_16_76bytes), L(table_16_128bytes))
453	.int	JMPTBL(L(aligned_16_77bytes), L(table_16_128bytes))
454	.int	JMPTBL(L(aligned_16_78bytes), L(table_16_128bytes))
455	.int	JMPTBL(L(aligned_16_79bytes), L(table_16_128bytes))
456	.int	JMPTBL(L(aligned_16_80bytes), L(table_16_128bytes))
457	.int	JMPTBL(L(aligned_16_81bytes), L(table_16_128bytes))
458	.int	JMPTBL(L(aligned_16_82bytes), L(table_16_128bytes))
459	.int	JMPTBL(L(aligned_16_83bytes), L(table_16_128bytes))
460	.int	JMPTBL(L(aligned_16_84bytes), L(table_16_128bytes))
461	.int	JMPTBL(L(aligned_16_85bytes), L(table_16_128bytes))
462	.int	JMPTBL(L(aligned_16_86bytes), L(table_16_128bytes))
463	.int	JMPTBL(L(aligned_16_87bytes), L(table_16_128bytes))
464	.int	JMPTBL(L(aligned_16_88bytes), L(table_16_128bytes))
465	.int	JMPTBL(L(aligned_16_89bytes), L(table_16_128bytes))
466	.int	JMPTBL(L(aligned_16_90bytes), L(table_16_128bytes))
467	.int	JMPTBL(L(aligned_16_91bytes), L(table_16_128bytes))
468	.int	JMPTBL(L(aligned_16_92bytes), L(table_16_128bytes))
469	.int	JMPTBL(L(aligned_16_93bytes), L(table_16_128bytes))
470	.int	JMPTBL(L(aligned_16_94bytes), L(table_16_128bytes))
471	.int	JMPTBL(L(aligned_16_95bytes), L(table_16_128bytes))
472	.int	JMPTBL(L(aligned_16_96bytes), L(table_16_128bytes))
473	.int	JMPTBL(L(aligned_16_97bytes), L(table_16_128bytes))
474	.int	JMPTBL(L(aligned_16_98bytes), L(table_16_128bytes))
475	.int	JMPTBL(L(aligned_16_99bytes), L(table_16_128bytes))
476	.int	JMPTBL(L(aligned_16_100bytes), L(table_16_128bytes))
477	.int	JMPTBL(L(aligned_16_101bytes), L(table_16_128bytes))
478	.int	JMPTBL(L(aligned_16_102bytes), L(table_16_128bytes))
479	.int	JMPTBL(L(aligned_16_103bytes), L(table_16_128bytes))
480	.int	JMPTBL(L(aligned_16_104bytes), L(table_16_128bytes))
481	.int	JMPTBL(L(aligned_16_105bytes), L(table_16_128bytes))
482	.int	JMPTBL(L(aligned_16_106bytes), L(table_16_128bytes))
483	.int	JMPTBL(L(aligned_16_107bytes), L(table_16_128bytes))
484	.int	JMPTBL(L(aligned_16_108bytes), L(table_16_128bytes))
485	.int	JMPTBL(L(aligned_16_109bytes), L(table_16_128bytes))
486	.int	JMPTBL(L(aligned_16_110bytes), L(table_16_128bytes))
487	.int	JMPTBL(L(aligned_16_111bytes), L(table_16_128bytes))
488	.int	JMPTBL(L(aligned_16_112bytes), L(table_16_128bytes))
489	.int	JMPTBL(L(aligned_16_113bytes), L(table_16_128bytes))
490	.int	JMPTBL(L(aligned_16_114bytes), L(table_16_128bytes))
491	.int	JMPTBL(L(aligned_16_115bytes), L(table_16_128bytes))
492	.int	JMPTBL(L(aligned_16_116bytes), L(table_16_128bytes))
493	.int	JMPTBL(L(aligned_16_117bytes), L(table_16_128bytes))
494	.int	JMPTBL(L(aligned_16_118bytes), L(table_16_128bytes))
495	.int	JMPTBL(L(aligned_16_119bytes), L(table_16_128bytes))
496	.int	JMPTBL(L(aligned_16_120bytes), L(table_16_128bytes))
497	.int	JMPTBL(L(aligned_16_121bytes), L(table_16_128bytes))
498	.int	JMPTBL(L(aligned_16_122bytes), L(table_16_128bytes))
499	.int	JMPTBL(L(aligned_16_123bytes), L(table_16_128bytes))
500	.int	JMPTBL(L(aligned_16_124bytes), L(table_16_128bytes))
501	.int	JMPTBL(L(aligned_16_125bytes), L(table_16_128bytes))
502	.int	JMPTBL(L(aligned_16_126bytes), L(table_16_128bytes))
503	.int	JMPTBL(L(aligned_16_127bytes), L(table_16_128bytes))
504	.popsection
505
506	ALIGN(4)
507L(aligned_16_112bytes):
508	movdqa	%xmm0, -112(%edx)
509L(aligned_16_96bytes):
510	movdqa	%xmm0, -96(%edx)
511L(aligned_16_80bytes):
512	movdqa	%xmm0, -80(%edx)
513L(aligned_16_64bytes):
514	movdqa	%xmm0, -64(%edx)
515L(aligned_16_48bytes):
516	movdqa	%xmm0, -48(%edx)
517L(aligned_16_32bytes):
518	movdqa	%xmm0, -32(%edx)
519L(aligned_16_16bytes):
520	movdqa	%xmm0, -16(%edx)
521L(aligned_16_0bytes):
522	SETRTNVAL
523	RETURN
524
525	ALIGN(4)
526L(aligned_16_113bytes):
527	movdqa	%xmm0, -113(%edx)
528L(aligned_16_97bytes):
529	movdqa	%xmm0, -97(%edx)
530L(aligned_16_81bytes):
531	movdqa	%xmm0, -81(%edx)
532L(aligned_16_65bytes):
533	movdqa	%xmm0, -65(%edx)
534L(aligned_16_49bytes):
535	movdqa	%xmm0, -49(%edx)
536L(aligned_16_33bytes):
537	movdqa	%xmm0, -33(%edx)
538L(aligned_16_17bytes):
539	movdqa	%xmm0, -17(%edx)
540L(aligned_16_1bytes):
541	movb	%al, -1(%edx)
542	SETRTNVAL
543	RETURN
544
545	ALIGN(4)
546L(aligned_16_114bytes):
547	movdqa	%xmm0, -114(%edx)
548L(aligned_16_98bytes):
549	movdqa	%xmm0, -98(%edx)
550L(aligned_16_82bytes):
551	movdqa	%xmm0, -82(%edx)
552L(aligned_16_66bytes):
553	movdqa	%xmm0, -66(%edx)
554L(aligned_16_50bytes):
555	movdqa	%xmm0, -50(%edx)
556L(aligned_16_34bytes):
557	movdqa	%xmm0, -34(%edx)
558L(aligned_16_18bytes):
559	movdqa	%xmm0, -18(%edx)
560L(aligned_16_2bytes):
561	movw	%ax, -2(%edx)
562	SETRTNVAL
563	RETURN
564
565	ALIGN(4)
566L(aligned_16_115bytes):
567	movdqa	%xmm0, -115(%edx)
568L(aligned_16_99bytes):
569	movdqa	%xmm0, -99(%edx)
570L(aligned_16_83bytes):
571	movdqa	%xmm0, -83(%edx)
572L(aligned_16_67bytes):
573	movdqa	%xmm0, -67(%edx)
574L(aligned_16_51bytes):
575	movdqa	%xmm0, -51(%edx)
576L(aligned_16_35bytes):
577	movdqa	%xmm0, -35(%edx)
578L(aligned_16_19bytes):
579	movdqa	%xmm0, -19(%edx)
580L(aligned_16_3bytes):
581	movw	%ax, -3(%edx)
582	movb	%al, -1(%edx)
583	SETRTNVAL
584	RETURN
585
586	ALIGN(4)
587L(aligned_16_116bytes):
588	movdqa	%xmm0, -116(%edx)
589L(aligned_16_100bytes):
590	movdqa	%xmm0, -100(%edx)
591L(aligned_16_84bytes):
592	movdqa	%xmm0, -84(%edx)
593L(aligned_16_68bytes):
594	movdqa	%xmm0, -68(%edx)
595L(aligned_16_52bytes):
596	movdqa	%xmm0, -52(%edx)
597L(aligned_16_36bytes):
598	movdqa	%xmm0, -36(%edx)
599L(aligned_16_20bytes):
600	movdqa	%xmm0, -20(%edx)
601L(aligned_16_4bytes):
602	movl	%eax, -4(%edx)
603	SETRTNVAL
604	RETURN
605
606	ALIGN(4)
607L(aligned_16_117bytes):
608	movdqa	%xmm0, -117(%edx)
609L(aligned_16_101bytes):
610	movdqa	%xmm0, -101(%edx)
611L(aligned_16_85bytes):
612	movdqa	%xmm0, -85(%edx)
613L(aligned_16_69bytes):
614	movdqa	%xmm0, -69(%edx)
615L(aligned_16_53bytes):
616	movdqa	%xmm0, -53(%edx)
617L(aligned_16_37bytes):
618	movdqa	%xmm0, -37(%edx)
619L(aligned_16_21bytes):
620	movdqa	%xmm0, -21(%edx)
621L(aligned_16_5bytes):
622	movl	%eax, -5(%edx)
623	movb	%al, -1(%edx)
624	SETRTNVAL
625	RETURN
626
627	ALIGN(4)
628L(aligned_16_118bytes):
629	movdqa	%xmm0, -118(%edx)
630L(aligned_16_102bytes):
631	movdqa	%xmm0, -102(%edx)
632L(aligned_16_86bytes):
633	movdqa	%xmm0, -86(%edx)
634L(aligned_16_70bytes):
635	movdqa	%xmm0, -70(%edx)
636L(aligned_16_54bytes):
637	movdqa	%xmm0, -54(%edx)
638L(aligned_16_38bytes):
639	movdqa	%xmm0, -38(%edx)
640L(aligned_16_22bytes):
641	movdqa	%xmm0, -22(%edx)
642L(aligned_16_6bytes):
643	movl	%eax, -6(%edx)
644	movw	%ax, -2(%edx)
645	SETRTNVAL
646	RETURN
647
648	ALIGN(4)
649L(aligned_16_119bytes):
650	movdqa	%xmm0, -119(%edx)
651L(aligned_16_103bytes):
652	movdqa	%xmm0, -103(%edx)
653L(aligned_16_87bytes):
654	movdqa	%xmm0, -87(%edx)
655L(aligned_16_71bytes):
656	movdqa	%xmm0, -71(%edx)
657L(aligned_16_55bytes):
658	movdqa	%xmm0, -55(%edx)
659L(aligned_16_39bytes):
660	movdqa	%xmm0, -39(%edx)
661L(aligned_16_23bytes):
662	movdqa	%xmm0, -23(%edx)
663L(aligned_16_7bytes):
664	movl	%eax, -7(%edx)
665	movw	%ax, -3(%edx)
666	movb	%al, -1(%edx)
667	SETRTNVAL
668	RETURN
669
670	ALIGN(4)
671L(aligned_16_120bytes):
672	movdqa	%xmm0, -120(%edx)
673L(aligned_16_104bytes):
674	movdqa	%xmm0, -104(%edx)
675L(aligned_16_88bytes):
676	movdqa	%xmm0, -88(%edx)
677L(aligned_16_72bytes):
678	movdqa	%xmm0, -72(%edx)
679L(aligned_16_56bytes):
680	movdqa	%xmm0, -56(%edx)
681L(aligned_16_40bytes):
682	movdqa	%xmm0, -40(%edx)
683L(aligned_16_24bytes):
684	movdqa	%xmm0, -24(%edx)
685L(aligned_16_8bytes):
686	movq	%xmm0, -8(%edx)
687	SETRTNVAL
688	RETURN
689
690	ALIGN(4)
691L(aligned_16_121bytes):
692	movdqa	%xmm0, -121(%edx)
693L(aligned_16_105bytes):
694	movdqa	%xmm0, -105(%edx)
695L(aligned_16_89bytes):
696	movdqa	%xmm0, -89(%edx)
697L(aligned_16_73bytes):
698	movdqa	%xmm0, -73(%edx)
699L(aligned_16_57bytes):
700	movdqa	%xmm0, -57(%edx)
701L(aligned_16_41bytes):
702	movdqa	%xmm0, -41(%edx)
703L(aligned_16_25bytes):
704	movdqa	%xmm0, -25(%edx)
705L(aligned_16_9bytes):
706	movq	%xmm0, -9(%edx)
707	movb	%al, -1(%edx)
708	SETRTNVAL
709	RETURN
710
711	ALIGN(4)
712L(aligned_16_122bytes):
713	movdqa	%xmm0, -122(%edx)
714L(aligned_16_106bytes):
715	movdqa	%xmm0, -106(%edx)
716L(aligned_16_90bytes):
717	movdqa	%xmm0, -90(%edx)
718L(aligned_16_74bytes):
719	movdqa	%xmm0, -74(%edx)
720L(aligned_16_58bytes):
721	movdqa	%xmm0, -58(%edx)
722L(aligned_16_42bytes):
723	movdqa	%xmm0, -42(%edx)
724L(aligned_16_26bytes):
725	movdqa	%xmm0, -26(%edx)
726L(aligned_16_10bytes):
727	movq	%xmm0, -10(%edx)
728	movw	%ax, -2(%edx)
729	SETRTNVAL
730	RETURN
731
732	ALIGN(4)
733L(aligned_16_123bytes):
734	movdqa	%xmm0, -123(%edx)
735L(aligned_16_107bytes):
736	movdqa	%xmm0, -107(%edx)
737L(aligned_16_91bytes):
738	movdqa	%xmm0, -91(%edx)
739L(aligned_16_75bytes):
740	movdqa	%xmm0, -75(%edx)
741L(aligned_16_59bytes):
742	movdqa	%xmm0, -59(%edx)
743L(aligned_16_43bytes):
744	movdqa	%xmm0, -43(%edx)
745L(aligned_16_27bytes):
746	movdqa	%xmm0, -27(%edx)
747L(aligned_16_11bytes):
748	movq	%xmm0, -11(%edx)
749	movw	%ax, -3(%edx)
750	movb	%al, -1(%edx)
751	SETRTNVAL
752	RETURN
753
754	ALIGN(4)
755L(aligned_16_124bytes):
756	movdqa	%xmm0, -124(%edx)
757L(aligned_16_108bytes):
758	movdqa	%xmm0, -108(%edx)
759L(aligned_16_92bytes):
760	movdqa	%xmm0, -92(%edx)
761L(aligned_16_76bytes):
762	movdqa	%xmm0, -76(%edx)
763L(aligned_16_60bytes):
764	movdqa	%xmm0, -60(%edx)
765L(aligned_16_44bytes):
766	movdqa	%xmm0, -44(%edx)
767L(aligned_16_28bytes):
768	movdqa	%xmm0, -28(%edx)
769L(aligned_16_12bytes):
770	movq	%xmm0, -12(%edx)
771	movl	%eax, -4(%edx)
772	SETRTNVAL
773	RETURN
774
775	ALIGN(4)
776L(aligned_16_125bytes):
777	movdqa	%xmm0, -125(%edx)
778L(aligned_16_109bytes):
779	movdqa	%xmm0, -109(%edx)
780L(aligned_16_93bytes):
781	movdqa	%xmm0, -93(%edx)
782L(aligned_16_77bytes):
783	movdqa	%xmm0, -77(%edx)
784L(aligned_16_61bytes):
785	movdqa	%xmm0, -61(%edx)
786L(aligned_16_45bytes):
787	movdqa	%xmm0, -45(%edx)
788L(aligned_16_29bytes):
789	movdqa	%xmm0, -29(%edx)
790L(aligned_16_13bytes):
791	movq	%xmm0, -13(%edx)
792	movl	%eax, -5(%edx)
793	movb	%al, -1(%edx)
794	SETRTNVAL
795	RETURN
796
797	ALIGN(4)
798L(aligned_16_126bytes):
799	movdqa	%xmm0, -126(%edx)
800L(aligned_16_110bytes):
801	movdqa	%xmm0, -110(%edx)
802L(aligned_16_94bytes):
803	movdqa	%xmm0, -94(%edx)
804L(aligned_16_78bytes):
805	movdqa	%xmm0, -78(%edx)
806L(aligned_16_62bytes):
807	movdqa	%xmm0, -62(%edx)
808L(aligned_16_46bytes):
809	movdqa	%xmm0, -46(%edx)
810L(aligned_16_30bytes):
811	movdqa	%xmm0, -30(%edx)
812L(aligned_16_14bytes):
813	movq	%xmm0, -14(%edx)
814	movl	%eax, -6(%edx)
815	movw	%ax, -2(%edx)
816	SETRTNVAL
817	RETURN
818
819	ALIGN(4)
820L(aligned_16_127bytes):
821	movdqa	%xmm0, -127(%edx)
822L(aligned_16_111bytes):
823	movdqa	%xmm0, -111(%edx)
824L(aligned_16_95bytes):
825	movdqa	%xmm0, -95(%edx)
826L(aligned_16_79bytes):
827	movdqa	%xmm0, -79(%edx)
828L(aligned_16_63bytes):
829	movdqa	%xmm0, -63(%edx)
830L(aligned_16_47bytes):
831	movdqa	%xmm0, -47(%edx)
832L(aligned_16_31bytes):
833	movdqa	%xmm0, -31(%edx)
834L(aligned_16_15bytes):
835	movq	%xmm0, -15(%edx)
836	movl	%eax, -7(%edx)
837	movw	%ax, -3(%edx)
838	movb	%al, -1(%edx)
839	SETRTNVAL
840	RETURN_END
841
842END(memset)
843