1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "cache.h"
18
/* Name of the exported routine.  Only set here when the including
   translation unit has not already chosen one.  */
#if !defined(MEMSET)
# define MEMSET	android_memset16
#endif

/* Spell a file-local label.  The .L prefix keeps it assembler-local.  */
#if !defined(L)
# define L(label)	.L##label
#endif

/* Align the location counter to 2^n bytes.  */
#if !defined(ALIGN)
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the CFI directives that bracket a function.  */
#if !defined(cfi_startproc)
# define cfi_startproc	.cfi_startproc
#endif

#if !defined(cfi_endproc)
# define cfi_endproc	.cfi_endproc
#endif

/* Open a global function: export the symbol, mark it as a function,
   align it to 16 bytes, emit its label and start the CFI region.  */
#if !defined(ENTRY)
# define ENTRY(name)	\
	.globl name;	\
	.type name, @function;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/* Close a function opened with ENTRY: end the CFI region and record
   the symbol size as the distance from the label to here.  */
#if !defined(END)
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

/* One jump-table slot: the offset of target I relative to table base B.  */
#define JMPTBL(I, B)	I - B
55
/* Dispatch through a table of 32-bit, table-relative offsets.
   TABLE is the table's label, INDEX is a 64-bit register that holds
   the entry number, and SCALE is the multiplier applied to INDEX
   (4 for .int entries).  The selected offset is sign-extended, added
   to the table address and jumped to.
   Clobbers: %r11 (table address) and INDEX (becomes the target).  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	leaq   TABLE(%rip), %r11;	\
	movslq (%r11, INDEX, SCALE), INDEX;	\
	leaq   (%r11, INDEX), INDEX;	\
	jmp    *INDEX
64
65	.section .text.sse2,"ax",@progbits
66	ALIGN (4)
67ENTRY (MEMSET)	// Address in rdi
68	shr    $1, %rdx			// Count in rdx
69	movzwl %si, %ecx
70	/* Fill the whole ECX with pattern.  */
71	shl    $16, %esi
72	or     %esi, %ecx		// Pattern in ecx
73
74	cmp    $32, %rdx
75	jae    L(32wordsormore)
76
77L(write_less32words):
78	lea    (%rdi, %rdx, 2), %rdi
79	BRANCH_TO_JMPTBL_ENTRY (L(table_less32words), %rdx, 4)
80
81	.pushsection .rodata.sse2,"a",@progbits
82	ALIGN (2)
83L(table_less32words):
84	.int	JMPTBL (L(write_0words), L(table_less32words))
85	.int	JMPTBL (L(write_1words), L(table_less32words))
86	.int	JMPTBL (L(write_2words), L(table_less32words))
87	.int	JMPTBL (L(write_3words), L(table_less32words))
88	.int	JMPTBL (L(write_4words), L(table_less32words))
89	.int	JMPTBL (L(write_5words), L(table_less32words))
90	.int	JMPTBL (L(write_6words), L(table_less32words))
91	.int	JMPTBL (L(write_7words), L(table_less32words))
92	.int	JMPTBL (L(write_8words), L(table_less32words))
93	.int	JMPTBL (L(write_9words), L(table_less32words))
94	.int	JMPTBL (L(write_10words), L(table_less32words))
95	.int	JMPTBL (L(write_11words), L(table_less32words))
96	.int	JMPTBL (L(write_12words), L(table_less32words))
97	.int	JMPTBL (L(write_13words), L(table_less32words))
98	.int	JMPTBL (L(write_14words), L(table_less32words))
99	.int	JMPTBL (L(write_15words), L(table_less32words))
100	.int	JMPTBL (L(write_16words), L(table_less32words))
101	.int	JMPTBL (L(write_17words), L(table_less32words))
102	.int	JMPTBL (L(write_18words), L(table_less32words))
103	.int	JMPTBL (L(write_19words), L(table_less32words))
104	.int	JMPTBL (L(write_20words), L(table_less32words))
105	.int	JMPTBL (L(write_21words), L(table_less32words))
106	.int	JMPTBL (L(write_22words), L(table_less32words))
107	.int	JMPTBL (L(write_23words), L(table_less32words))
108	.int	JMPTBL (L(write_24words), L(table_less32words))
109	.int	JMPTBL (L(write_25words), L(table_less32words))
110	.int	JMPTBL (L(write_26words), L(table_less32words))
111	.int	JMPTBL (L(write_27words), L(table_less32words))
112	.int	JMPTBL (L(write_28words), L(table_less32words))
113	.int	JMPTBL (L(write_29words), L(table_less32words))
114	.int	JMPTBL (L(write_30words), L(table_less32words))
115	.int	JMPTBL (L(write_31words), L(table_less32words))
116	.popsection
117
118	ALIGN (4)
119L(write_28words):
120	movl   %ecx, -56(%rdi)
121	movl   %ecx, -52(%rdi)
122L(write_24words):
123	movl   %ecx, -48(%rdi)
124	movl   %ecx, -44(%rdi)
125L(write_20words):
126	movl   %ecx, -40(%rdi)
127	movl   %ecx, -36(%rdi)
128L(write_16words):
129	movl   %ecx, -32(%rdi)
130	movl   %ecx, -28(%rdi)
131L(write_12words):
132	movl   %ecx, -24(%rdi)
133	movl   %ecx, -20(%rdi)
134L(write_8words):
135	movl   %ecx, -16(%rdi)
136	movl   %ecx, -12(%rdi)
137L(write_4words):
138	movl   %ecx, -8(%rdi)
139	movl   %ecx, -4(%rdi)
140L(write_0words):
141	ret
142
143	ALIGN (4)
144L(write_29words):
145	movl   %ecx, -58(%rdi)
146	movl   %ecx, -54(%rdi)
147L(write_25words):
148	movl   %ecx, -50(%rdi)
149	movl   %ecx, -46(%rdi)
150L(write_21words):
151	movl   %ecx, -42(%rdi)
152	movl   %ecx, -38(%rdi)
153L(write_17words):
154	movl   %ecx, -34(%rdi)
155	movl   %ecx, -30(%rdi)
156L(write_13words):
157	movl   %ecx, -26(%rdi)
158	movl   %ecx, -22(%rdi)
159L(write_9words):
160	movl   %ecx, -18(%rdi)
161	movl   %ecx, -14(%rdi)
162L(write_5words):
163	movl   %ecx, -10(%rdi)
164	movl   %ecx, -6(%rdi)
165L(write_1words):
166	mov	%cx, -2(%rdi)
167	ret
168
169	ALIGN (4)
170L(write_30words):
171	movl   %ecx, -60(%rdi)
172	movl   %ecx, -56(%rdi)
173L(write_26words):
174	movl   %ecx, -52(%rdi)
175	movl   %ecx, -48(%rdi)
176L(write_22words):
177	movl   %ecx, -44(%rdi)
178	movl   %ecx, -40(%rdi)
179L(write_18words):
180	movl   %ecx, -36(%rdi)
181	movl   %ecx, -32(%rdi)
182L(write_14words):
183	movl   %ecx, -28(%rdi)
184	movl   %ecx, -24(%rdi)
185L(write_10words):
186	movl   %ecx, -20(%rdi)
187	movl   %ecx, -16(%rdi)
188L(write_6words):
189	movl   %ecx, -12(%rdi)
190	movl   %ecx, -8(%rdi)
191L(write_2words):
192	movl   %ecx, -4(%rdi)
193	ret
194
195	ALIGN (4)
196L(write_31words):
197	movl   %ecx, -62(%rdi)
198	movl   %ecx, -58(%rdi)
199L(write_27words):
200	movl   %ecx, -54(%rdi)
201	movl   %ecx, -50(%rdi)
202L(write_23words):
203	movl   %ecx, -46(%rdi)
204	movl   %ecx, -42(%rdi)
205L(write_19words):
206	movl   %ecx, -38(%rdi)
207	movl   %ecx, -34(%rdi)
208L(write_15words):
209	movl   %ecx, -30(%rdi)
210	movl   %ecx, -26(%rdi)
211L(write_11words):
212	movl   %ecx, -22(%rdi)
213	movl   %ecx, -18(%rdi)
214L(write_7words):
215	movl   %ecx, -14(%rdi)
216	movl   %ecx, -10(%rdi)
217L(write_3words):
218	movl   %ecx, -6(%rdi)
219	movw   %cx, -2(%rdi)
220	ret
221
222	ALIGN (4)
223L(32wordsormore):
224	shl    $1, %rdx
225	test   $0x01, %edi
226	jz     L(aligned2bytes)
227	mov    %ecx, (%rdi)
228	mov    %ecx, -4(%rdi, %rdx)
229	sub    $2, %rdx
230	add    $1, %rdi
231	rol    $8, %ecx
232L(aligned2bytes):
233	/* Fill xmm0 with the pattern.  */
234	movd   %ecx, %xmm0
235	pshufd $0, %xmm0, %xmm0
236
237	testl  $0xf, %edi
238	jz     L(aligned_16)
239/* RDX > 32 and RDI is not 16 byte aligned.  */
240	movdqu %xmm0, (%rdi)
241	mov    %rdi, %rsi
242	and    $-16, %rdi
243	add    $16, %rdi
244	sub    %rdi, %rsi
245	add    %rsi, %rdx
246
247	ALIGN (4)
248L(aligned_16):
249	cmp    $128, %rdx
250	jge    L(128bytesormore)
251
252L(aligned_16_less128bytes):
253	add    %rdx, %rdi
254	shr    $1, %rdx
255	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
256
257	ALIGN (4)
258L(128bytesormore):
259	cmp    $SHARED_CACHE_SIZE, %rdx
260	jg     L(128bytesormore_nt)
261
262L(128bytesormore_normal):
263	sub    $128, %rdx
264	movdqa %xmm0, (%rdi)
265	movdqa %xmm0, 0x10(%rdi)
266	movdqa %xmm0, 0x20(%rdi)
267	movdqa %xmm0, 0x30(%rdi)
268	movdqa %xmm0, 0x40(%rdi)
269	movdqa %xmm0, 0x50(%rdi)
270	movdqa %xmm0, 0x60(%rdi)
271	movdqa %xmm0, 0x70(%rdi)
272	lea    128(%rdi), %rdi
273	cmp    $128, %rdx
274	jl     L(128bytesless_normal)
275
276	sub    $128, %rdx
277	movdqa %xmm0, (%rdi)
278	movdqa %xmm0, 0x10(%rdi)
279	movdqa %xmm0, 0x20(%rdi)
280	movdqa %xmm0, 0x30(%rdi)
281	movdqa %xmm0, 0x40(%rdi)
282	movdqa %xmm0, 0x50(%rdi)
283	movdqa %xmm0, 0x60(%rdi)
284	movdqa %xmm0, 0x70(%rdi)
285	lea    128(%rdi), %rdi
286	cmp    $128, %rdx
287	jl     L(128bytesless_normal)
288
289	sub    $128, %rdx
290	movdqa %xmm0, (%rdi)
291	movdqa %xmm0, 0x10(%rdi)
292	movdqa %xmm0, 0x20(%rdi)
293	movdqa %xmm0, 0x30(%rdi)
294	movdqa %xmm0, 0x40(%rdi)
295	movdqa %xmm0, 0x50(%rdi)
296	movdqa %xmm0, 0x60(%rdi)
297	movdqa %xmm0, 0x70(%rdi)
298	lea    128(%rdi), %rdi
299	cmp    $128, %rdx
300	jl     L(128bytesless_normal)
301
302	sub    $128, %rdx
303	movdqa %xmm0, (%rdi)
304	movdqa %xmm0, 0x10(%rdi)
305	movdqa %xmm0, 0x20(%rdi)
306	movdqa %xmm0, 0x30(%rdi)
307	movdqa %xmm0, 0x40(%rdi)
308	movdqa %xmm0, 0x50(%rdi)
309	movdqa %xmm0, 0x60(%rdi)
310	movdqa %xmm0, 0x70(%rdi)
311	lea    128(%rdi), %rdi
312	cmp    $128, %rdx
313	jge    L(128bytesormore_normal)
314
315L(128bytesless_normal):
316	add    %rdx, %rdi
317	shr    $1, %rdx
318	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
319
320	ALIGN (4)
321L(128bytesormore_nt):
322	sub    $128, %rdx
323	movntdq %xmm0, (%rdi)
324	movntdq %xmm0, 0x10(%rdi)
325	movntdq %xmm0, 0x20(%rdi)
326	movntdq %xmm0, 0x30(%rdi)
327	movntdq %xmm0, 0x40(%rdi)
328	movntdq %xmm0, 0x50(%rdi)
329	movntdq %xmm0, 0x60(%rdi)
330	movntdq %xmm0, 0x70(%rdi)
331	lea    128(%rdi), %rdi
332	cmp    $128, %rdx
333	jge    L(128bytesormore_nt)
334
335	sfence
336	add    %rdx, %rdi
337	shr    $1, %rdx
338	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
339
340	.pushsection .rodata.sse2,"a",@progbits
341	ALIGN (2)
342L(table_16_128bytes):
343	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
344	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
345	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
346	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
347	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
348	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
349	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
350	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
351	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
352	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
353	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
354	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
355	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
356	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
357	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
358	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
359	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
360	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
361	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
362	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
363	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
364	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
365	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
366	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
367	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
368	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
369	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
370	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
371	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
372	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
373	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
374	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
375	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
376	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
377	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
378	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
379	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
380	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
381	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
382	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
383	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
384	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
385	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
386	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
387	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
388	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
389	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
390	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
391	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
392	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
393	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
394	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
395	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
396	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
397	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
398	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
399	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
400	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
401	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
402	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
403	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
404	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
405	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
406	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
407	.popsection
408
409	ALIGN (4)
410L(aligned_16_112bytes):
411	movdqa %xmm0, -112(%rdi)
412L(aligned_16_96bytes):
413	movdqa %xmm0, -96(%rdi)
414L(aligned_16_80bytes):
415	movdqa %xmm0, -80(%rdi)
416L(aligned_16_64bytes):
417	movdqa %xmm0, -64(%rdi)
418L(aligned_16_48bytes):
419	movdqa %xmm0, -48(%rdi)
420L(aligned_16_32bytes):
421	movdqa %xmm0, -32(%rdi)
422L(aligned_16_16bytes):
423	movdqa %xmm0, -16(%rdi)
424L(aligned_16_0bytes):
425	ret
426
427	ALIGN (4)
428L(aligned_16_114bytes):
429	movdqa %xmm0, -114(%rdi)
430L(aligned_16_98bytes):
431	movdqa %xmm0, -98(%rdi)
432L(aligned_16_82bytes):
433	movdqa %xmm0, -82(%rdi)
434L(aligned_16_66bytes):
435	movdqa %xmm0, -66(%rdi)
436L(aligned_16_50bytes):
437	movdqa %xmm0, -50(%rdi)
438L(aligned_16_34bytes):
439	movdqa %xmm0, -34(%rdi)
440L(aligned_16_18bytes):
441	movdqa %xmm0, -18(%rdi)
442L(aligned_16_2bytes):
443	movw   %cx, -2(%rdi)
444	ret
445
446	ALIGN (4)
447L(aligned_16_116bytes):
448	movdqa %xmm0, -116(%rdi)
449L(aligned_16_100bytes):
450	movdqa %xmm0, -100(%rdi)
451L(aligned_16_84bytes):
452	movdqa %xmm0, -84(%rdi)
453L(aligned_16_68bytes):
454	movdqa %xmm0, -68(%rdi)
455L(aligned_16_52bytes):
456	movdqa %xmm0, -52(%rdi)
457L(aligned_16_36bytes):
458	movdqa %xmm0, -36(%rdi)
459L(aligned_16_20bytes):
460	movdqa %xmm0, -20(%rdi)
461L(aligned_16_4bytes):
462	movl   %ecx, -4(%rdi)
463	ret
464
465	ALIGN (4)
466L(aligned_16_118bytes):
467	movdqa %xmm0, -118(%rdi)
468L(aligned_16_102bytes):
469	movdqa %xmm0, -102(%rdi)
470L(aligned_16_86bytes):
471	movdqa %xmm0, -86(%rdi)
472L(aligned_16_70bytes):
473	movdqa %xmm0, -70(%rdi)
474L(aligned_16_54bytes):
475	movdqa %xmm0, -54(%rdi)
476L(aligned_16_38bytes):
477	movdqa %xmm0, -38(%rdi)
478L(aligned_16_22bytes):
479	movdqa %xmm0, -22(%rdi)
480L(aligned_16_6bytes):
481	movl   %ecx, -6(%rdi)
482	movw   %cx, -2(%rdi)
483	ret
484
485	ALIGN (4)
486L(aligned_16_120bytes):
487	movdqa %xmm0, -120(%rdi)
488L(aligned_16_104bytes):
489	movdqa %xmm0, -104(%rdi)
490L(aligned_16_88bytes):
491	movdqa %xmm0, -88(%rdi)
492L(aligned_16_72bytes):
493	movdqa %xmm0, -72(%rdi)
494L(aligned_16_56bytes):
495	movdqa %xmm0, -56(%rdi)
496L(aligned_16_40bytes):
497	movdqa %xmm0, -40(%rdi)
498L(aligned_16_24bytes):
499	movdqa %xmm0, -24(%rdi)
500L(aligned_16_8bytes):
501	movq   %xmm0, -8(%rdi)
502	ret
503
504	ALIGN (4)
505L(aligned_16_122bytes):
506	movdqa %xmm0, -122(%rdi)
507L(aligned_16_106bytes):
508	movdqa %xmm0, -106(%rdi)
509L(aligned_16_90bytes):
510	movdqa %xmm0, -90(%rdi)
511L(aligned_16_74bytes):
512	movdqa %xmm0, -74(%rdi)
513L(aligned_16_58bytes):
514	movdqa %xmm0, -58(%rdi)
515L(aligned_16_42bytes):
516	movdqa %xmm0, -42(%rdi)
517L(aligned_16_26bytes):
518	movdqa %xmm0, -26(%rdi)
519L(aligned_16_10bytes):
520	movq   %xmm0, -10(%rdi)
521	movw   %cx, -2(%rdi)
522	ret
523
524	ALIGN (4)
525L(aligned_16_124bytes):
526	movdqa %xmm0, -124(%rdi)
527L(aligned_16_108bytes):
528	movdqa %xmm0, -108(%rdi)
529L(aligned_16_92bytes):
530	movdqa %xmm0, -92(%rdi)
531L(aligned_16_76bytes):
532	movdqa %xmm0, -76(%rdi)
533L(aligned_16_60bytes):
534	movdqa %xmm0, -60(%rdi)
535L(aligned_16_44bytes):
536	movdqa %xmm0, -44(%rdi)
537L(aligned_16_28bytes):
538	movdqa %xmm0, -28(%rdi)
539L(aligned_16_12bytes):
540	movq   %xmm0, -12(%rdi)
541	movl   %ecx, -4(%rdi)
542	ret
543
544	ALIGN (4)
545L(aligned_16_126bytes):
546	movdqa %xmm0, -126(%rdi)
547L(aligned_16_110bytes):
548	movdqa %xmm0, -110(%rdi)
549L(aligned_16_94bytes):
550	movdqa %xmm0, -94(%rdi)
551L(aligned_16_78bytes):
552	movdqa %xmm0, -78(%rdi)
553L(aligned_16_62bytes):
554	movdqa %xmm0, -62(%rdi)
555L(aligned_16_46bytes):
556	movdqa %xmm0, -46(%rdi)
557L(aligned_16_30bytes):
558	movdqa %xmm0, -30(%rdi)
559L(aligned_16_14bytes):
560	movq   %xmm0, -14(%rdi)
561	movl   %ecx, -6(%rdi)
562	movw   %cx, -2(%rdi)
563	ret
564
565END (MEMSET)
566