/* sse2-memmove-slm.S, revision fce861498c8c4720c6ad2475a73bb4c3e55d6948 */

/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

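/* Overview (descriptive comment added for readability).  The routine below
   implements memmove: it copies forward when the destination is below the
   source and backward when it is above, so overlapping bytes are always read
   before they are overwritten.  Small sizes use overlapping unaligned
   loads/stores, medium sizes a 64-byte loop on a 64-byte-aligned destination,
   and very large sizes non-temporal stores.  As an informal model only (not
   part of the original sources; C99, assumes <stddef.h> for size_t), the
   behaviour is equivalent to this C reference:

	void *memmove_ref(void *dst, const void *src, size_t n)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;
		if (d == s || n == 0)
			return dst;
		if (d < s) {
			for (size_t i = 0; i < n; i++)	// forward copy
				d[i] = s[i];
		} else {
			while (n--)			// backward copy
				d[n] = s[n];
		}
		return dst;
	}
*/
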
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
#ifdef USE_AS_BCOPY
	xchg	%rsi, %rdi
#endif
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64] and
	[64..128] bytes separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [16..32] bytes and return.  */
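/* Note (descriptive comment, not in the original sources): lengths in
   (16..32] are copied with two overlapping 16-byte moves, one anchored at
   the start and one at the end of the buffer; for a length of exactly 32
   they are simply adjacent.  The same head-plus-tail pattern is reused below
   for the (32..64] and (64..128] cases with 2 and 4 registers per end.  */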
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the address of the destination.  */
/* Save the first unaligned 64 bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64-byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */
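/* Note (descriptive comment, not in the original sources): from here on %rsi
   holds the constant difference src - dst, so (%r8, %rsi) addresses the
   source byte that corresponds to the aligned destination pointer %r8,
   i.e. r8 + (src - dst) = src + (r8 - dst).  */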

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)
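/* Note (descriptive comment, not in the original sources): copies of at
   least SHARED_CACHE_SIZE_HALF bytes (from cache.h) take the streaming-store
   path below, which uses movntdq to avoid displacing useful cache lines.  */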

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to %r8 in the destination has been copied.
	%rdx now holds how many bytes are left to copy.
	Compute in %r9 the matching source position.  */
	lea	(%r8, %rsi), %r9

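/* Note (descriptive comment, not in the original sources): the remaining
   0..64 bytes are dispatched by successively halving the size range
   (32, 16, 8, 4, 2) and copying one chunk anchored at %r9/%r8 and one
   anchored at the end of the buffer, so no byte is missed even when the
   two chunks overlap.  */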
L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

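/* Note (descriptive comment, not in the original sources): for lengths of at
   most 16 bytes, the dispatch below tests bits of %dl instead of comparing:
   testb $24 (bits 3 and 4) is non-zero for lengths 8..16, testb $4 for 4..7
   and testb $2 for 2..3; the fall-through case copies a single byte.  */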
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64] and
	[64..128] bytes separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [16..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of the destination. We need to save the last
	64 bytes from the source in order not to overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx
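/* Note (descriptive comment, not in the original sources): %rbx is the first
   64-byte boundary above the start of the destination; the aligned backward
   loop below copies 64-byte blocks down towards it, and the residue in front
   of it is handled via L(mm_recalc_len).  */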

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */
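/* Note (descriptive comment, not in the original sources): the two loops
   below use non-temporal stores (movntdq) for very large copies so the
   destination does not evict the working set from the caches; the sfence
   after each loop orders those weakly-ordered stores before the function
   returns.  */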

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)