/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

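/* SSE2-optimized memcpy for 32-bit x86.  Arguments follow the cdecl
   convention: dest, src and len are read from the stack.  Copies of at
   most 128 bytes are handled with pairs of possibly overlapping unaligned
   16-byte moves; larger copies run a 64-byte-per-iteration loop with
   aligned stores; copies of at least half the shared cache size take a
   non-temporal path to avoid evicting useful cache lines.  */
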
#include "cache.h"

#ifndef MEMCPY
# define MEMCPY	memcpy
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Return address and saved EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx
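/* From here on: %ecx = len, %eax = src, %edx = dest.  */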

	cmp	%eax, %edx
	je	L(return)

	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
	jae     L(large_page)

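/* 17..128 bytes: copy the head and the tail of the buffer with unaligned
   16-byte moves that may overlap in the middle; each cmpl below checks
   whether the bytes written so far already cover the whole length.  */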
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl    $32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl    $64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl    $128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* More than 128 bytes: the main loop.  Align the destination address
   to 64 bytes so the stores in the loop can use movdqa.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	addl	%edx, %ecx
	andl	$-64, %ecx

	subl	%edx, %eax
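/* After this setup %ebx is the first 64-byte-aligned destination
   address past %edx, %ecx is the 64-byte-aligned end of the destination,
   and %eax holds src - dest, so (%ebx, %eax) always addresses the source
   bytes corresponding to the destination block at %ebx.  */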

/* Stop two iterations before the aligned end so that the prefetch
   below never reads past the end of the source buffer.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

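/* Final iterations, copied without prefetching so no access can go
   past the end of the source.  */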
L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	movdqa	%xmm4, 64(%ebx)
	movdqa	%xmm5, 80(%ebx)
	movdqa	%xmm6, 96(%ebx)
	movdqa	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	jmp	L(return)

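/* Very large copies (at least half the shared cache): copy the first
   and last 128 bytes with ordinary unaligned stores, then stream the
   128-byte-aligned middle with non-temporal stores so the copy does not
   displace the existing cache contents.  */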
L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Main loop with non-temporal stores.  As above, %ebx/%ecx become the
   128-byte-aligned destination bounds and %eax holds src - dest.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
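/* movntdq stores are weakly ordered; the sfence makes them globally
   visible before memcpy returns.  */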
	sfence
	jmp	L(return)

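/* 0..16 bytes: dispatch on the length bits.  Bit 3 or 4 set means
   8..16 bytes (two possibly overlapping 8-byte moves); bit 2 set means
   4..7 bytes (two possibly overlapping 4-byte moves); otherwise copy up
   to 3 bytes as one byte plus an overlapping 2-byte move.  */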
L(len_0_16_bytes):
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)
	jmp	L(return)

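/* memcpy returns the destination pointer.  */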
L(return):
	movl	%edx, %eax
	RETURN

END (MEMCPY)