#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/asm.h>
#include <asm/i387.h>

/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump:
 *		leal ebx,[ebx] is apparently best for K6-2, but what about Cyrix?
 *	We also want to clobber the filler register so we don't get any
 *		register-forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far with any MMX scheme tried.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */

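/*
 * _mmx_memcpy(): copy "len" bytes from "from" to "to", moving 64 bytes
 * per loop iteration through the MMX registers.  The tail (len % 64)
 * goes through __memcpy(), as does the whole copy when we are in
 * interrupt context and kernel_fpu_begin() cannot be used.
 */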
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

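	/*
	 * Prefetch the first 320 bytes of the source.  Should the 3DNow!
	 * prefetch instruction fault (it can on some CPUs), the fixup at
	 * label 3 rewrites the first prefetch into a short jump
	 * (0xEB 0x1A, i.e. "jmp +26") over the remaining 26 bytes of the
	 * 28-byte prefetch block, and the copy carries on without
	 * prefetching.
	 */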
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b,3b)
		: : "r" (from) );

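	/*
	 * Main loop: copy one 64-byte chunk per iteration through
	 * %mm0-%mm3, prefetching 320 bytes ahead of the current read
	 * position.  The loop stops once at most five chunks (320 bytes)
	 * remain, so the look-ahead prefetch stays within the source; the
	 * rest is copied by the prefetch-free loop below.
	 */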
	for(; i>5; i--)
	{
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b,3b)
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}

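	/* Copy the remaining (up to five) 64-byte chunks without prefetching. */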
	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 *	Now do the tail of the block
	 */
	__memcpy(to, from, len&63);
	kernel_fpu_end();
	return p;
}

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache-bypass load/store.  The Cyrix III, K6 and
 *	other MMX-capable processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

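	/*
	 * Clear 64 bytes per iteration with non-temporal stores: movntq
	 * writes %mm0 straight to memory, bypassing the caches, so zeroing
	 * a page does not evict potentially useful cache lines.
	 */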
	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page+=64;
	}
	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to make
	 * the stores ordered (globally visible) again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/* maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b,3b)
		: : "r" (from) );

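	/*
	 * Bulk of the page: plain movq loads paired with non-temporal
	 * movntq stores, prefetching 320 bytes ahead.  The last 320 bytes
	 * are copied by the second loop without the prefetch, so we never
	 * prefetch beyond the end of the source page.
	 */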
	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b,3b)
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to make
	 * the stores ordered (globally visible) again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}

#else

/*
 *	Generic MMX implementation without the K7-specific streaming stores
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

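	/*
	 * Clear 128 bytes per iteration with ordinary movq stores; these
	 * CPUs have no non-temporal store, so the data goes through the
	 * cache as usual.
	 */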
	for(i=0;i<4096/128;i++)
	{
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page+=128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b,3b)
		: : "r" (from) );

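	/*
	 * Copy the whole page 64 bytes at a time, prefetching 320 bytes
	 * ahead.  There is no separate tail loop here, so the final
	 * iterations prefetch past the end of the source page; the fixup
	 * below patches the prefetch out should that ever fault.
	 */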
	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b,3b)
		: : "r" (from), "r" (to) : "memory");
		from+=64;
		to+=64;
	}
	kernel_fpu_end();
}

#endif

/*
 *	Favour MMX for page clear and copy; fall back to plain string
 *	operations when we cannot use the FPU (i.e. in interrupt context).
 */

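/*
 * Plain "rep stosl" / "rep movsl" fallbacks (1024 longs = one 4K page),
 * used when we are in interrupt context and cannot touch the FPU/MMX
 * state.
 */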
static void slow_zero_page(void *page)
{
	int d0, d1;
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);

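/*
 * Note: these helpers are declared in <asm/mmx.h>; kernels configured to
 * use the 3DNow! page operations typically wire clear_page()/copy_page()
 * up to mmx_clear_page()/mmx_copy_page().
 */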