#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer (AMD K8) yet, but there are
 * likely no advantages to be gotten from x86-64 here anyway.
 */

#include <asm/i387.h>

#ifdef CONFIG_X86_32
/*
 * Force the loop increment to be an immediate ("i") to reduce register
 * pressure; 32-bit x86 has few general-purpose registers to spare.
 */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
/* On 64-bit, a register or a sign-extended 32-bit immediate ("re") is fine. */
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

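/*
 * Assembly building blocks.  A "line" is 16 bytes (one XMM register):
 *
 *	OFFS(x)		byte offset of the x-th 16-byte line
 *	PF_OFFS(x)	the same offset, 256 bytes (one loop iteration) ahead
 *	PF0..PF4	non-temporal prefetch (prefetchnta) from p1..p5
 *	LD/ST(x, y)	load/store line x of p1 from/to register %xmm<y>
 *	XO1..XO4(x, y)	XOR line x of p2..p5 into %xmm<y>
 *	NOP(x)		expands to nothing, to skip the prefetch slot in BLK64
 */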
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

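/*
 * BLK64 emits one prefetch plus four 16-byte ops, i.e. it covers one
 * 64-byte chunk per invocation; the "prefetch64-sse" variants below are
 * built entirely from it.
 */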
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)

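/*
 * Each xor_sse_N() loop iteration XORs 256 bytes (16 XMM lines), hence
 * lines = bytes >> 8 and the %[inc] pointer increment of 256.  The
 * staggered indentation inside BLOCK() mirrors the intended interleaving
 * of loads, XORs, stores and prefetches.  movaps requires the buffers to
 * be 16-byte aligned; kernel_fpu_begin()/kernel_fpu_end() make the XMM
 * registers safe to use in kernel context.
 */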
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)					\
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

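/*
 * The _pf64 variants issue exactly one prefetchnta per 64-byte chunk
 * (via BLK64) instead of interleaving prefetches throughout the block;
 * on some CPUs this sparser prefetch pattern benchmarks faster.
 */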
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       add %[inc], %[p1]       ;\n"
	"       add %[inc], %[p2]       ;\n"
	"       add %[inc], %[p3]       ;\n"
	"       add %[inc], %[p4]       ;\n"
	"       add %[inc], %[p5]       ;\n"
	"       dec %[cnt]              ;\n"
	"       jnz 1b                  ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

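/*
 * Template describing the prefetch64-sse implementation; the generic
 * XOR code benchmarks the registered templates at boot and picks the
 * fastest one (see calibrate_xor_blocks() in crypto/xor.c).
 */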
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

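/*
 * Let the AVX implementation override the benchmark winner when AVX is
 * usable; AVX_SELECT() comes from <asm/xor_avx.h>, pulled in via
 * xor_32.h/xor_64.h above.
 */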
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */