#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

/*
 * Constraint for the [inc] asm operand (the constant 256UL below).
 * 32-bit: force an immediate ("i") to reduce register pressure —
 * only a handful of GPRs are available for the p1..p5 pointers and
 * the loop counter.  64-bit: "re" lets gcc pick a register or a
 * sign-extended 32-bit immediate, whichever it prefers.
 */
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/*
 * Asm-text building blocks.  Each data macro emits one instruction on a
 * 16-byte (one XMM register) slot: slot x of buffer pN, XMM register y.
 * OFFS(x) is the byte offset of slot x; PF_OFFS(x) is the same slot one
 * 256-byte line ahead, used by the prefetchnta macros to stream data in
 * without polluting the caches.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)

/*
 * BLK64: one 64-byte chunk (slots i..i+3 in registers xmm0..xmm3),
 * prefixed by a single prefetch of the corresponding chunk one line
 * ahead — this is the "prefetch64" scheduling used by the _pf64
 * variants.  Pass NOP as pf to omit the prefetch (e.g. for stores,
 * which write back to p1 and need no extra prefetch).
 */
#define BLK64(pf, op, i)				\
		pf(i)					\
		op(i, 0)				\
			op(i + 1, 1)			\
				op(i + 2, 2)		\
					op(i + 3, 3)

/*
 * xor_sse_2 - compute p1 ^= p2 over 'bytes' bytes using SSE.
 *
 * Data is processed in 256-byte "lines" (bytes >> 8 iterations); any
 * remainder below 256 bytes is NOT handled — callers presumably pass
 * multiples of 256 (RAID/page-sized buffers).  kernel_fpu_begin/end
 * bracket the XMM register usage so it is safe in kernel context.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
/*
 * 64 bytes per BLOCK: load p1 slots i..i+3 into xmm0..3, xor in the
 * matching p2 slots, store back to p1.  Prefetches for the next line
 * (+256) are interleaved between the loads to hide memory latency.
 */
#define BLOCK(i)				\
		LD(i, 0)			\
			LD(i + 1, 1)		\
		PF1(i)				\
				PF1(i + 2)	\
				LD(i + 2, 2)	\
					LD(i + 3, 3)	\
		PF0(i + 4)			\
				PF0(i + 6)	\
		XO1(i, 0)			\
			XO1(i + 1, 1)		\
				XO1(i + 2, 2)	\
					XO1(i + 3, 3)	\
		ST(i, 0)			\
			ST(i + 1, 1)		\
				ST(i + 2, 2)	\
					ST(i + 3, 3)	\


	/* Prime the prefetcher for the first line before entering the loop. */
	PF0(0)
				PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	/* Advance both pointers one 256-byte line and loop. */
	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_2_pf64 - same operation as xor_sse_2 (p1 ^= p2), but with the
 * alternative "prefetch64" schedule: one prefetch per 64-byte chunk,
 * issued immediately before that chunk is consumed (see BLK64).
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_3 - compute p1 ^= p2 ^ p3 over 'bytes' bytes (same 256-byte
 * line structure and prefetch interleaving as xor_sse_2, with one extra
 * xorps pass and prefetch stream for p3).
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
		PF1(i)				\
				PF1(i + 2)	\
		LD(i, 0)			\
			LD(i + 1, 1)		\
				LD(i + 2, 2)	\
					LD(i + 3, 3)	\
		PF2(i)				\
				PF2(i + 2)	\
		PF0(i + 4)			\
				PF0(i + 6)	\
		XO1(i, 0)			\
			XO1(i + 1, 1)		\
				XO1(i + 2, 2)	\
					XO1(i + 3, 3)	\
		XO2(i, 0)			\
			XO2(i + 1, 1)		\
				XO2(i + 2, 2)	\
					XO2(i + 3, 3)	\
		ST(i, 0)			\
			ST(i + 1, 1)		\
				ST(i + 2, 2)	\
					ST(i + 3, 3)	\


	PF0(0)
				PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_3_pf64 - p1 ^= p2 ^ p3 with the prefetch64 schedule.
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_4 - compute p1 ^= p2 ^ p3 ^ p4 over 'bytes' bytes.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
		PF1(i)				\
				PF1(i + 2)	\
		LD(i, 0)			\
			LD(i + 1, 1)		\
				LD(i + 2, 2)	\
					LD(i + 3, 3)	\
		PF2(i)				\
				PF2(i + 2)	\
		XO1(i, 0)			\
			XO1(i + 1, 1)		\
				XO1(i + 2, 2)	\
					XO1(i + 3, 3)	\
		PF3(i)				\
				PF3(i + 2)	\
		PF0(i + 4)			\
				PF0(i + 6)	\
		XO2(i, 0)			\
			XO2(i + 1, 1)		\
				XO2(i + 2, 2)	\
					XO2(i + 3, 3)	\
		XO3(i, 0)			\
			XO3(i + 1, 1)		\
				XO3(i + 2, 2)	\
					XO3(i + 3, 3)	\
		ST(i, 0)			\
			ST(i + 1, 1)		\
				ST(i + 2, 2)	\
					ST(i + 3, 3)	\


	PF0(0)
				PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_4_pf64 - p1 ^= p2 ^ p3 ^ p4 with the prefetch64 schedule.
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_5 - compute p1 ^= p2 ^ p3 ^ p4 ^ p5 over 'bytes' bytes.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)				\
		PF1(i)				\
				PF1(i + 2)	\
		LD(i, 0)			\
			LD(i + 1, 1)		\
				LD(i + 2, 2)	\
					LD(i + 3, 3)	\
		PF2(i)				\
				PF2(i + 2)	\
		XO1(i, 0)			\
			XO1(i + 1, 1)		\
				XO1(i + 2, 2)	\
					XO1(i + 3, 3)	\
		PF3(i)				\
				PF3(i + 2)	\
		XO2(i, 0)			\
			XO2(i + 1, 1)		\
				XO2(i + 2, 2)	\
					XO2(i + 3, 3)	\
		PF4(i)				\
				PF4(i + 2)	\
		PF0(i + 4)			\
				PF0(i + 6)	\
		XO3(i, 0)			\
			XO3(i + 1, 1)		\
				XO3(i + 2, 2)	\
					XO3(i + 3, 3)	\
		XO4(i, 0)			\
			XO4(i + 1, 1)		\
				XO4(i + 2, 2)	\
					XO4(i + 3, 3)	\
		ST(i, 0)			\
			ST(i + 1, 1)		\
				ST(i + 2, 2)	\
					ST(i + 3, 3)	\


	PF0(0)
				PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/*
 * xor_sse_5_pf64 - p1 ^= p2 ^ p3 ^ p4 ^ p5 with the prefetch64 schedule.
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/* Template registering the prefetch64-sse variants with the XOR benchmark. */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

/* Keep the asm-text helper macros out of the per-arch headers below. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */