mmx_32.c revision e7a40d268ec2afab7e0596667cabd2ae53fec8d8
#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/asm.h>
#include <asm/i387.h>

/*
 * MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump:
 *		"leal ebx,[ebx]" is apparently best for K6-2, but Cyrix??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;	/* len/64 */

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
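
/*
 * Side note on the fixup constants used above and below: the .fixup code
 * patches the first prefetch into a short jump if it ever faults (some
 * early CPUs could fault on prefetch).  The 16-bit word is stored
 * little-endian, so 0x1AEB becomes the bytes EB 1A, i.e. "jmp +26", which
 * skips the rest of the 28-byte prefetch block; 0x05EB likewise skips the
 * single 7-byte "prefetch 320(%0)".  A minimal stand-alone sketch (not
 * built here; decode_short_jmp() is only an illustrative name) that
 * decodes the two words:
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static void decode_short_jmp(uint16_t word)
{
	uint8_t opcode = word & 0xff;	/* low byte: 0xEB is "jmp rel8" */
	uint8_t disp = word >> 8;	/* high byte: bytes skipped past the jmp */

	printf("opcode %#x, skips %u bytes\n", opcode, disp);
}

int main(void)
{
	decode_short_jmp(0x1AEB);	/* 2-byte jmp + 26 skipped = 28-byte prefetch set */
	decode_short_jmp(0x05EB);	/* 2-byte jmp + 5 skipped = 7-byte prefetch */
	return 0;
}
#endif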

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache-bypassing load/store. The Cyrix III, K6 and
 * other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to make
	 * the stores ordered again:
	 */
	__asm__ __volatile__ (
		"  sfence\n" : :
	);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	for (i = 0; i < (4096 - 320) / 64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096 - 320) / 64; i < 4096 / 64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to make
	 * the stores ordered again:
	 */
	__asm__ __volatile__ (
		"  sfence\n" : :
	);

	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7 specific streaming
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */
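
/*
 * For reference, the movntq + sfence pattern used in the K7 paths above can
 * be expressed in plain C with SSE intrinsics.  A minimal user-space sketch,
 * assuming a GCC-style compiler with SSE enabled (stream_clear_page() is
 * only an illustrative name, not a kernel interface, and is not built here):
 */
#if 0
#include <xmmintrin.h>

static void stream_clear_page(void *page)
{
	__m64 zero = _mm_setzero_si64();
	char *p = page;
	int i;

	for (i = 0; i < 4096 / 8; i++, p += 8)
		_mm_stream_pi((__m64 *)p, zero);	/* movntq: non-temporal store */

	_mm_sfence();	/* order the weakly-ordered streaming stores */
	_mm_empty();	/* emms: leave MMX state when done with the registers */
}
#endif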

/*
 * Favour MMX for page clear and copy:
 */

static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"  cld\n\t"
		"  rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			: "a" (0), "1" (page), "0" (1024)
			: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"  cld\n\t"
		"  rep ; movsl"

			: "=&c" (d0), "=&D" (d1), "=&S" (d2)
			: "0" (1024), "1" ((long) to), "2" ((long) from)
			: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);
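
/*
 * Usage sketch: with CONFIG_X86_USE_3DNOW, the page-copy/clear wrappers in
 * asm/page_32.h are expected to route to the exported helpers above,
 * roughly like this (illustrative only, not part of this file):
 */
#if 0
static inline void clear_page(void *page)
{
	mmx_clear_page(page);
}

static inline void copy_page(void *to, void *from)
{
	mmx_copy_page(to, from);
}
#endif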