/* read_rgba_span_x86.S revision ea3885812704645944752887d892c38a46710956 */
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * IA-32 (32-bit), AT&T syntax.  All routines use the cdecl calling
 * convention: arguments on the stack, %eax/%ecx/%edx volatile,
 * %ebx/%esi/%edi/%ebp callee-saved.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
	.section .rodata
	.align	16
	.type	mask, @object
	.size	mask, 32

/* Byte-swap masks for the BGRA8888_REV -> RGBA conversion.  Viewed as
 * little-endian dwords, a source pixel is 0xAARRGGBB and the destination
 * wants 0xAABBGGRR.  mask (0xff00ff00) selects the G and A bytes, which
 * stay in place; mask+16 (0x00ff0000) selects the single byte (R or B)
 * that is being moved by a 16-bit shift.
 */
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 *
 * Both macros convert one BGRA8888_REV pixel at (%ebx) to RGBA at (%ecx).
 * DO_ONE_PIXEL also advances both pointers; DO_ONE_LAST_PIXEL does not
 * (it is only used when no pixels follow).  Both clobber %eax.
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax		/* ARGB -> BGRA */ ; \
	rorl	$8, %eax	/* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)	/* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax		/* ARGB -> BGRA */ ; \
	rorl	$8, %eax	/* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)	/* ABGR -> R, G, B, A */


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * C prototype: void f(const void *src, void *dst, int n_pixels)
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

/* NOTE: a .text directive is required here.  Without it the assembler is
 * still in .rodata (from the mask table above) and the MMX and SSE
 * routines get emitted into a non-executable section.
 */
	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* 0xff00ff00: G/A stay in place */
	movq	mask+16, %mm2		/* 0x00ff0000: byte being swapped */

	movl	8(%esp), %ebx		/* source pointer */
	movl	16(%esp), %edx		/* number of pixels to copy */
	movl	12(%esp), %ecx		/* destination pointer */

	testl	%edx, %edx
	je	.L20			/* Bail if there's nothing to do. */

	/* If the source is not 8-byte aligned, convert one pixel the slow
	 * way so that the movq loads in the loop are aligned.
	 */
	movl	%ebx, %eax
	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax			/* %eax = pairs of pixels remaining */
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3		/* isolate R bytes ... */
	psllq	$16, %mm4		/* ... move B up 16 bits ... */
	psrlq	$16, %mm3		/* ... move R down 16 bits ... */
	pand	%mm2, %mm4		/* ... isolate moved B bytes */

	pand	%mm1, %mm0		/* keep G and A in place */
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * C prototype: void f(const void *src, void *dst, int n_pixels)
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* 0xff00ff00: G/A stay in place */
	movq	mask+16, %mm2		/* 0x00ff0000: byte being swapped */

	movl	16(%esp), %ebx		/* source pointer */
	movl	24(%esp), %edx		/* number of pixels to copy */
	movl	20(%esp), %ecx		/* destination pointer */

	/* Carve a 16-byte aligned, 16-byte scratch buffer out of the stack
	 * for the SSE -> MMX hand-off below; %ebp remembers the old %esp.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	/* %esi = number of leading pixels to convert one at a time so that
	 * the source pointer becomes 16-byte aligned (clamped to the pixel
	 * count); %edx = pixels left after that.
	 */
	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax		/* %eax = groups of 4 pixels remaining */
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0		/* aligned 16-byte fetch */
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp		/* release the scratch buffer */

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * C prototype: void f(const void *src, void *dst, int n_pixels)
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1		/* 0xff00ff00: G/A stay in place */
	movdqa	mask+16, %xmm2		/* 0x00ff0000: byte being swapped */

	movl	12(%esp), %ebx		/* source pointer */
	movl	20(%esp), %edx		/* number of pixels to copy */
	movl	16(%esp), %ecx		/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.  %esi = number of such pixels,
	 * clamped to the total pixel count.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0		/* keep G and A in place */

	andps	%xmm2, %xmm3		/* isolate R bytes ... */
	pslldq	$2, %xmm4		/* ... move B up 2 bytes ... */
	psrldq	$2, %xmm3		/* ... move R down 2 bytes ... */
	andps	%xmm2, %xmm4		/* ... isolate moved B bytes */

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax		/* %eax = groups of 4 pixels remaining */
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx		/* FIX: advance source past the pair so a
					 * trailing 3rd pixel reads the right data */

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx		/* FIX: advance destination as well */
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



	.section .rodata

	.align	16

/* Per-word masks that isolate the R, G, and B fields of an RGB565 pixel
 * replicated across an MMX register.
 */
mask_565:
	.word	0xf800
	.word	0x07e0
	.word	0x001f
	.word	0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
	.word	0x0001
	.word	0x0010
	.word	0x0200
	.word	0x0000

scale:
	.word	0x20e8		/* (0x00ff0000 / 0x000007c0) + 1 */
	.word	0x40c5		/* (0x00ff0000 / 0x000003f0) + 1 */
	.word	0x839d		/* (0x00ff0000 / 0x000001f0) + 1 */
	.word	0x0000
#elif SCALE_ADJUST == 0
prescale:
	.word	0x0001
	.word	0x0020
	.word	0x0800
	.word	0x0000

scale:
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0104		/* (0x00ff0000 / 0x0000fc00) + 1 */
	.word	0x0108		/* (0x00ff0000 / 0x0000f800) + 1 */
	.word	0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


/* OR-ed in before packing so the packed alpha byte is always 0xff. */
alpha:	.long	0x00000000
	.long	0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * C prototype: void f(const void *src, void *dst, int n_pixels)
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax		/* source pointer */
	movl	8(%esp), %edx		/* destination pointer */
	movl	12(%esp), %ecx		/* number of pixels to copy */

	movq	mask_565, %mm5
	movq	prescale, %mm6
	movq	scale, %mm7

	shrl	$2, %ecx		/* %ecx = groups of 4 pixels */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	alpha, %mm0
	por	alpha, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	/* Repeat for the third and fourth pixels of the group. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	alpha, %mm0
	por	alpha, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	/* FIX: movzwl is the valid AT&T mnemonic; "movzxw" is rejected by
	 * modern versions of GNU as.
	 */
	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	alpha, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */