/* read_rgba_span_x86.S revision 7d39c1ae76cc7dc6793980fd83db100399ee9179 */
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * All routines here are 32-bit x86, cdecl, AT&T syntax:
 * arg0 = source pointer, arg1 = destination pointer, arg2 = pixel count
 * (offsets 4/8/12 from %esp at entry).
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions (see
 * LOAD_MASK below).  The original .rodata constants were:
	.section .rodata
	.align 16
	.type mask, @object
	.size mask, 32
mask:
	.long 0xff00ff00
	.long 0xff00ff00
	.long 0xff00ff00
	.long 0xff00ff00
	.long 0x00ff0000
	.long 0x00ff0000
	.long 0x00ff0000
	.long 0x00ff0000
 */

/* Push the two byte-swizzle mask constants onto the stack, load them into
 * registers m1 and m2 with the given load instruction (`mvins' is movq for
 * MMX, movdqa/movdqu for SSE2 -- only the low 8 or all 16 of the pushed
 * bytes are consumed, depending on the register width), then release the
 * temporary stack space.
 *
 *   m1 <- 0xff00ff00 in every dword (the G and A byte lanes, which stay put)
 *   m2 <- 0x00ff0000 in every dword (the lane used to swap the R and B bytes)
 */
#define LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00	;\
	pushl	$0xff00ff00	;\
	pushl	$0xff00ff00	;\
	pushl	$0xff00ff00	;\
	mvins	(%esp), m1	;\
	pushl	$0x00ff0000	;\
	pushl	$0x00ff0000	;\
	pushl	$0x00ff0000	;\
	pushl	$0x00ff0000	;\
	mvins	(%esp), m2	;\
	addl	$32, %esp


/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 *
 * Both convert a single BGRA8888_REV pixel at (%ebx) to RGBA at (%ecx)
 * with a byte swap plus rotate; DO_ONE_PIXEL also advances both pointers
 * by 4 bytes, while DO_ONE_LAST_PIXEL (for the final pixel) does not.
 * Both clobber %eax.
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax		/* ARGB -> BGRA */ ; \
	rorl	$8, %eax	/* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)	/* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax		/* ARGB -> BGRA */ ; \
	rorl	$8, %eax	/* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)	/* ABGR -> R, G, B, A */ ; \


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions:
 *	movq	mask, %mm1
 *	movq	mask+16, %mm2
 */
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	/* If the source pointer is not 8-byte aligned (i.e. src % 8 == 4),
	 * convert one pixel the slow way so the movq fetches below are
	 * aligned.  (-src >> 2) & 1 is 1 exactly in that case.
	 */
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Main loop: two pixels (8 bytes) per pass; %eax = remaining / 2.
	 *
	 * Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 * mm0 keeps the G/A bytes in place (0xff00ff00 mask); mm3 and mm4
	 * shift the 0x00ff0000 lane right/left by 16 bits to exchange the
	 * R and B bytes of each pixel; OR-ing the three recombines them.
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
191 */ 192 193.globl _generic_read_RGBA_span_BGRA8888_REV_SSE 194.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE 195 .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function 196_generic_read_RGBA_span_BGRA8888_REV_SSE: 197 pushl %esi 198 pushl %ebx 199 pushl %ebp 200 201#ifdef USE_INNER_EMMS 202 emms 203#endif 204/* Kevin F. Quinn 2nd July 2006 205 * Replace data segment constants with text-segment instructions 206 movq mask, %mm1 207 movq mask+16, %mm2 208 */ 209 LOAD_MASK(movq,%mm1,%mm2) 210 211 movl 16(%esp), %ebx /* source pointer */ 212 movl 24(%esp), %edx /* number of pixels to copy */ 213 movl 20(%esp), %ecx /* destination pointer */ 214 215 testl %edx, %edx 216 jle .L35 /* Bail if there's nothing to do. */ 217 218 movl %esp, %ebp 219 subl $16, %esp 220 andl $0xfffffff0, %esp 221 222 movl %ebx, %eax 223 movl %edx, %esi 224 225 negl %eax 226 andl $15, %eax 227 sarl $2, %eax 228 cmpl %edx, %eax 229 cmovle %eax, %esi 230 231 subl %esi, %edx 232 233 testl $1, %esi 234 je .L32 235 236 DO_ONE_PIXEL() 237.L32: 238 239 testl $2, %esi 240 je .L31 241 242 movq (%ebx), %mm0 243 addl $8, %ebx 244 245 movq %mm0, %mm3 246 movq %mm0, %mm4 247 248 pand %mm2, %mm3 249 psllq $16, %mm4 250 psrlq $16, %mm3 251 pand %mm2, %mm4 252 253 pand %mm1, %mm0 254 por %mm4, %mm3 255 por %mm3, %mm0 256 257 movq %mm0, (%ecx) 258 addl $8, %ecx 259.L31: 260 261 movl %edx, %eax 262 shrl $2, %eax 263 jmp .L33 264.L34: 265 movaps (%ebx), %xmm0 266 addl $16, %ebx 267 268 /* This would be so much better if we could just move directly from 269 * an SSE register to an MMX register. Unfortunately, that 270 * functionality wasn't introduced until SSE2 with the MOVDQ2Q 271 * instruction. 
272 */ 273 274 movaps %xmm0, (%esp) 275 movq (%esp), %mm0 276 movq 8(%esp), %mm5 277 278 movq %mm0, %mm3 279 movq %mm0, %mm4 280 movq %mm5, %mm6 281 movq %mm5, %mm7 282 283 pand %mm2, %mm3 284 pand %mm2, %mm6 285 286 psllq $16, %mm4 287 psllq $16, %mm7 288 289 psrlq $16, %mm3 290 psrlq $16, %mm6 291 292 pand %mm2, %mm4 293 pand %mm2, %mm7 294 295 pand %mm1, %mm0 296 pand %mm1, %mm5 297 298 por %mm4, %mm3 299 por %mm7, %mm6 300 301 por %mm3, %mm0 302 por %mm6, %mm5 303 304 movq %mm0, (%ecx) 305 movq %mm5, 8(%ecx) 306 addl $16, %ecx 307 308 subl $1, %eax 309.L33: 310 jne .L34 311 312#ifdef USE_INNER_EMMS 313 emms 314#endif 315 movl %ebp, %esp 316 317 /* At this point there are either [0, 3] pixels remaining to be 318 * converted. 319 */ 320 321 testl $2, %edx 322 je .L36 323 324 movq (%ebx), %mm0 325 addl $8, %ebx 326 327 movq %mm0, %mm3 328 movq %mm0, %mm4 329 330 pand %mm2, %mm3 331 psllq $16, %mm4 332 psrlq $16, %mm3 333 pand %mm2, %mm4 334 335 pand %mm1, %mm0 336 por %mm4, %mm3 337 por %mm3, %mm0 338 339 movq %mm0, (%ecx) 340 addl $8, %ecx 341.L36: 342 343 testl $1, %edx 344 je .L35 345 346 DO_ONE_LAST_PIXEL() 347.L35: 348 popl %ebp 349 popl %ebx 350 popl %esi 351 ret 352 .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE 353 354 355/** 356 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. 357 */ 358 359 .text 360.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 361.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2 362 .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function 363_generic_read_RGBA_span_BGRA8888_REV_SSE2: 364 pushl %esi 365 pushl %ebx 366 367/* Kevin F. 
Quinn 2nd July 2006 368 * Replace data segment constants with text-segment instructions 369 movdqa mask, %xmm1 370 movdqa mask+16, %xmm2 371 */ 372 LOAD_MASK(movdqa,%xmm1,%xmm2) 373 374 movl 12(%esp), %ebx /* source pointer */ 375 movl 20(%esp), %edx /* number of pixels to copy */ 376 movl 16(%esp), %ecx /* destination pointer */ 377 378 movl %ebx, %eax 379 movl %edx, %esi 380 381 testl %edx, %edx 382 jle .L46 /* Bail if there's nothing to do. */ 383 384 /* If the source pointer isn't a multiple of 16 we have to process 385 * a few pixels the "slow" way to get the address aligned for 386 * the SSE fetch intsructions. 387 */ 388 389 negl %eax 390 andl $15, %eax 391 sarl $2, %eax 392 393 cmpl %edx, %eax 394 cmovbe %eax, %esi 395 subl %esi, %edx 396 397 testl $1, %esi 398 je .L41 399 400 DO_ONE_PIXEL() 401.L41: 402 testl $2, %esi 403 je .L40 404 405 movq (%ebx), %xmm0 406 addl $8, %ebx 407 408 movdqa %xmm0, %xmm3 409 movdqa %xmm0, %xmm4 410 andps %xmm1, %xmm0 411 412 andps %xmm2, %xmm3 413 pslldq $2, %xmm4 414 psrldq $2, %xmm3 415 andps %xmm2, %xmm4 416 417 orps %xmm4, %xmm3 418 orps %xmm3, %xmm0 419 420 movq %xmm0, (%ecx) 421 addl $8, %ecx 422.L40: 423 424 /* Would it be worth having a specialized version of this loop for 425 * the case where the destination is 16-byte aligned? That version 426 * would be identical except that it could use movedqa instead of 427 * movdqu. 428 */ 429 430 movl %edx, %eax 431 shrl $2, %eax 432 jmp .L42 433.L43: 434 movdqa (%ebx), %xmm0 435 addl $16, %ebx 436 437 movdqa %xmm0, %xmm3 438 movdqa %xmm0, %xmm4 439 andps %xmm1, %xmm0 440 441 andps %xmm2, %xmm3 442 pslldq $2, %xmm4 443 psrldq $2, %xmm3 444 andps %xmm2, %xmm4 445 446 orps %xmm4, %xmm3 447 orps %xmm3, %xmm0 448 449 movdqu %xmm0, (%ecx) 450 addl $16, %ecx 451 subl $1, %eax 452.L42: 453 jne .L43 454 455 456 /* There may be upto 3 pixels remaining to be copied. Take care 457 * of them now. We do the 2 pixel case first because the data 458 * will be aligned. 
459 */ 460 461 testl $2, %edx 462 je .L47 463 464 movq (%ebx), %xmm0 465 466 movdqa %xmm0, %xmm3 467 movdqa %xmm0, %xmm4 468 andps %xmm1, %xmm0 469 470 andps %xmm2, %xmm3 471 pslldq $2, %xmm4 472 psrldq $2, %xmm3 473 andps %xmm2, %xmm4 474 475 orps %xmm4, %xmm3 476 orps %xmm3, %xmm0 477 478 movq %xmm0, (%ecx) 479.L47: 480 481 testl $1, %edx 482 je .L46 483 484 DO_ONE_LAST_PIXEL() 485.L46: 486 487 popl %ebx 488 popl %esi 489 ret 490 .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 491 492 493 494/* Kevin F. Quinn 2nd July 2006 495 * Replace data segment constants with text-segment instructions 496 */ 497#if 0 498 .section .rodata 499 500 .align 16 501mask_565: 502 .word 0xf800 503 .word 0x07e0 504 .word 0x001f 505 .word 0x0000 506 507/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C 508 * implementation in Mesa. Setting SCALE_ADJUST to 0 is slightly faster but 509 * at a small cost to accuracy. 510 */ 511 512#define SCALE_ADJUST 5 513#if SCALE_ADJUST == 5 514prescale: 515 .word 0x0001 516 .word 0x0010 517 .word 0x0200 518 .word 0x0000 519 520scale: 521 .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */ 522 .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */ 523 .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */ 524 .word 0x0000 525#elif SCALE_ADJUST == 0 526prescale: 527 .word 0x0001 528 .word 0x0020 529 .word 0x0800 530 .word 0x0000 531 532scale: 533 .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */ 534 .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */ 535 .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */ 536 .word 0x0000 537#else 538#error SCALE_ADJUST must either be 5 or 0. 
#endif


alpha:	.long	0x00000000
	.long	0x00ff0000
#endif

/* Immediate-operand versions of the mask_565 / prescale / scale / alpha
 * constants disabled above; they are pushed onto the stack and loaded from
 * there (see the function below).
 */
#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L	0x00100001
#define PRESCALE_H	0x00000200
#define SCALE_L		0x40C620E8
#define SCALE_H		0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L	0x00200001
#define PRESCALE_H	0x00000800
#define SCALE_L		0x01040108
#define SCALE_H		0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L		0x00000000
#define ALPHA_H		0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl _generic_read_RGBA_span_RGB565_MMX
	.hidden _generic_read_RGBA_span_RGB565_MMX
	.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions:
 *	movq	mask_565, %mm5
 *	movq	prescale, %mm6
 *	movq	scale, %mm7
 */
	/* Build the four 64-bit constants on the stack and load them.
	 * FIX: the constants must be '$'-prefixed immediates; without the
	 * '$', AT&T syntax treats each one as an absolute memory operand
	 * (a load from address 0x1f and so on).
	 */
	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5	/* mm5 = per-component field masks */
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6	/* mm6 = pre-scale multipliers */
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7	/* mm7 = scale multipliers */
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3	/* mm3 = constant alpha */
	addl	$32,%esp

	/* FIX: test the count *before* dividing it by four.  The old
	 * `sarl $2; jle' sequence bailed out whenever count < 4, skipping
	 * the 1-3 leftover pixels that the code after the loop exists to
	 * handle.
	 */
	testl	%ecx, %ecx
	jle	.L01		/* Bail if there's nothing to do. */
	sarl	$2, %ecx	/* %ecx = number of 4-pixel groups;  */
	jmp	.L02		/* ZF is set here when there is none. */

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx


	/* Third and fourth pixels of the group: same recipe. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx	/* reload the original pixel count */
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx	/* FIX: standard AT&T mnemonic (was the
				 * non-standard movzxw, rejected by some
				 * assemblers). */
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif