/* t_vertex_sse.c revision 18a74321aa825c355392f98f1563a971871794cc */
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

/* Runtime generation of an x86/SSE vertex-emit function for the TNL
 * clipspace vertex layout.  The first half of this file (below) is a
 * minimal x86 instruction encoder: operand descriptors, byte emitters,
 * and one wrapper function per instruction actually used by the
 * generated code.
 */

#include "glheader.h"
#include "context.h"
#include "colormac.h"
#include "t_context.h"
#include "t_vertex.h"
#include "simple_list.h"
#include "enums.h"

#if defined(USE_X86_ASM)

/* Component selectors used with the shufps/pshufd immediate byte. */
#define X    0
#define Y    1
#define Z    2
#define W    3

/* Nonzero enables the (very crude) emit-time tracing in disassem(). */
#define DISASSEM 0

/* Operand descriptor: either a plain register (mod == mod_REG) or a
 * register-relative memory reference with an optional displacement.
 */
struct x86_reg {
   GLuint file:3;
   GLuint idx:3;
   GLuint mod:2;		/* mod_REG if this is just a register */
   GLint  disp:24;		/* only +/- 23bits of offset - should be enough... */
};

/* State carried through one code-generation run. */
struct x86_program {
   GLcontext *ctx;		/* context we are generating code for */

   GLubyte *store;		/* start of the output code buffer */
   GLubyte *csr;		/* current write position in store */

   GLuint stack_offset;		/* bytes pushed so far; see make_fn_arg() */

   GLboolean inputs_safe;	/* may we over-read source arrays? */
   GLboolean outputs_safe;	/* may we over-write the output vertex? */
   GLboolean have_sse2;
   GLboolean need_emms;		/* MMX regs touched; must emit emms on exit */

   struct x86_reg identity;	/* XMM reg preloaded with vtx->identity */
   struct x86_reg chan0;	/* XMM reg preloaded with vtx->chan_scale */

};


/* First byte of every two-byte (0x0f-prefixed) opcode. */
#define X86_TWOB 0x0f

/* There are more but these are all we'll use:
 */
enum x86_reg_file {
   file_REG32,
   file_MMX,
   file_XMM
};

/* Values for mod field of modr/m byte
 */
enum x86_reg_mod {
   mod_INDIRECT,
   mod_DISP8,
   mod_DISP32,
   mod_REG
};

enum x86_reg_name {
   reg_AX,
   reg_CX,
   reg_DX,
   reg_BX,
   reg_SP,
   reg_BP,
   reg_SI,
   reg_DI
};


/* Condition codes, in x86 encoding order (added to the jcc opcode base). */
enum x86_cc {
   cc_O,			/* overflow */
   cc_NO,			/* not overflow */
   cc_NAE,			/* not above or equal / carry */
   cc_AE,			/* above or equal / not carry */
   cc_E,			/* equal / zero */
   cc_NE			/* not equal / not zero */
};

#define cc_Z  cc_E
#define cc_NZ cc_NE


/* Create and manipulate registers and regmem values:
 */
static struct x86_reg make_reg( GLuint file,
				GLuint idx )
{
   struct x86_reg reg;

   reg.file = file;
   reg.idx = idx;
   reg.mod = mod_REG;
   reg.disp = 0;

   return reg;
}

/* Turn a register into a memory reference at reg+disp (or add disp to
 * an existing memory reference), picking the smallest mod encoding
 * that can hold the resulting displacement.
 */
static struct x86_reg make_disp( struct x86_reg reg,
				 GLint disp )
{
   assert(reg.file == file_REG32);

   if (reg.mod == mod_REG)
      reg.disp = disp;
   else
      reg.disp += disp;

   if (reg.disp == 0)
      reg.mod = mod_INDIRECT;
   else if (reg.disp <= 127 && reg.disp >= -128)
      reg.mod = mod_DISP8;
   else
      reg.mod = mod_DISP32;

   return reg;
}

/* Memory reference [reg] with no displacement. */
static struct x86_reg deref( struct x86_reg reg )
{
   return make_disp(reg, 0);
}

/* Recover the plain register underlying a memory reference. */
static struct x86_reg get_base_reg( struct x86_reg reg )
{
   return make_reg( reg.file, reg.idx );
}


/* Retreive a reference to one of the function arguments, taking into
 * account any push/pop activity:
 */
static struct x86_reg make_fn_arg( struct x86_program *p,
				   GLuint arg )
{
   /* Arguments are addressed relative to ESP; stack_offset compensates
    * for pushes made since function entry.  arg is 1-based so arg*4
    * also skips the return address.
    */
   return make_disp(make_reg(file_REG32, reg_SP),
		    p->stack_offset + arg * 4);	/* ??? */
}

static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}


/* Emit bytes to the instruction stream:
 */
static void emit_1b( struct x86_program *p, GLbyte b0 )
{
   *(GLbyte *)(p->csr++) = b0;
}

static void emit_1i( struct x86_program *p, GLint i0 )
{
   /* 32-bit immediate/displacement; the store may be unaligned, which
    * x86 permits.
    */
   *(GLint *)(p->csr) = i0;
   p->csr += 4;
}

/* Print the emitting function's name once per run of identical opcodes
 * (compiled out unless DISASSEM is set).
 */
static void disassem( struct x86_program *p, const char *fn )
{
#if DISASSEM
   static const char *last_fn;
   if (fn && fn != last_fn) {
      _mesa_printf("0x%x: %s\n", p->csr, fn);
      last_fn = fn;
   }
#endif
}

static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
}

static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
   *(p->csr++) = b1;
}

static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn )
{
   disassem(p, fn);
   *(p->csr++) = b0;
   *(p->csr++) = b1;
   *(p->csr++) = b2;
}

/* Wrappers pass the caller's name through for the DISASSEM trace. */
#define emit_1ub(p, b0)         emit_1ub_fn(p, b0, __FUNCTION__)
#define emit_2ub(p, b0, b1)     emit_2ub_fn(p, b0, b1, __FUNCTION__)
#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)


/* Labels, jumps and fixup:
 */
static GLubyte *get_label( struct x86_program *p )
{
   return p->csr;
}

/* Conditional jump to a known (backward) label: uses the short rel8
 * form when the target is in range, otherwise the two-byte rel32 form.
 */
static void x86_jcc( struct x86_program *p,
		     GLuint cc,
		     GLubyte *label )
{
   GLint offset = label - (get_label(p) + 2);

   if (offset <= 127 && offset >= -128) {
      emit_1ub(p, 0x70 + cc);
      emit_1b(p, (GLbyte) offset);
   }
   else {
      /* Offset is relative to the end of the 6-byte long form. */
      offset = label - (get_label(p) + 6);
      emit_2ub(p, 0x0f, 0x80 + cc);
      emit_1i(p, offset);
   }
}

/* Always use a 32bit offset for forward jumps:
 */
static GLubyte *x86_jcc_forward( struct x86_program *p,
				 GLuint cc )
{
   emit_2ub(p, 0x0f, 0x80 + cc);
   emit_1i(p, 0);		/* placeholder, patched by do_fixup() */
   return get_label(p);
}

/* Fixup offset from forward jump:
 */
static void do_fixup( struct x86_program *p,
		      GLubyte *fixup )
{
   *(int *)(fixup - 4) = get_label(p) - fixup;
}

/* push reg; tracked so make_fn_arg() can still locate arguments. */
static void x86_push( struct x86_program *p,
		      struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x50 + reg.idx);
   p->stack_offset += 4;
}

static void x86_pop( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x58 + reg.idx);
   p->stack_offset -= 4;
}

static void x86_inc( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x40 + reg.idx);
}

static void x86_dec( struct x86_program *p,
		     struct x86_reg reg )
{
   assert(reg.mod == mod_REG);
   emit_1ub(p, 0x48 + reg.idx);
}

static void x86_ret( struct x86_program *p )
{
   emit_1ub(p, 0xc3);
}

/* Leave MMX state so subsequent FPU code works; only legal when MMX
 * instructions were actually emitted.
 */
static void mmx_emms( struct x86_program *p )
{
   assert(p->need_emms);
   emit_2ub(p, 0x0f, 0x77);
   p->need_emms = 0;
}




/* Build a modRM byte + possible displacement.  No treatment of SIB
 * indexing.  BZZT - no way to encode an absolute address.
 */
static void emit_modrm( struct x86_program *p,
			struct x86_reg reg,
			struct x86_reg regmem )
{
   GLubyte val = 0;

   assert(reg.mod == mod_REG);

   val |= regmem.mod << 6;	/* mod field */
   val |= reg.idx << 3;		/* reg field */
   val |= regmem.idx;		/* r/m field */

   emit_1ub_fn(p, val, 0);

   /* Oh-oh we've stumbled into the SIB thing.
    */
   if (regmem.idx == reg_SP) {
      emit_1ub_fn(p, 0x24, 0);	/* simplistic! */
   }

   switch (regmem.mod) {
   case mod_REG:
   case mod_INDIRECT:
      break;
   case mod_DISP8:
      emit_1b(p, regmem.disp);
      break;
   case mod_DISP32:
      emit_1i(p, regmem.disp);
      break;
   default:
      _mesa_printf("unknown regmem.mod %d\n", regmem.mod);
      abort();
      break;
   }
}

/* Many x86 instructions have two opcodes to cope with the situations
 * where the destination is a register or memory reference
 * respectively.  This function selects the correct opcode based on
 * the arguments presented.
 */
static void emit_op_modrm( struct x86_program *p,
			   GLubyte op_dst_is_reg,
			   GLubyte op_dst_is_mem,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   switch (dst.mod) {
   case mod_REG:
      emit_1ub_fn(p, op_dst_is_reg, 0);
      emit_modrm(p, dst, src);
      break;
   case mod_INDIRECT:
   case mod_DISP32:
   case mod_DISP8:
      assert(src.mod == mod_REG);
      emit_1ub_fn(p, op_dst_is_mem, 0);
      emit_modrm(p, src, dst);
      break;
   default:
      _mesa_printf("unknown dst.mod %d\n", dst.mod);
      abort();
      break;
   }
}

static void x86_mov( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x8b, 0x89, dst, src );
}

static void x86_xor( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x33, 0x31, dst, src );
}

static void x86_cmp( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_op_modrm( p, 0x3b, 0x39, dst, src );
}

static void sse2_movd( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   assert(p->have_sse2);
   emit_2ub(p, 0x66, X86_TWOB);
   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
}

static void mmx_movd( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   p->need_emms = 1;
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
}

static void mmx_movq( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   p->need_emms = 1;
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x6f, 0x7f, dst, src );
}


static void sse_movss( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, 0xF3, X86_TWOB);
   emit_op_modrm( p, 0x10, 0x11, dst, src );
}

static void sse_movaps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x28, 0x29, dst, src );
}

static void sse_movups( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x10, 0x11, dst, src );
}

/* movhps: one operand must be memory (reg-reg is movlhps). */
static void sse_movhps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   assert(dst.mod != mod_REG || src.mod != mod_REG);
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
}

/* movlps: one operand must be memory (reg-reg is movhlps). */
static void sse_movlps( struct x86_program *p,
			struct x86_reg dst,
			struct x86_reg src )
{
   assert(dst.mod != mod_REG || src.mod != mod_REG);
   emit_1ub(p, X86_TWOB);
   emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
}

/* SSE operations often only have one format, with dest constrained to
 * be a register:
 */
static void sse_mulps( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, X86_TWOB, 0x59);
   emit_modrm( p, dst, src );
}

static void sse_addps( struct x86_program *p,
		       struct x86_reg dst,
		       struct x86_reg src )
{
   emit_2ub(p, X86_TWOB, 0x58);
   emit_modrm( p, dst, src );
}

static void sse_movhlps( struct x86_program *p,
			 struct x86_reg dst,
			 struct x86_reg src )
{
   assert(dst.mod == mod_REG && src.mod == mod_REG);
   emit_2ub(p, X86_TWOB, 0x12);
   emit_modrm( p, dst, src );
}

static void sse_movlhps( struct x86_program *p,
			 struct x86_reg dst,
			 struct x86_reg src )
{
   assert(dst.mod == mod_REG && src.mod == mod_REG);
   emit_2ub(p, X86_TWOB, 0x16);
   emit_modrm( p, dst, src );
}

static void sse2_cvtps2dq( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x5B);
   emit_modrm( p, dst, src );
}

static void sse2_packssdw( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x6B);
   emit_modrm( p, dst, src );
}

static void sse2_packsswb( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x63);
   emit_modrm( p, dst, src );
}

static void sse2_packuswb( struct x86_program *p,
			   struct x86_reg dst,
			   struct x86_reg src )
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x67);
   emit_modrm( p, dst, src );
}

/* cvtps2pi writes an MMX register, so the generated code will need an
 * emms before returning to FPU code.
 */
static void sse_cvtps2pi( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_XMM || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x2d);
   emit_modrm( p, dst, src );
}

static void mmx_packssdw( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_MMX || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x6b);
   emit_modrm( p, dst, src );
}

static void mmx_packuswb( struct x86_program *p,
			  struct x86_reg dst,
			  struct x86_reg src )
{
   assert(dst.file == file_MMX &&
	  (src.file == file_MMX || src.mod != mod_REG));

   p->need_emms = 1;

   emit_2ub(p, X86_TWOB, 0x67);
   emit_modrm( p, dst, src );
}


/* Load effective address:
 */
static void x86_lea( struct x86_program *p,
		     struct x86_reg dst,
		     struct x86_reg src )
{
   emit_1ub(p, 0x8d);
   emit_modrm( p, dst, src );
}

static void x86_test( struct x86_program *p,
		      struct x86_reg dst,
		      struct x86_reg src )
{
   emit_1ub(p, 0x85);
   emit_modrm( p, dst, src );
}




/**
 * Perform a reduced swizzle:
 */
static void sse2_pshufd( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0,
			 GLubyte x,
			 GLubyte y,
			 GLubyte z,
			 GLubyte w)
{
   assert(p->have_sse2);
   emit_3ub(p, 0x66, X86_TWOB, 0x70);
   emit_modrm(p, dest, arg0);
   emit_1ub(p, (x|(y<<2)|(z<<4)|w<<6));
}


/* Shufps can also be used to implement a reduced swizzle when dest ==
 * arg0.
 */
static void sse_shufps( struct x86_program *p,
			struct x86_reg dest,
			struct x86_reg arg0,
			GLubyte x,
			GLubyte y,
			GLubyte z,
			GLubyte w)
{
   emit_2ub(p, X86_TWOB, 0xC6);
   emit_modrm(p, dest, arg0);
   emit_1ub(p, (x|(y<<2)|(z<<4)|w<<6));
}


/* emit_loadNf_M: load an N-component float vector into dest from an
 * M-component source at arg0, filling missing components from the
 * identity vector where required.
 */
static void emit_load4f_4( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movups(p, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p, dest, make_disp(arg0, 8));
   sse_shufps(p, dest, get_identity(p), X,Y,Z,W );
   sse_shufps(p, dest, dest, Y,Z,X,W );
   sse_movlps(p, dest, arg0);
}

static void emit_load4f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(p, dest, get_identity(p));
   sse_movlps(p, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(p, dest, arg0);
   sse_shufps(p, dest, get_identity(p), X,Y,Z,W );
}



static void emit_load3f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(p, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(p, dest, make_disp(arg0, 8));
      sse_shufps(p, dest, dest, X,X,X,X);
      sse_movlps(p, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(p, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_1(p, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movss(p, dest, arg0);
}

/* Dispatch table indexed by [dest size - 1][source size - 1].  Source
 * sizes larger than the destination reuse the destination-sized
 * loader (extra source components are simply ignored).
 */
static void (*load[4][4])( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};

/* Emit code to load sz floats into dest from a src_sz-float source.
 * sz and src_sz must be in [1,4].
 */
static void emit_load( struct x86_program *p,
		       struct x86_reg dest,
		       GLuint sz,
		       struct x86_reg src,
		       GLuint src_sz)
{
   if (DISASSEM)
      _mesa_printf("load %d/%d\n", sz, src_sz);
   load[sz-1][src_sz-1](p, dest, src);
}

static void emit_store4f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movups(p, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(p, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(p, dest, arg0);
      sse_shufps(p, arg0, arg0, Z, Z, Z, Z ); /* NOTE! destructive */
      sse_movss(p, make_disp(dest,8), arg0);
   }
}

static void emit_store2f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movlps(p, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movss(p, dest, arg0);
}


/* Store dispatch table indexed by [size - 1]. */
static void (*store[4])( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

/* Emit code to store sz floats from XMM reg temp to memory at dest.
 * NOTE: the 3f case may clobber temp (see emit_store3f).
 */
static void emit_store( struct x86_program *p,
			struct x86_reg dest,
			GLuint sz,
			struct x86_reg temp )

{
   if (DISASSEM)
      _mesa_printf("store %d\n", sz);
   store[sz-1](p, dest, temp);
}

/* Emit code converting 4 floats in temp to 4 unsigned bytes at dest.
 * p->chan0 holds vtx->chan_scale, preloaded in build_vertex_emit().
 * Clobbers temp (and mmx0/mmx1 on the non-SSE2 path).
 */
static void emit_pack_store_4ub( struct x86_program *p,
				 struct x86_reg dest,
				 struct x86_reg temp )
{
   /* Scale by 255.0
    */
   sse_mulps(p, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(p, temp, temp);
      sse2_packssdw(p, temp, temp);
      sse2_packuswb(p, temp, temp);
      sse_movss(p, dest, temp);
   }
   else {
      struct x86_reg mmx0 = make_reg(file_MMX, 0);
      struct x86_reg mmx1 = make_reg(file_MMX, 1);
      sse_cvtps2pi(p, mmx0, temp);
      sse_movhlps(p, temp, temp);
      sse_cvtps2pi(p, mmx1, temp);
      mmx_packssdw(p, mmx0, mmx1);
      mmx_packuswb(p, mmx0, mmx0);
      mmx_movd(p, dest, mmx0);
   }
}

/* Byte offset of b relative to a (used for struct-member offsets). */
static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(p, srcREG, ptr_to_src);
}

/* Advance and write back a[j].inputptr by the attribute's stride.
 * Zero-stride attributes are left untouched.
 */
static void update_src_ptr( struct x86_program *p,
			    struct x86_reg srcREG,
			    struct x86_reg vtxREG,
			    struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(p, srcREG, make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(p, ptr_to_src, srcREG);
   }
}


/* Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 * Generates the vertex-emit fastpath for the current attribute layout
 * into p->store and installs it as vtx->emit.  Returns GL_FALSE when
 * a format combination can't be handled (caller then disables
 * codegen for this state).
 *
 * NOTE(review): p->store is a fixed 1024-byte buffer (allocated in
 * _tnl_generate_sse_emit) and nothing checks p->csr against its end -
 * confirm 1024 bytes is enough for the worst-case attribute list.
 */
static GLboolean build_vertex_emit( struct x86_program *p )
{
   GLcontext *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   struct x86_reg vertexEAX = make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = make_reg(file_REG32, reg_SI);
   struct x86_reg temp = make_reg(file_XMM, 0);
   struct x86_reg vp0 = make_reg(file_XMM, 1);
   struct x86_reg vp1 = make_reg(file_XMM, 2);
   GLubyte *fixup, *label;

   p->csr = p->store;

   /* Push a few regs?
    */
/*    x86_push(p, srcECX); */
   x86_push(p, countEBP);
   x86_push(p, vtxESI);


   /* Get vertex count, compare to zero
    */
   x86_xor(p, srcECX, srcECX);
   x86_mov(p, countEBP, make_fn_arg(p, 2));
   x86_cmp(p, countEBP, srcECX);
   fixup = x86_jcc_forward(p, cc_E);

   /* Initialize destination register.
    */
   x86_mov(p, vertexEAX, make_fn_arg(p, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(p, vtxESI, make_fn_arg(p, 1));
   x86_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(p, vp0, make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(p, p->chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(p, p->identity, make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = get_label(p);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 1, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 1, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F:
	 /* Potentially the worst case - hardcode 2+1 copying:
	  */
	 if (0) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 3, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 2, temp);
	    if (a->inputsize > 2) {
	       emit_load(p, temp, 1, make_disp(srcECX, 8), 1);
	       emit_store(p, make_disp(dest,8), 1, temp);
	    }
	    else {
	       sse_movss(p, make_disp(dest,8), get_identity(p));
	    }
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 break;
      case EMIT_4F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_mulps(p, temp, vp0);
	 sse_addps(p, temp, vp1);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_XYW:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, X, Y, W, Z);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;

      case EMIT_1UB_1F:
	 /* Test for PAD3 + 1UB:
	  */
	 if (j > 0 &&
	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
	 {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 1, deref(srcECX), a->inputsize);
	    sse_shufps(p, temp, temp, X, X, X, X);
	    emit_pack_store_4ub(p, make_disp(dest, -3), temp); /* overkill! */
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
	    return GL_FALSE;
	 }
	 break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
	 /* Test for 3UB + PAD1:
	  */
	 if (j == vtx->attr_count - 1 ||
	     a[1].vertoffset >= a->vertoffset + 4) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(p, temp, temp, Z, Y, X, W);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 /* Test for 3UB + 1UB:
	  */
	 else if (j < vtx->attr_count - 1 &&
		  a[1].format == EMIT_1UB_1F &&
		  a[1].vertoffset == a->vertoffset + 3) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, deref(srcECX), a->inputsize);
	    update_src_ptr(p, srcECX, vtxESI, a);

	    /* Make room for incoming value:
	     */
	    sse_shufps(p, temp, temp, W, X, Y, Z);

	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
	    emit_load(p, temp, 1, deref(srcECX), a[1].inputsize);
	    update_src_ptr(p, srcECX, vtxESI, &a[1]);

	    /* Rearrange and possibly do BGR conversion:
	     */
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(p, temp, temp, W, Z, Y, X);
	    else
	       sse_shufps(p, temp, temp, Y, Z, W, X);

	    emit_pack_store_4ub(p, dest, temp);
	    j++;		/* NOTE: two attrs consumed */
	 }
	 else {
	    _mesa_printf("Can't emit 3ub\n");
	 }
	 /* NOTE(review): this unconditional failure return also covers
	  * the two successful 3UB paths above, deliberately disabling
	  * them (see comment); the following break is unreachable.
	  */
	 return GL_FALSE;	/* add this later */
	 break;

      case EMIT_4UB_4F_RGBA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_BGRA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, Z, Y, X, W);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ARGB:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, W, X, Y, Z);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ABGR:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	 sse_shufps(p, temp, temp, W, Z, Y, X);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4CHAN_4F_RGBA:
	 /* CHAN_TYPE is a compile-time constant, so only one of these
	  * arms survives in any given build:
	  */
	 switch (CHAN_TYPE) {
	 case GL_UNSIGNED_BYTE:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_FLOAT:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, deref(srcECX), a->inputsize);
	    emit_store(p, dest, 4, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_UNSIGNED_SHORT:
	 default:
	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
	    return GL_FALSE;
	 }
	 break;
      default:
	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
	 return GL_FALSE;	/* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(p, vertexEAX, make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(p, countEBP);
   x86_test(p, countEBP, countEBP);
   x86_jcc(p, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->need_emms)
      mmx_emms(p);

   /* Land forward jump here:
    */
   do_fixup(p, fixup);

   /* Pop regs and return
    */
   x86_pop(p, get_base_reg(vtxESI));
   x86_pop(p, countEBP);
/*    x86_pop(p, srcECX); */
   x86_ret(p);

   vtx->emit = (tnl_emit_func)p->store;
   return GL_TRUE;
}

#include "x86/common_x86_asm.h"


/* Entry point: attempt to codegen an SSE fastpath for the current
 * vertex layout.  Requires SSE (cpu_has_xmm); registers success or
 * failure so an impossible state is not retried, and frees the code
 * buffer on failure (on success it lives on as vtx->emit).
 */
void _tnl_generate_sse_emit( GLcontext *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }

   memset(&p, 0, sizeof(p));
   p.ctx = ctx;
   p.store = MALLOC(1024);
   /* NOTE(review): MALLOC result is not checked before use. */

   p.inputs_safe = 0;		/* for now */
   p.outputs_safe = 1;		/* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = make_reg(file_XMM, 6);
   p.chan0 = make_reg(file_XMM, 7);

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
      if (DISASSEM)
	 /* NOTE(review): pointers printed with %x - format mismatch
	  * on platforms where pointers are wider than int.
	  */
	 _mesa_printf("disassemble 0x%x 0x%x\n", p.store, p.csr);
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      FREE(p.store);
   }

   /* Silence unused-function warnings for emitters kept for future use: */
   (void)sse2_movd;
   (void)x86_inc;
   (void)x86_xor;
   (void)mmx_movq;
   (void)sse_movlhps;
   (void)sse_movhps;
   (void)sse_movaps;
   (void)sse2_packsswb;
   (void)sse2_pshufd;
}

#else

void _tnl_generate_sse_emit( GLcontext *ctx )
{
   /* Dummy version for when USE_SSE_ASM not defined */
}

#endif