t_vertex_sse.c revision dc7fc173966e314f89502473044933a099c838ae
1/* 2 * Copyright 2003 Tungsten Graphics, inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: 25 * Keith Whitwell <keithw@tungstengraphics.com> 26 */ 27 28#include "glheader.h" 29#include "context.h" 30#include "colormac.h" 31#include "t_context.h" 32#include "t_vertex.h" 33#include "simple_list.h" 34 35#include <unistd.h> 36#include <sys/types.h> 37#include <sys/stat.h> 38#include <fcntl.h> 39 40#define X 0 41#define Y 1 42#define Z 2 43#define W 3 44 45#define DISASSEM 1 46 47struct x86_reg { 48 GLuint file:3; 49 GLuint idx:3; 50 GLuint mod:2; /* mod_REG if this is just a register */ 51 GLint disp:24; /* only +/- 23bits of offset - should be enough... */ 52}; 53 54struct x86_program { 55 GLcontext *ctx; 56 57 GLubyte *store; 58 GLubyte *csr; 59 60 GLuint stack_offset; 61 62 GLboolean inputs_safe; 63 GLboolean outputs_safe; 64 65 struct x86_reg identity; 66 struct x86_reg vp0; 67 struct x86_reg vp1; 68}; 69 70 71#define X86_TWOB 0x0f 72 73/* There are more but these are all we'll use: 74 */ 75enum x86_reg_file { 76 file_REG32, 77 file_XMM 78}; 79 80/* Values for mod field of modr/m byte 81 */ 82enum x86_reg_mod { 83 mod_INDIRECT, 84 mod_DISP8, 85 mod_DISP32, 86 mod_REG 87}; 88 89enum x86_reg_name { 90 reg_AX, 91 reg_CX, 92 reg_DX, 93 reg_BX, 94 reg_SP, 95 reg_BP, 96 reg_SI, 97 reg_DI 98}; 99 100 101enum x86_cc { 102 cc_O, /* overflow */ 103 cc_NO, /* not overflow */ 104 cc_NAE, /* not above or equal / carry */ 105 cc_AE, /* above or equal / not carry */ 106 cc_E, /* equal / zero */ 107 cc_NE /* not equal / not zero */ 108}; 109 110#define cc_Z cc_E 111#define cc_NZ cc_NE 112 113 114/* Create and manipulate registers and regmem values: 115 */ 116static struct x86_reg make_reg( GLuint file, 117 GLuint idx ) 118{ 119 struct x86_reg reg; 120 121 reg.file = file; 122 reg.idx = idx; 123 reg.mod = mod_REG; 124 reg.disp = 0; 125 126 return reg; 127} 128 129static struct x86_reg make_disp( struct x86_reg reg, 130 GLint disp ) 131{ 132 assert(reg.file == file_REG32); 133 134 if (reg.mod == mod_REG) 135 reg.disp = disp; 136 else 137 reg.disp += disp; 138 139 if (reg.disp == 0) 140 reg.mod = mod_INDIRECT; 141 else if (reg.disp <= 127 && reg.disp >= -128) 142 reg.mod = mod_DISP8; 143 else 144 reg.mod = mod_DISP32; 145 146 return reg; 147} 148 149static struct x86_reg deref( struct x86_reg reg ) 150{ 151 return make_disp(reg, 0); 152} 153 154static struct x86_reg get_base_reg( struct x86_reg reg ) 155{ 156 return make_reg( reg.file, reg.idx ); 157} 158 159 160/* Retreive a reference to one of the function arguments, taking into 161 * account any push/pop activity: 162 */ 163static struct x86_reg make_fn_arg( struct x86_program *p, 164 GLuint arg ) 165{ 166 return make_disp(make_reg(file_REG32, reg_SP), 167 p->stack_offset + arg * 4); /* ??? */ 168} 169 170 171static struct x86_reg get_identity( struct x86_program *p ) 172{ 173 return p->identity; 174} 175 176static struct x86_reg get_sse_temp( struct x86_program *p ) 177{ 178 return make_reg(file_XMM, 7); /* hardwired */ 179} 180 181static void release_temp( struct x86_program *p, 182 struct x86_reg reg ) 183{ 184 assert(reg.file == file_XMM && 185 reg.idx == 7); 186} 187 188/* Emit bytes to the instruction stream: 189 */ 190static void emit_1b( struct x86_program *p, GLbyte b0 ) 191{ 192 *(GLbyte *)(p->csr++) = b0; 193} 194 195static void emit_1i( struct x86_program *p, GLint i0 ) 196{ 197 *(GLint *)(p->csr) = i0; 198 p->csr += 4; 199} 200 201static void disassem( struct x86_program *p, const char *fn ) 202{ 203#if DISASSEM 204 static const char *last_fn; 205 if (fn && fn != last_fn) { 206 _mesa_printf("0x%x: %s\n", p->csr, fn); 207 last_fn = fn; 208 } 209#endif 210} 211 212static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn ) 213{ 214 disassem(p, fn); 215 *(p->csr++) = b0; 216} 217 218static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn ) 219{ 220 disassem(p, fn); 221 *(p->csr++) = b0; 222 *(p->csr++) = b1; 223} 224 225static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn ) 226{ 227 disassem(p, fn); 228 *(p->csr++) = b0; 229 *(p->csr++) = b1; 230 *(p->csr++) = b2; 231} 232 233#define emit_1ub(p, b0) emit_1ub_fn(p, b0, __FUNCTION__) 234#define emit_2ub(p, b0, b1) emit_2ub_fn(p, b0, b1, __FUNCTION__) 235#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__) 236 237 238/* Labels, jumps and fixup: 239 */ 240static GLubyte *get_label( struct x86_program *p ) 241{ 242 return p->csr; 243} 244 245static void emit_jcc( struct x86_program *p, 246 GLuint cc, 247 GLubyte *label ) 248{ 249 GLint offset = label - (get_label(p) + 2); 250 251 if (offset <= 127 && offset >= -128) { 252 emit_1ub(p, 0x70 + cc); 253 emit_1b(p, (GLbyte) offset); 254 } 255 else { 256 offset = label - (get_label(p) + 6); 257 emit_2ub(p, 0x0f, 0x80 + cc); 258 emit_1i(p, offset); 259 } 260} 261 262/* Always use a 32bit offset for forward jumps: 263 */ 264static GLubyte *emit_jcc_forward( struct x86_program *p, 265 GLuint cc ) 266{ 267 emit_2ub(p, 0x0f, 0x80 + cc); 268 emit_1i(p, 0); 269 return get_label(p); 270} 271 272/* Fixup offset from forward jump: 273 */ 274static void do_fixup( struct x86_program *p, 275 GLubyte *fixup ) 276{ 277 *(int *)(fixup - 4) = get_label(p) - fixup; 278} 279 280static void emit_push( struct x86_program *p, 281 struct x86_reg reg ) 282{ 283 assert(reg.mod == mod_REG); 284 emit_1ub(p, 0x50 + reg.idx); 285 p->stack_offset += 4; 286} 287 288static void emit_pop( struct x86_program *p, 289 struct x86_reg reg ) 290{ 291 assert(reg.mod == mod_REG); 292 emit_1ub(p, 0x58 + reg.idx); 293 p->stack_offset -= 4; 294} 295 296static void emit_inc( struct x86_program *p, 297 struct x86_reg reg ) 298{ 299 assert(reg.mod == mod_REG); 300 emit_1ub(p, 0x40 + reg.idx); 301} 302 303static void emit_dec( struct x86_program *p, 304 struct x86_reg reg ) 305{ 306 assert(reg.mod == mod_REG); 307 emit_1ub(p, 0x48 + reg.idx); 308} 309 310static void emit_ret( struct x86_program *p ) 311{ 312 emit_1ub(p, 0xc3); 313} 314 315 316 317 318/* Build a modRM byte + possible displacement. No treatment of SIB 319 * indexing. BZZT - no way to encode an absolute address. 320 */ 321static void emit_modrm( struct x86_program *p, 322 struct x86_reg reg, 323 struct x86_reg regmem ) 324{ 325 GLubyte val = 0; 326 327 assert(reg.mod == mod_REG); 328 329 val |= regmem.mod << 6; /* mod field */ 330 val |= reg.idx << 3; /* reg field */ 331 val |= regmem.idx; /* r/m field */ 332 333 emit_1ub_fn(p, val, 0); 334 335 /* Oh-oh we've stumbled into the SIB thing. 336 */ 337 if (regmem.idx == reg_SP) { 338 emit_1ub_fn(p, 0x24, 0); /* simplistic! */ 339 } 340 341 switch (regmem.mod) { 342 case mod_REG: 343 case mod_INDIRECT: 344 break; 345 case mod_DISP8: 346 emit_1b(p, regmem.disp); 347 break; 348 case mod_DISP32: 349 emit_1i(p, regmem.disp); 350 break; 351 } 352} 353 354/* Many x86 instructions have two opcodes to cope with the situations 355 * where the destination is a register or memory reference 356 * respectively. This function selects the correct opcode based on 357 * the arguments presented. 358 */ 359static void emit_op_modrm( struct x86_program *p, 360 GLubyte op_dst_is_reg, 361 GLubyte op_dst_is_mem, 362 struct x86_reg dst, 363 struct x86_reg src ) 364{ 365 switch (dst.mod) { 366 case mod_REG: 367 emit_1ub_fn(p, op_dst_is_reg, 0); 368 emit_modrm(p, dst, src); 369 break; 370 case mod_INDIRECT: 371 case mod_DISP32: 372 case mod_DISP8: 373 assert(src.mod == mod_REG); 374 emit_1ub_fn(p, op_dst_is_mem, 0); 375 emit_modrm(p, src, dst); 376 break; 377 } 378} 379 380static void emit_mov( struct x86_program *p, 381 struct x86_reg dst, 382 struct x86_reg src ) 383{ 384 emit_op_modrm( p, 0x8b, 0x89, dst, src ); 385} 386 387static void emit_xor( struct x86_program *p, 388 struct x86_reg dst, 389 struct x86_reg src ) 390{ 391 emit_op_modrm( p, 0x33, 0x31, dst, src ); 392} 393 394static void emit_cmp( struct x86_program *p, 395 struct x86_reg dst, 396 struct x86_reg src ) 397{ 398 emit_op_modrm( p, 0x3b, 0x39, dst, src ); 399} 400 401static void emit_movlps( struct x86_program *p, 402 struct x86_reg dst, 403 struct x86_reg src ) 404{ 405 emit_1ub(p, X86_TWOB); 406 emit_op_modrm( p, 0x12, 0x13, dst, src ); 407} 408 409static void emit_movhps( struct x86_program *p, 410 struct x86_reg dst, 411 struct x86_reg src ) 412{ 413 emit_1ub(p, X86_TWOB); 414 emit_op_modrm( p, 0x16, 0x17, dst, src ); 415} 416 417static void emit_movd( struct x86_program *p, 418 struct x86_reg dst, 419 struct x86_reg src ) 420{ 421 emit_2ub(p, 0x66, X86_TWOB); 422 emit_op_modrm( p, 0x6e, 0x7e, dst, src ); 423} 424 425static void emit_movss( struct x86_program *p, 426 struct x86_reg dst, 427 struct x86_reg src ) 428{ 429 emit_2ub(p, 0xF3, X86_TWOB); 430 emit_op_modrm( p, 0x10, 0x11, dst, src ); 431} 432 433static void emit_movaps( struct x86_program *p, 434 struct x86_reg dst, 435 struct x86_reg src ) 436{ 437 emit_1ub(p, X86_TWOB); 438 emit_op_modrm( p, 0x28, 0x29, dst, src ); 439} 440 441static void emit_movups( struct x86_program *p, 442 struct x86_reg dst, 443 struct x86_reg src ) 444{ 445 emit_1ub(p, X86_TWOB); 446 emit_op_modrm( p, 0x10, 0x11, dst, src ); 447} 448 449/* SSE operations often only have one format, with dest constrained to 450 * be a register: 451 */ 452static void emit_mulps( struct x86_program *p, 453 struct x86_reg dst, 454 struct x86_reg src ) 455{ 456 emit_2ub(p, X86_TWOB, 0x59); 457 emit_modrm( p, dst, src ); 458} 459 460static void emit_addps( struct x86_program *p, 461 struct x86_reg dst, 462 struct x86_reg src ) 463{ 464 emit_2ub(p, X86_TWOB, 0x58); 465 emit_modrm( p, dst, src ); 466} 467 468static void emit_cvtps2dq( struct x86_program *p, 469 struct x86_reg dst, 470 struct x86_reg src ) 471{ 472 emit_3ub(p, 0x66, X86_TWOB, 0x5B); 473 emit_modrm( p, dst, src ); 474} 475 476static void emit_packssdw( struct x86_program *p, 477 struct x86_reg dst, 478 struct x86_reg src ) 479{ 480 emit_3ub(p, 0x66, X86_TWOB, 0x6B); 481 emit_modrm( p, dst, src ); 482} 483 484static void emit_packsswb( struct x86_program *p, 485 struct x86_reg dst, 486 struct x86_reg src ) 487{ 488 emit_3ub(p, 0x66, X86_TWOB, 0x63); 489 emit_modrm( p, dst, src ); 490} 491 492static void emit_packuswb( struct x86_program *p, 493 struct x86_reg dst, 494 struct x86_reg src ) 495{ 496 emit_3ub(p, 0x66, X86_TWOB, 0x67); 497 emit_modrm( p, dst, src ); 498} 499 500/* Load effective address: 501 */ 502static void emit_lea( struct x86_program *p, 503 struct x86_reg dst, 504 struct x86_reg src ) 505{ 506 emit_1ub(p, 0x8d); 507 emit_modrm( p, dst, src ); 508} 509 510static void emit_add_imm( struct x86_program *p, 511 struct x86_reg dst, 512 struct x86_reg src, 513 GLint value ) 514{ 515 emit_lea(p, dst, make_disp(src, value)); 516} 517 518static void emit_test( struct x86_program *p, 519 struct x86_reg dst, 520 struct x86_reg src ) 521{ 522 emit_1ub(p, 0x85); 523 emit_modrm( p, dst, src ); 524} 525 526 527 528 529/** 530 * Perform a reduced swizzle: 531 */ 532static void emit_pshufd( struct x86_program *p, 533 struct x86_reg dest, 534 struct x86_reg arg0, 535 GLubyte x, 536 GLubyte y, 537 GLubyte z, 538 GLubyte w) 539{ 540 emit_3ub(p, 0x66, X86_TWOB, 0x70); 541 emit_modrm(p, dest, arg0); 542 emit_1ub(p, (x|(y<<2)|(z<<4)|w<<6)); 543} 544 545 546static void emit_pk4ub( struct x86_program *p, 547 struct x86_reg dest, 548 struct x86_reg arg0 ) 549{ 550 emit_cvtps2dq(p, dest, arg0); 551 emit_packssdw(p, dest, dest); 552 emit_packuswb(p, dest, dest); 553} 554 555static void emit_load4f_4( struct x86_program *p, 556 struct x86_reg dest, 557 struct x86_reg arg0 ) 558{ 559 emit_movups(p, dest, arg0); 560} 561 562static void emit_load4f_3( struct x86_program *p, 563 struct x86_reg dest, 564 struct x86_reg arg0 ) 565{ 566 /* Have to jump through some hoops: 567 * 568 * 0 0 0 1 -- skip if reg[3] preserved over loop iterations 569 * c 0 0 1 570 * 0 0 c 1 571 * a b c 1 572 */ 573 emit_movups(p, dest, get_identity(p)); 574 emit_movss(p, dest, make_disp(arg0, 8)); 575 emit_pshufd(p, dest, dest, Y,Z,X,W ); 576 emit_movlps(p, dest, arg0); 577} 578 579static void emit_load4f_2( struct x86_program *p, 580 struct x86_reg dest, 581 struct x86_reg arg0 ) 582{ 583 /* Pull in 2 dwords, then copy the top 2 dwords with 0,1 from id. 584 */ 585 emit_movlps(p, dest, arg0); 586 emit_movhps(p, dest, get_identity(p)); 587} 588 589static void emit_load4f_1( struct x86_program *p, 590 struct x86_reg dest, 591 struct x86_reg arg0 ) 592{ 593 /* Initialized with [0,0,0,1] from id, then pull in the single low 594 * word. 595 */ 596 emit_movups(p, dest, get_identity(p)); 597 emit_movss(p, dest, arg0); 598} 599 600 601 602static void emit_load3f_3( struct x86_program *p, 603 struct x86_reg dest, 604 struct x86_reg arg0 ) 605{ 606 /* Over-reads by 1 dword - potential SEGV... Deal with in 607 * array_cache by treating size-3 arrays specially, copying to 608 * temporary storage if last element (how can you tell?) falls on a 609 * 4k boundary. 610 */ 611 if (p->inputs_safe) { 612 emit_movups(p, dest, arg0); 613 } 614 else { 615 /* c . . . 616 * c c c c 617 * a b c c 618 */ 619 emit_movss(p, dest, make_disp(arg0, 8)); 620 emit_pshufd(p, dest, dest, X,X,X,X); 621 emit_movlps(p, dest, arg0); 622 } 623} 624 625static void emit_load3f_2( struct x86_program *p, 626 struct x86_reg dest, 627 struct x86_reg arg0 ) 628{ 629 emit_load4f_2(p, dest, arg0); 630} 631 632static void emit_load3f_1( struct x86_program *p, 633 struct x86_reg dest, 634 struct x86_reg arg0 ) 635{ 636 emit_load4f_1(p, dest, arg0); 637} 638 639static void emit_load2f_2( struct x86_program *p, 640 struct x86_reg dest, 641 struct x86_reg arg0 ) 642{ 643 emit_movlps(p, dest, arg0); 644} 645 646static void emit_load2f_1( struct x86_program *p, 647 struct x86_reg dest, 648 struct x86_reg arg0 ) 649{ 650 emit_load4f_1(p, dest, arg0); 651} 652 653static void emit_load1f_1( struct x86_program *p, 654 struct x86_reg dest, 655 struct x86_reg arg0 ) 656{ 657 emit_movss(p, dest, arg0); 658} 659 660static void (*load[4][4])( struct x86_program *p, 661 struct x86_reg dest, 662 struct x86_reg arg0 ) = { 663 { emit_load1f_1, 664 emit_load1f_1, 665 emit_load1f_1, 666 emit_load1f_1 }, 667 668 { emit_load2f_1, 669 emit_load2f_2, 670 emit_load2f_2, 671 emit_load2f_2 }, 672 673 { emit_load3f_1, 674 emit_load3f_2, 675 emit_load3f_3, 676 emit_load3f_3 }, 677 678 { emit_load4f_1, 679 emit_load4f_2, 680 emit_load4f_3, 681 emit_load4f_4 } 682}; 683 684static void emit_load( struct x86_program *p, 685 struct x86_reg dest, 686 GLuint sz, 687 struct x86_reg src, 688 GLuint src_sz) 689{ 690 _mesa_printf("load %d/%d\n", sz, src_sz); 691 load[sz-1][src_sz-1](p, dest, src); 692} 693 694 695static void emit_store4f( struct x86_program *p, 696 struct x86_reg dest, 697 struct x86_reg arg0 ) 698{ 699 emit_movups(p, dest, arg0); 700} 701 702static void emit_store3f( struct x86_program *p, 703 struct x86_reg dest, 704 struct x86_reg arg0 ) 705{ 706 if (p->outputs_safe) { 707 /* Emit the extra dword anyway. This may hurt writecombining, 708 * may cause other problems. 709 */ 710 emit_movups(p, dest, arg0); 711 } 712 else { 713 /* Alternate strategy - emit two, shuffle, emit one. 714 */ 715 struct x86_reg tmp = get_sse_temp(p); 716 emit_movlps(p, dest, arg0); 717 718 emit_pshufd(p, tmp, arg0, Z, Z, Z, Z ); 719 emit_movss(p, make_disp(dest,8), tmp); 720 release_temp(p, tmp); 721 } 722} 723 724static void emit_store2f( struct x86_program *p, 725 struct x86_reg dest, 726 struct x86_reg arg0 ) 727{ 728 emit_movlps(p, dest, arg0); 729} 730 731static void emit_store1f( struct x86_program *p, 732 struct x86_reg dest, 733 struct x86_reg arg0 ) 734{ 735 emit_movss(p, dest, arg0); 736} 737 738 739static void (*store[4])( struct x86_program *p, 740 struct x86_reg dest, 741 struct x86_reg arg0 ) = 742{ 743 emit_store1f, 744 emit_store2f, 745 emit_store3f, 746 emit_store4f 747}; 748 749static void emit_store( struct x86_program *p, 750 struct x86_reg dest, 751 GLuint sz, 752 struct x86_reg temp ) 753 754{ 755 store[sz-1](p, dest, temp); 756} 757 758 759static GLint get_offset( const void *a, const void *b ) 760{ 761 return (const char *)b - (const char *)a; 762} 763 764 765 766/* Lots of hardcoding 767 * 768 * EAX -- pointer to current output vertex 769 * ECX -- pointer to current attribute 770 * 771 */ 772static GLboolean build_vertex_emit( struct x86_program *p ) 773{ 774 GLcontext *ctx = p->ctx; 775 TNLcontext *tnl = TNL_CONTEXT(ctx); 776 struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx); 777 struct tnl_clipspace_attr *a = vtx->attr; 778 GLuint j; 779 780 struct x86_reg vertexEAX = make_reg(file_REG32, reg_AX); 781 struct x86_reg srcEDI = make_reg(file_REG32, reg_CX); 782 struct x86_reg countEBP = make_reg(file_REG32, reg_BP); 783 struct x86_reg vtxESI = make_reg(file_REG32, reg_SI); 784 struct x86_reg tmp = make_reg(file_XMM, 0); 785 struct x86_reg vp0 = make_reg(file_XMM, 1); 786 struct x86_reg vp1 = make_reg(file_XMM, 2); 787 struct x86_reg chan0 = make_reg(file_XMM, 3); 788 GLubyte *fixup, *label; 789 790 p->csr = p->store; 791 792 /* Push a few regs? 793 */ 794 emit_push(p, srcEDI); 795 emit_push(p, countEBP); 796 emit_push(p, vtxESI); 797 798 799 /* Get vertex count, compare to zero 800 */ 801 emit_xor(p, srcEDI, srcEDI); 802 emit_mov(p, countEBP, make_fn_arg(p, 2)); 803 emit_cmp(p, countEBP, srcEDI); 804 fixup = emit_jcc_forward(p, cc_E); 805 806 807 /* Initialize destination register. 808 */ 809 emit_mov(p, vertexEAX, make_fn_arg(p, 3)); 810 811 /* Dereference ctx to get tnl, then vtx: 812 */ 813 emit_mov(p, vtxESI, make_fn_arg(p, 1)); 814 emit_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context))); 815 vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace)); 816 817 818 /* Possibly load vp0, vp1 for viewport calcs: 819 */ 820 if (vtx->need_viewport) { 821 emit_movups(p, vp0, make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0]))); 822 emit_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0]))); 823 } 824 825 /* always load, needed or not: 826 */ 827 emit_movups(p, chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0]))); 828 emit_movups(p, p->identity, make_disp(vtxESI, get_offset(vtx, &vtx->identity[0]))); 829 830 /* Note address for loop jump */ 831 label = get_label(p); 832 833 /* Emit code for each of the attributes. Currently routes 834 * everything through SSE registers, even when it might be more 835 * efficient to stick with regular old x86. No optimization or 836 * other tricks - enough new ground to cover here just getting 837 * things working. 838 */ 839 for (j = 0; j < vtx->attr_count; j++) { 840 struct x86_reg dest = make_disp(vertexEAX, vtx->attr[j].vertoffset); 841 struct x86_reg ptr_to_src = make_disp(vtxESI, get_offset(vtx, &vtx->attr[j].inputptr)); 842 843 /* Load current a[j].inputptr 844 */ 845 emit_mov(p, srcEDI, ptr_to_src); 846 847 /* Now, load an XMM reg from src, perhaps transform, then save. 848 * Could be shortcircuited in specific cases: 849 */ 850 switch (a[j].format) { 851 case EMIT_1F: 852 emit_load(p, tmp, 1, deref(srcEDI), vtx->attr[j].inputsize); 853 emit_store(p, dest, 1, tmp); 854 break; 855 case EMIT_2F: 856 emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize); 857 emit_store(p, dest, 2, tmp); 858 break; 859 case EMIT_3F: 860 /* Potentially the worst case - hardcode 2+1 copying: 861 */ 862 emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize); 863 emit_store(p, dest, 3, tmp); 864 break; 865 case EMIT_4F: 866 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 867 emit_store(p, dest, 4, tmp); 868 break; 869 case EMIT_2F_VIEWPORT: 870 emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize); 871 emit_mulps(p, tmp, vp0); 872 emit_addps(p, tmp, vp1); 873 emit_store(p, dest, 2, tmp); 874 break; 875 case EMIT_3F_VIEWPORT: 876 emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize); 877 emit_mulps(p, tmp, vp0); 878 emit_addps(p, tmp, vp1); 879 emit_store(p, dest, 3, tmp); 880 break; 881 case EMIT_4F_VIEWPORT: 882 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 883 emit_mulps(p, tmp, vp0); 884 emit_addps(p, tmp, vp1); 885 emit_store(p, dest, 4, tmp); 886 break; 887 case EMIT_3F_XYW: 888 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 889 emit_pshufd(p, tmp, tmp, X, Y, W, Z); 890 emit_store(p, dest, 3, tmp); 891 break; 892 893 /* Try and bond 3ub + 1ub pairs into a single 4ub operation? 894 */ 895 case EMIT_1UB_1F: 896 case EMIT_3UB_3F_RGB: 897 case EMIT_3UB_3F_BGR: 898 _mesa_printf("non-implemneted format %d\n", a[j].format); 899 return GL_FALSE; /* add this later */ 900 901 case EMIT_4UB_4F_RGBA: 902 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 903 emit_mulps(p, tmp, chan0); 904 emit_pk4ub(p, tmp, tmp); 905 emit_store(p, dest, 1, tmp); 906 break; 907 case EMIT_4UB_4F_BGRA: 908 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 909 emit_pshufd(p, tmp, tmp, Z, Y, X, W); 910 emit_mulps(p, tmp, chan0); 911 emit_pk4ub(p, tmp, tmp); 912 emit_store(p, dest, 1, tmp); 913 break; 914 case EMIT_4UB_4F_ARGB: 915 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 916 emit_pshufd(p, tmp, tmp, W, X, Y, Z); 917 emit_mulps(p, tmp, chan0); 918 emit_pk4ub(p, tmp, tmp); 919 emit_store(p, dest, 1, tmp); 920 break; 921 case EMIT_4UB_4F_ABGR: 922 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 923 emit_pshufd(p, tmp, tmp, W, Z, Y, X); 924 emit_mulps(p, tmp, chan0); 925 emit_pk4ub(p, tmp, tmp); 926 emit_store(p, dest, 1, tmp); 927 break; 928 case EMIT_4CHAN_4F_RGBA: 929 switch (CHAN_TYPE) { 930 case GL_UNSIGNED_BYTE: 931 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 932 emit_mulps(p, tmp, chan0); 933 emit_pk4ub(p, tmp, tmp); 934 emit_store(p, dest, 1, tmp); 935 break; 936 case GL_FLOAT: 937 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); 938 emit_store(p, dest, 4, tmp); 939 break; 940 case GL_UNSIGNED_SHORT: 941 default: 942 _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE)); 943 return GL_FALSE; 944 } 945 break; 946 default: 947 _mesa_printf("unknown a[%d].format %d\n", j, a[j].format); 948 return GL_FALSE; /* catch any new opcodes */ 949 } 950 951 /* add a[j].inputstride (hardcoded value - could just as easily 952 * pull the stride value from memory each time). 953 */ 954 emit_add_imm(p, srcEDI, srcEDI, a[j].inputstride); 955 956 /* save new value of a[j].inputptr 957 */ 958 emit_mov(p, ptr_to_src, srcEDI); 959 960 } 961 962 /* Next vertex: 963 */ 964 emit_add_imm(p, vertexEAX, vertexEAX, vtx->vertex_size); 965 966 /* decr count, loop if not zero 967 */ 968 emit_dec(p, countEBP); 969 emit_test(p, countEBP, countEBP); 970 emit_jcc(p, cc_NZ, label); 971 972 /* Land forward jump here: 973 */ 974 do_fixup(p, fixup); 975 976 /* Pop regs and return 977 */ 978 emit_pop(p, get_base_reg(vtxESI)); 979 emit_pop(p, countEBP); 980 emit_pop(p, srcEDI); 981 emit_ret(p); 982 983 vtx->emit = (tnl_emit_func)p->store; 984 return GL_TRUE; 985} 986 987void _tnl_generate_sse_emit( GLcontext *ctx ) 988{ 989 struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx); 990 struct x86_program p; 991 992 memset(&p, 0, sizeof(p)); 993 p.ctx = ctx; 994 p.store = MALLOC(1024); 995 996 p.inputs_safe = 1; /* for now */ 997 p.outputs_safe = 1; /* for now */ 998 p.identity = make_reg(file_XMM, 6); 999 1000 if (build_vertex_emit(&p)) { 1001 _tnl_register_fastpath( vtx, GL_TRUE ); 1002 if (DISASSEM) 1003 _mesa_printf("disassemble 0x%x 0x%x\n", p.store, p.csr); 1004 } 1005 else { 1006 FREE(p.store); 1007 } 1008 1009 (void)emit_movd; 1010 (void)emit_inc; 1011 (void)emit_xor; 1012} 1013