nv50_program.c revision ec5c23551cdb4c369d8f8f392208f4d4bf29911b
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 64 35#define NV50_SU_MAX_ADDR 7 36//#define NV50_PROGRAM_DUMP 37 38/* ARL - gallium craps itself on progs/vp/arl.txt 39 * 40 * MSB - Like MAD, but MUL+SUB 41 * - Fuck it off, introduce a way to negate args for ops that 42 * support it. 43 * 44 * Look into inlining IMMD for ops other than MOV (make it general?) 45 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 46 * but can emit to P_TEMP first - then MOV later. 
NVIDIA does this 47 * 48 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 49 * case, if the emit_src() causes the inst to suddenly become long. 50 * 51 * Verify half-insns work where expected - and force disable them where they 52 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 53 * 54 * FUCK! watch dst==src vectors, can overwrite components that are needed. 55 * ie. SUB R0, R0.yzxw, R0 56 * 57 * Things to check with renouveau: 58 * FP attr/result assignment - how? 59 * attrib 60 * - 0x16bc maps vp output onto fp hpos 61 * - 0x16c0 maps vp output onto fp col0 62 * result 63 * - colr always 0-3 64 * - depr always 4 65 * 0x16bc->0x16e8 --> some binding between vp/fp regs 66 * 0x16b8 --> VP output count 67 * 68 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 69 * "MOV rcol.x, fcol.y" = 0x00000004 70 * 0x19a8 --> as above but 0x00000100 and 0x00000000 71 * - 0x00100000 used when KIL used 72 * 0x196c --> as above but 0x00000011 and 0x00000000 73 * 74 * 0x1988 --> 0xXXNNNNNN 75 * - XX == FP high something 76 */ 77struct nv50_reg { 78 enum { 79 P_TEMP, 80 P_ATTR, 81 P_RESULT, 82 P_CONST, 83 P_IMMD, 84 P_ADDR 85 } type; 86 int index; 87 88 int hw; 89 int neg; 90 91 int rhw; /* result hw for FP outputs, or interpolant index */ 92 int acc; /* instruction where this reg is last read (first insn == 1) */ 93}; 94 95/* arbitrary limits */ 96#define MAX_IF_DEPTH 4 97#define MAX_LOOP_DEPTH 4 98 99struct nv50_pc { 100 struct nv50_program *p; 101 102 /* hw resources */ 103 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 104 struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; 105 106 /* tgsi resources */ 107 struct nv50_reg *temp; 108 int temp_nr; 109 struct nv50_reg *attr; 110 int attr_nr; 111 struct nv50_reg *result; 112 int result_nr; 113 struct nv50_reg *param; 114 int param_nr; 115 struct nv50_reg *immd; 116 float *immd_buf; 117 int immd_nr; 118 struct nv50_reg **addr; 119 int addr_nr; 120 121 struct nv50_reg *temp_temp[16]; 122 
unsigned temp_temp_nr; 123 124 /* broadcast and destination replacement regs */ 125 struct nv50_reg *r_brdc; 126 struct nv50_reg *r_dst[4]; 127 128 unsigned interp_mode[32]; 129 /* perspective interpolation registers */ 130 struct nv50_reg *iv_p; 131 struct nv50_reg *iv_c; 132 133 struct nv50_program_exec *if_cond; 134 struct nv50_program_exec *if_insn[MAX_IF_DEPTH]; 135 struct nv50_program_exec *br_join[MAX_IF_DEPTH]; 136 struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */ 137 int if_lvl, loop_lvl; 138 unsigned loop_pos[MAX_LOOP_DEPTH]; 139 140 /* current instruction and total number of insns */ 141 unsigned insn_cur; 142 unsigned insn_nr; 143 144 boolean allow32; 145}; 146 147static INLINE void 148ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) 149{ 150 reg->type = type; 151 reg->index = index; 152 reg->hw = hw; 153 reg->neg = 0; 154 reg->rhw = -1; 155 reg->acc = 0; 156} 157 158static INLINE unsigned 159popcnt4(uint32_t val) 160{ 161 static const unsigned cnt[16] 162 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 163 return cnt[val & 0xf]; 164} 165 166static void 167terminate_mbb(struct nv50_pc *pc) 168{ 169 int i; 170 171 /* remove records of temporary address register values */ 172 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 173 if (pc->r_addr[i].index < 0) 174 pc->r_addr[i].rhw = -1; 175} 176 177static void 178alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 179{ 180 int i = 0; 181 182 if (reg->type == P_RESULT) { 183 if (pc->p->cfg.high_result < (reg->hw + 1)) 184 pc->p->cfg.high_result = reg->hw + 1; 185 } 186 187 if (reg->type != P_TEMP) 188 return; 189 190 if (reg->hw >= 0) { 191 /*XXX: do this here too to catch FP temp-as-attr usage.. 
192 * not clean, but works */ 193 if (pc->p->cfg.high_temp < (reg->hw + 1)) 194 pc->p->cfg.high_temp = reg->hw + 1; 195 return; 196 } 197 198 if (reg->rhw != -1) { 199 /* try to allocate temporary with index rhw first */ 200 if (!(pc->r_temp[reg->rhw])) { 201 pc->r_temp[reg->rhw] = reg; 202 reg->hw = reg->rhw; 203 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 204 pc->p->cfg.high_temp = reg->rhw + 1; 205 return; 206 } 207 /* make sure we don't get things like $r0 needs to go 208 * in $r1 and $r1 in $r0 209 */ 210 i = pc->result_nr * 4; 211 } 212 213 for (; i < NV50_SU_MAX_TEMP; i++) { 214 if (!(pc->r_temp[i])) { 215 pc->r_temp[i] = reg; 216 reg->hw = i; 217 if (pc->p->cfg.high_temp < (i + 1)) 218 pc->p->cfg.high_temp = i + 1; 219 return; 220 } 221 } 222 223 assert(0); 224} 225 226/* XXX: For shaders that aren't executed linearly (e.g. shaders that 227 * contain loops), we need to assign all hw regs to TGSI TEMPs early, 228 * lest we risk temp_temps overwriting regs alloc'd "later". 229 */ 230static struct nv50_reg * 231alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 232{ 233 struct nv50_reg *r; 234 int i; 235 236 if (dst && dst->type == P_TEMP && dst->hw == -1) 237 return dst; 238 239 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 240 if (!pc->r_temp[i]) { 241 r = MALLOC_STRUCT(nv50_reg); 242 ctor_reg(r, P_TEMP, -1, i); 243 pc->r_temp[i] = r; 244 return r; 245 } 246 } 247 248 assert(0); 249 return NULL; 250} 251 252/* Assign the hw of the discarded temporary register src 253 * to the tgsi register dst and free src. 
254 */ 255static void 256assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 257{ 258 assert(src->index == -1 && src->hw != -1); 259 260 if (dst->hw != -1) 261 pc->r_temp[dst->hw] = NULL; 262 pc->r_temp[src->hw] = dst; 263 dst->hw = src->hw; 264 265 FREE(src); 266} 267 268/* release the hardware resource held by r */ 269static void 270release_hw(struct nv50_pc *pc, struct nv50_reg *r) 271{ 272 assert(r->type == P_TEMP); 273 if (r->hw == -1) 274 return; 275 276 assert(pc->r_temp[r->hw] == r); 277 pc->r_temp[r->hw] = NULL; 278 279 r->acc = 0; 280 if (r->index == -1) 281 FREE(r); 282} 283 284static void 285free_temp(struct nv50_pc *pc, struct nv50_reg *r) 286{ 287 if (r->index == -1) { 288 unsigned hw = r->hw; 289 290 FREE(pc->r_temp[hw]); 291 pc->r_temp[hw] = NULL; 292 } 293} 294 295static int 296alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 297{ 298 int i; 299 300 if ((idx + 4) >= NV50_SU_MAX_TEMP) 301 return 1; 302 303 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 304 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 305 return alloc_temp4(pc, dst, idx + 4); 306 307 for (i = 0; i < 4; i++) { 308 dst[i] = MALLOC_STRUCT(nv50_reg); 309 ctor_reg(dst[i], P_TEMP, -1, idx + i); 310 pc->r_temp[idx + i] = dst[i]; 311 } 312 313 return 0; 314} 315 316static void 317free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 318{ 319 int i; 320 321 for (i = 0; i < 4; i++) 322 free_temp(pc, reg[i]); 323} 324 325static struct nv50_reg * 326temp_temp(struct nv50_pc *pc) 327{ 328 if (pc->temp_temp_nr >= 16) 329 assert(0); 330 331 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 332 return pc->temp_temp[pc->temp_temp_nr++]; 333} 334 335static void 336kill_temp_temp(struct nv50_pc *pc) 337{ 338 int i; 339 340 for (i = 0; i < pc->temp_temp_nr; i++) 341 free_temp(pc, pc->temp_temp[i]); 342 pc->temp_temp_nr = 0; 343} 344 345static int 346ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) 347{ 348 pc->immd_buf = 
REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)), 349 (pc->immd_nr + 1) * 4 * sizeof(float)); 350 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 351 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 352 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 353 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 354 355 return pc->immd_nr++; 356} 357 358static struct nv50_reg * 359alloc_immd(struct nv50_pc *pc, float f) 360{ 361 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 362 unsigned hw; 363 364 for (hw = 0; hw < pc->immd_nr * 4; hw++) 365 if (pc->immd_buf[hw] == f) 366 break; 367 368 if (hw == pc->immd_nr * 4) 369 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; 370 371 ctor_reg(r, P_IMMD, -1, hw); 372 return r; 373} 374 375static struct nv50_program_exec * 376exec(struct nv50_pc *pc) 377{ 378 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 379 380 e->param.index = -1; 381 return e; 382} 383 384static void 385emit(struct nv50_pc *pc, struct nv50_program_exec *e) 386{ 387 struct nv50_program *p = pc->p; 388 389 if (p->exec_tail) 390 p->exec_tail->next = e; 391 if (!p->exec_head) 392 p->exec_head = e; 393 p->exec_tail = e; 394 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 395} 396 397static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 398 399static boolean 400is_long(struct nv50_program_exec *e) 401{ 402 if (e->inst[0] & 1) 403 return TRUE; 404 return FALSE; 405} 406 407static boolean 408is_immd(struct nv50_program_exec *e) 409{ 410 if (is_long(e) && (e->inst[1] & 3) == 3) 411 return TRUE; 412 return FALSE; 413} 414 415static INLINE void 416set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 417 struct nv50_program_exec *e) 418{ 419 set_long(pc, e); 420 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 421 e->inst[1] |= (pred << 7) | (idx << 12); 422} 423 424static INLINE void 425set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 426 struct nv50_program_exec *e) 427{ 428 set_long(pc, e); 429 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 430 e->inst[1] |= (idx << 4) | (on << 6); 431} 432 433static INLINE void 434set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 435{ 436 if (is_long(e)) 437 return; 438 439 e->inst[0] |= 1; 440 set_pred(pc, 0xf, 0, e); 441 set_pred_wr(pc, 0, 0, e); 442} 443 444static INLINE void 445set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 446{ 447 if (dst->type == P_RESULT) { 448 set_long(pc, e); 449 e->inst[1] |= 0x00000008; 450 } 451 452 alloc_reg(pc, dst); 453 e->inst[0] |= (dst->hw << 2); 454} 455 456static INLINE void 457set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 458{ 459 float f = pc->immd_buf[imm->hw]; 460 unsigned val = fui(imm->neg ? -f : f); 461 462 set_long(pc, e); 463 /*XXX: can't be predicated - bits overlap.. catch cases where both 464 * are required and avoid them. 
*/ 465 set_pred(pc, 0, 0, e); 466 set_pred_wr(pc, 0, 0, e); 467 468 e->inst[1] |= 0x00000002 | 0x00000001; 469 e->inst[0] |= (val & 0x3f) << 16; 470 e->inst[1] |= (val >> 6) << 2; 471} 472 473static void 474emit_set_addr(struct nv50_pc *pc, struct nv50_reg *dst, unsigned val) 475{ 476 struct nv50_program_exec *e = exec(pc); 477 478 assert(val <= 0xffff); 479 e->inst[0] = 0xd0000000 | ((val & 0xffff) << 9); 480 e->inst[1] = 0x20000000; 481 e->inst[0] |= dst->hw << 2; 482 set_long(pc, e); 483 484 emit(pc, e); 485} 486 487static struct nv50_reg * 488alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref) 489{ 490 int i; 491 struct nv50_reg *a = NULL; 492 493 if (!ref) { 494 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { 495 if (pc->r_addr[i].index >= 0) 496 continue; 497 if (pc->r_addr[i].rhw >= 0 && 498 pc->r_addr[i].acc == pc->insn_cur) 499 continue; 500 501 pc->r_addr[i].rhw = -1; 502 pc->r_addr[i].index = i; 503 return &pc->r_addr[i]; 504 } 505 assert(0); 506 return NULL; 507 } 508 509 for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) { 510 if (pc->r_addr[i].index >= 0) /* occupied for TGSI */ 511 continue; 512 if (pc->r_addr[i].rhw < 0) { /* unused */ 513 a = &pc->r_addr[i]; 514 continue; 515 } 516 if (!a && pc->r_addr[i].acc != pc->insn_cur) 517 a = &pc->r_addr[i]; 518 519 if (ref->hw - pc->r_addr[i].rhw < 128) { 520 /* alloc'd & suitable */ 521 pc->r_addr[i].acc = pc->insn_cur; 522 return &pc->r_addr[i]; 523 } 524 } 525 assert(a); 526 emit_set_addr(pc, a, ref->hw * 4); 527 528 a->rhw = ref->hw % 128; 529 a->acc = pc->insn_cur; 530 return a; 531} 532 533#define INTERP_LINEAR 0 534#define INTERP_FLAT 1 535#define INTERP_PERSPECTIVE 2 536#define INTERP_CENTROID 4 537 538/* interpolant index has been stored in dst->rhw */ 539static void 540emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 541 unsigned mode) 542{ 543 assert(dst->rhw != -1); 544 struct nv50_program_exec *e = exec(pc); 545 546 e->inst[0] |= 0x80000000; 547 set_dst(pc, dst, e); 548 e->inst[0] 
|= (dst->rhw << 16); 549 550 if (mode & INTERP_FLAT) { 551 e->inst[0] |= (1 << 8); 552 } else { 553 if (mode & INTERP_PERSPECTIVE) { 554 e->inst[0] |= (1 << 25); 555 alloc_reg(pc, iv); 556 e->inst[0] |= (iv->hw << 9); 557 } 558 559 if (mode & INTERP_CENTROID) 560 e->inst[0] |= (1 << 24); 561 } 562 563 emit(pc, e); 564} 565 566static INLINE void 567set_addr(struct nv50_program_exec *e, struct nv50_reg *a) 568{ 569 assert(!(e->inst[0] & 0x0c000000)); 570 assert(!(e->inst[1] & 0x00000004)); 571 572 e->inst[0] |= (a->hw & 3) << 26; 573 e->inst[1] |= (a->hw >> 2) << 2; 574} 575 576static void 577set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 578 struct nv50_program_exec *e) 579{ 580 set_long(pc, e); 581 582 e->param.index = src->hw; 583 e->param.shift = s; 584 e->param.mask = m << (s % 32); 585 586 if (src->hw > 127) 587 set_addr(e, alloc_addr(pc, src)); 588 else 589 if (src->acc < 0) { 590 assert(src->type == P_CONST); 591 set_addr(e, pc->addr[src->index]); 592 } 593 594 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 595} 596 597static void 598emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 599{ 600 struct nv50_program_exec *e = exec(pc); 601 602 e->inst[0] = 0x10000000; 603 if (!pc->allow32) 604 set_long(pc, e); 605 606 set_dst(pc, dst, e); 607 608 if (!is_long(e) && src->type == P_IMMD) { 609 set_immd(pc, src, e); 610 /*XXX: 32-bit, but steals part of "half" reg space - need to 611 * catch and handle this case if/when we do half-regs 612 */ 613 } else 614 if (src->type == P_IMMD || src->type == P_CONST) { 615 set_long(pc, e); 616 set_data(pc, src, 0x7f, 9, e); 617 e->inst[1] |= 0x20000000; /* src0 const? 
*/ 618 } else { 619 if (src->type == P_ATTR) { 620 set_long(pc, e); 621 e->inst[1] |= 0x00200000; 622 } 623 624 alloc_reg(pc, src); 625 e->inst[0] |= (src->hw << 9); 626 } 627 628 if (is_long(e) && !is_immd(e)) { 629 e->inst[1] |= 0x04000000; /* 32-bit */ 630 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */ 631 if (!(e->inst[1] & 0x20000000)) 632 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */ 633 } else 634 e->inst[0] |= 0x00008000; 635 636 emit(pc, e); 637} 638 639static INLINE void 640emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 641{ 642 struct nv50_reg *imm = alloc_immd(pc, f); 643 emit_mov(pc, dst, imm); 644 FREE(imm); 645} 646 647static boolean 648check_swap_src_0_1(struct nv50_pc *pc, 649 struct nv50_reg **s0, struct nv50_reg **s1) 650{ 651 struct nv50_reg *src0 = *s0, *src1 = *s1; 652 653 if (src0->type == P_CONST) { 654 if (src1->type != P_CONST) { 655 *s0 = src1; 656 *s1 = src0; 657 return TRUE; 658 } 659 } else 660 if (src1->type == P_ATTR) { 661 if (src0->type != P_ATTR) { 662 *s0 = src1; 663 *s1 = src0; 664 return TRUE; 665 } 666 } 667 668 return FALSE; 669} 670 671static void 672set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, 673 struct nv50_program_exec *e) 674{ 675 struct nv50_reg *temp; 676 677 if (src->type != P_TEMP) { 678 temp = temp_temp(pc); 679 emit_mov(pc, temp, src); 680 src = temp; 681 } 682 683 alloc_reg(pc, src); 684 e->inst[0] |= (src->hw << 9); 685} 686 687static void 688set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 689{ 690 if (src->type == P_ATTR) { 691 set_long(pc, e); 692 e->inst[1] |= 0x00200000; 693 } else 694 if (src->type == P_CONST || src->type == P_IMMD) { 695 struct nv50_reg *temp = temp_temp(pc); 696 697 emit_mov(pc, temp, src); 698 src = temp; 699 } 700 701 alloc_reg(pc, src); 702 e->inst[0] |= (src->hw << 9); 703} 704 705static void 706set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 707{ 708 if (src->type == P_ATTR) { 
709 struct nv50_reg *temp = temp_temp(pc); 710 711 emit_mov(pc, temp, src); 712 src = temp; 713 } else 714 if (src->type == P_CONST || src->type == P_IMMD) { 715 assert(!(e->inst[0] & 0x00800000)); 716 if (e->inst[0] & 0x01000000) { 717 struct nv50_reg *temp = temp_temp(pc); 718 719 emit_mov(pc, temp, src); 720 src = temp; 721 } else { 722 set_data(pc, src, 0x7f, 16, e); 723 e->inst[0] |= 0x00800000; 724 } 725 } 726 727 alloc_reg(pc, src); 728 e->inst[0] |= ((src->hw & 127) << 16); 729} 730 731static void 732set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 733{ 734 set_long(pc, e); 735 736 if (src->type == P_ATTR) { 737 struct nv50_reg *temp = temp_temp(pc); 738 739 emit_mov(pc, temp, src); 740 src = temp; 741 } else 742 if (src->type == P_CONST || src->type == P_IMMD) { 743 assert(!(e->inst[0] & 0x01000000)); 744 if (e->inst[0] & 0x00800000) { 745 struct nv50_reg *temp = temp_temp(pc); 746 747 emit_mov(pc, temp, src); 748 src = temp; 749 } else { 750 set_data(pc, src, 0x7f, 32+14, e); 751 e->inst[0] |= 0x01000000; 752 } 753 } 754 755 alloc_reg(pc, src); 756 e->inst[1] |= ((src->hw & 127) << 14); 757} 758 759static void 760emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 761 struct nv50_reg *src1) 762{ 763 struct nv50_program_exec *e = exec(pc); 764 765 e->inst[0] |= 0xc0000000; 766 767 if (!pc->allow32) 768 set_long(pc, e); 769 770 check_swap_src_0_1(pc, &src0, &src1); 771 set_dst(pc, dst, e); 772 set_src_0(pc, src0, e); 773 if (src1->type == P_IMMD && !is_long(e)) { 774 if (src0->neg) 775 e->inst[0] |= 0x00008000; 776 set_immd(pc, src1, e); 777 } else { 778 set_src_1(pc, src1, e); 779 if (src0->neg ^ src1->neg) { 780 if (is_long(e)) 781 e->inst[1] |= 0x08000000; 782 else 783 e->inst[0] |= 0x00008000; 784 } 785 } 786 787 emit(pc, e); 788} 789 790static void 791emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 792 struct nv50_reg *src0, struct nv50_reg *src1) 793{ 794 struct nv50_program_exec *e = exec(pc); 
795 796 e->inst[0] |= 0xb0000000; 797 798 check_swap_src_0_1(pc, &src0, &src1); 799 800 if (!pc->allow32 || src0->neg || src1->neg) { 801 set_long(pc, e); 802 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27); 803 } 804 805 set_dst(pc, dst, e); 806 set_src_0(pc, src0, e); 807 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 808 set_src_2(pc, src1, e); 809 else 810 if (src1->type == P_IMMD) 811 set_immd(pc, src1, e); 812 else 813 set_src_1(pc, src1, e); 814 815 emit(pc, e); 816} 817 818static void 819emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 820 uint8_t s) 821{ 822 struct nv50_program_exec *e = exec(pc); 823 824 set_long(pc, e); 825 e->inst[1] |= 0xc0000000; 826 827 e->inst[0] |= dst->hw << 2; 828 e->inst[0] |= s << 16; /* shift left */ 829 set_src_0_restricted(pc, src, e); 830 831 emit(pc, e); 832} 833 834static void 835emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 836 struct nv50_reg *src0, struct nv50_reg *src1) 837{ 838 struct nv50_program_exec *e = exec(pc); 839 840 set_long(pc, e); 841 e->inst[0] |= 0xb0000000; 842 e->inst[1] |= (sub << 29); 843 844 check_swap_src_0_1(pc, &src0, &src1); 845 set_dst(pc, dst, e); 846 set_src_0(pc, src0, e); 847 set_src_1(pc, src1, e); 848 849 emit(pc, e); 850} 851 852static INLINE void 853emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 854 struct nv50_reg *src1) 855{ 856 src1->neg ^= 1; 857 emit_add(pc, dst, src0, src1); 858 src1->neg ^= 1; 859} 860 861static void 862emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 863 struct nv50_reg *src1, struct nv50_reg *src2) 864{ 865 struct nv50_program_exec *e = exec(pc); 866 867 e->inst[0] |= 0xe0000000; 868 869 check_swap_src_0_1(pc, &src0, &src1); 870 set_dst(pc, dst, e); 871 set_src_0(pc, src0, e); 872 set_src_1(pc, src1, e); 873 set_src_2(pc, src2, e); 874 875 if (src0->neg ^ src1->neg) 876 e->inst[1] |= 0x04000000; 877 if (src2->neg) 878 e->inst[1] |= 0x08000000; 
879 880 emit(pc, e); 881} 882 883static INLINE void 884emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 885 struct nv50_reg *src1, struct nv50_reg *src2) 886{ 887 src2->neg ^= 1; 888 emit_mad(pc, dst, src0, src1, src2); 889 src2->neg ^= 1; 890} 891 892static void 893emit_flop(struct nv50_pc *pc, unsigned sub, 894 struct nv50_reg *dst, struct nv50_reg *src) 895{ 896 struct nv50_program_exec *e = exec(pc); 897 898 e->inst[0] |= 0x90000000; 899 if (sub) { 900 set_long(pc, e); 901 e->inst[1] |= (sub << 29); 902 } 903 904 set_dst(pc, dst, e); 905 906 if (sub == 0 || sub == 2) 907 set_src_0_restricted(pc, src, e); 908 else 909 set_src_0(pc, src, e); 910 911 emit(pc, e); 912} 913 914static void 915emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 916{ 917 struct nv50_program_exec *e = exec(pc); 918 919 e->inst[0] |= 0xb0000000; 920 921 set_dst(pc, dst, e); 922 set_src_0(pc, src, e); 923 set_long(pc, e); 924 e->inst[1] |= (6 << 29) | 0x00004000; 925 926 emit(pc, e); 927} 928 929static void 930emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 931{ 932 struct nv50_program_exec *e = exec(pc); 933 934 e->inst[0] |= 0xb0000000; 935 936 set_dst(pc, dst, e); 937 set_src_0(pc, src, e); 938 set_long(pc, e); 939 e->inst[1] |= (6 << 29); 940 941 emit(pc, e); 942} 943 944#define CVTOP_RN 0x01 945#define CVTOP_FLOOR 0x03 946#define CVTOP_CEIL 0x05 947#define CVTOP_TRUNC 0x07 948#define CVTOP_SAT 0x08 949#define CVTOP_ABS 0x10 950 951/* 0x04 == 32 bit dst */ 952/* 0x40 == dst is float */ 953/* 0x80 == src is float */ 954#define CVT_F32_F32 0xc4 955#define CVT_F32_S32 0x44 956#define CVT_F32_U32 0x64 957#define CVT_S32_F32 0x8c 958#define CVT_S32_S32 0x0c 959#define CVT_NEG 0x20 960#define CVT_RI 0x08 961 962static void 963emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 964 int wp, unsigned cvn, unsigned fmt) 965{ 966 struct nv50_program_exec *e; 967 968 e = exec(pc); 969 set_long(pc, 
e); 970 971 e->inst[0] |= 0xa0000000; 972 e->inst[1] |= 0x00004000; /* 32 bit src */ 973 e->inst[1] |= (cvn << 16); 974 e->inst[1] |= (fmt << 24); 975 set_src_0(pc, src, e); 976 977 if (wp >= 0) 978 set_pred_wr(pc, 1, wp, e); 979 980 if (dst) 981 set_dst(pc, dst, e); 982 else { 983 e->inst[0] |= 0x000001fc; 984 e->inst[1] |= 0x00000008; 985 } 986 987 emit(pc, e); 988} 989 990/* nv50 Condition codes: 991 * 0x1 = LT 992 * 0x2 = EQ 993 * 0x3 = LE 994 * 0x4 = GT 995 * 0x5 = NE 996 * 0x6 = GE 997 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge) 998 * 0x8 = unordered bit (allows NaN) 999 */ 1000static void 1001emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, 1002 struct nv50_reg *src0, struct nv50_reg *src1) 1003{ 1004 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 1005 1006 struct nv50_program_exec *e = exec(pc); 1007 struct nv50_reg *rdst; 1008 1009 assert(ccode < 16); 1010 if (check_swap_src_0_1(pc, &src0, &src1)) 1011 ccode = cc_swapped[ccode & 7] | (ccode & 8); 1012 1013 rdst = dst; 1014 if (dst && dst->type != P_TEMP) 1015 dst = alloc_temp(pc, NULL); 1016 1017 /* set.u32 */ 1018 set_long(pc, e); 1019 e->inst[0] |= 0xb0000000; 1020 e->inst[1] |= 0x60000000 | (ccode << 14); 1021 1022 /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but 1023 * that doesn't seem to match what the hw actually does 1024 e->inst[1] |= 0x04000000; << breaks things, u32 by default ? 1025 */ 1026 1027 if (wp >= 0) 1028 set_pred_wr(pc, 1, wp, e); 1029 if (dst) 1030 set_dst(pc, dst, e); 1031 else { 1032 e->inst[0] |= 0x000001fc; 1033 e->inst[1] |= 0x00000008; 1034 } 1035 1036 set_src_0(pc, src0, e); 1037 set_src_1(pc, src1, e); 1038 1039 emit(pc, e); 1040 pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */ 1041 1042 /* cvt.f32.u32/s32 (?) 
if we didn't only write the predicate */ 1043 if (rdst) 1044 emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32); 1045 if (rdst && rdst != dst) 1046 free_temp(pc, dst); 1047} 1048 1049static INLINE unsigned 1050map_tgsi_setop_cc(unsigned op) 1051{ 1052 switch (op) { 1053 case TGSI_OPCODE_SLT: return 0x1; 1054 case TGSI_OPCODE_SGE: return 0x6; 1055 case TGSI_OPCODE_SEQ: return 0x2; 1056 case TGSI_OPCODE_SGT: return 0x4; 1057 case TGSI_OPCODE_SLE: return 0x3; 1058 case TGSI_OPCODE_SNE: return 0xd; 1059 default: 1060 assert(0); 1061 return 0; 1062 } 1063} 1064 1065static INLINE void 1066emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1067{ 1068 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI); 1069} 1070 1071static void 1072emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 1073 struct nv50_reg *v, struct nv50_reg *e) 1074{ 1075 struct nv50_reg *temp = alloc_temp(pc, NULL); 1076 1077 emit_flop(pc, 3, temp, v); 1078 emit_mul(pc, temp, temp, e); 1079 emit_preex2(pc, temp, temp); 1080 emit_flop(pc, 6, dst, temp); 1081 1082 free_temp(pc, temp); 1083} 1084 1085static INLINE void 1086emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1087{ 1088 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); 1089} 1090 1091static INLINE void 1092emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1093{ 1094 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); 1095} 1096 1097static void 1098emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1099 struct nv50_reg **src) 1100{ 1101 struct nv50_reg *one = alloc_immd(pc, 1.0); 1102 struct nv50_reg *zero = alloc_immd(pc, 0.0); 1103 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 1104 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 1105 struct nv50_reg *tmp[4]; 1106 boolean allow32 = pc->allow32; 1107 1108 pc->allow32 = FALSE; 1109 1110 if (mask & (3 << 1)) { 1111 tmp[0] = alloc_temp(pc, NULL); 1112 emit_minmax(pc, 4, tmp[0], 
src[0], zero); 1113 } 1114 1115 if (mask & (1 << 2)) { 1116 set_pred_wr(pc, 1, 0, pc->p->exec_tail); 1117 1118 tmp[1] = temp_temp(pc); 1119 emit_minmax(pc, 4, tmp[1], src[1], zero); 1120 1121 tmp[3] = temp_temp(pc); 1122 emit_minmax(pc, 4, tmp[3], src[3], neg128); 1123 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 1124 1125 emit_pow(pc, dst[2], tmp[1], tmp[3]); 1126 emit_mov(pc, dst[2], zero); 1127 set_pred(pc, 3, 0, pc->p->exec_tail); 1128 } 1129 1130 if (mask & (1 << 1)) 1131 assimilate_temp(pc, dst[1], tmp[0]); 1132 else 1133 if (mask & (1 << 2)) 1134 free_temp(pc, tmp[0]); 1135 1136 pc->allow32 = allow32; 1137 1138 /* do this last, in case src[i,j] == dst[0,3] */ 1139 if (mask & (1 << 0)) 1140 emit_mov(pc, dst[0], one); 1141 1142 if (mask & (1 << 3)) 1143 emit_mov(pc, dst[3], one); 1144 1145 FREE(pos128); 1146 FREE(neg128); 1147 FREE(zero); 1148 FREE(one); 1149} 1150 1151static INLINE void 1152emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1153{ 1154 emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG); 1155} 1156 1157static void 1158emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 1159{ 1160 struct nv50_program_exec *e; 1161 const int r_pred = 1; 1162 unsigned cvn = CVT_F32_F32; 1163 1164 if (src->neg) 1165 cvn |= CVT_NEG; 1166 /* write predicate reg */ 1167 emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn); 1168 1169 /* conditional discard */ 1170 e = exec(pc); 1171 e->inst[0] = 0x00000002; 1172 set_long(pc, e); 1173 set_pred(pc, 0x1 /* LT */, r_pred, e); 1174 emit(pc, e); 1175} 1176 1177static void 1178emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1179 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) 1180{ 1181 struct nv50_reg *temp, *t[4]; 1182 struct nv50_program_exec *e; 1183 1184 unsigned c, mode, dim; 1185 1186 switch (type) { 1187 case TGSI_TEXTURE_1D: 1188 dim = 1; 1189 break; 1190 case TGSI_TEXTURE_UNKNOWN: 1191 case TGSI_TEXTURE_2D: 1192 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */ 
1193 case TGSI_TEXTURE_RECT: 1194 dim = 2; 1195 break; 1196 case TGSI_TEXTURE_3D: 1197 case TGSI_TEXTURE_CUBE: 1198 case TGSI_TEXTURE_SHADOW2D: 1199 case TGSI_TEXTURE_SHADOWRECT: /* XXX */ 1200 dim = 3; 1201 break; 1202 default: 1203 assert(0); 1204 break; 1205 } 1206 1207 /* some cards need t[0]'s hw index to be a multiple of 4 */ 1208 alloc_temp4(pc, t, 0); 1209 1210 if (proj) { 1211 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1212 mode = pc->interp_mode[src[0]->index]; 1213 1214 t[3]->rhw = src[3]->rhw; 1215 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1216 emit_flop(pc, 0, t[3], t[3]); 1217 1218 for (c = 0; c < dim; c++) { 1219 t[c]->rhw = src[c]->rhw; 1220 emit_interp(pc, t[c], t[3], 1221 (mode | INTERP_PERSPECTIVE)); 1222 } 1223 } else { 1224 emit_flop(pc, 0, t[3], src[3]); 1225 for (c = 0; c < dim; c++) 1226 emit_mul(pc, t[c], src[c], t[3]); 1227 1228 /* XXX: for some reason the blob sometimes uses MAD: 1229 * emit_mad(pc, t[c], src[0][c], t[3], t[3]) 1230 * pc->p->exec_tail->inst[1] |= 0x080fc000; 1231 */ 1232 } 1233 } else { 1234 if (type == TGSI_TEXTURE_CUBE) { 1235 temp = temp_temp(pc); 1236 emit_minmax(pc, 4, temp, src[0], src[1]); 1237 emit_minmax(pc, 4, temp, temp, src[2]); 1238 emit_flop(pc, 0, temp, temp); 1239 for (c = 0; c < 3; c++) 1240 emit_mul(pc, t[c], src[c], temp); 1241 } else { 1242 for (c = 0; c < dim; c++) 1243 emit_mov(pc, t[c], src[c]); 1244 } 1245 } 1246 1247 e = exec(pc); 1248 set_long(pc, e); 1249 e->inst[0] |= 0xf0000000; 1250 e->inst[1] |= 0x00000004; 1251 set_dst(pc, t[0], e); 1252 e->inst[0] |= (unit << 9); 1253 1254 if (dim == 2) 1255 e->inst[0] |= 0x00400000; 1256 else 1257 if (dim == 3) 1258 e->inst[0] |= 0x00800000; 1259 1260 e->inst[0] |= (mask & 0x3) << 25; 1261 e->inst[1] |= (mask & 0xc) << 12; 1262 1263 emit(pc, e); 1264 1265#if 1 1266 c = 0; 1267 if (mask & 1) emit_mov(pc, dst[0], t[c++]); 1268 if (mask & 2) emit_mov(pc, dst[1], t[c++]); 1269 if (mask & 4) emit_mov(pc, dst[2], t[c++]); 1270 if (mask & 8) 
emit_mov(pc, dst[3], t[c]); 1271 1272 free_temp4(pc, t); 1273#else 1274 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1275 * the texture coordinates, not the fetched values: latency ? */ 1276 1277 for (c = 0; c < 4; c++) { 1278 if (mask & (1 << c)) 1279 assimilate_temp(pc, dst[c], t[c]); 1280 else 1281 free_temp(pc, t[c]); 1282 } 1283#endif 1284} 1285 1286static void 1287emit_branch(struct nv50_pc *pc, int pred, unsigned cc, 1288 struct nv50_program_exec **join) 1289{ 1290 struct nv50_program_exec *e = exec(pc); 1291 1292 if (join) { 1293 set_long(pc, e); 1294 e->inst[0] |= 0xa0000002; 1295 emit(pc, e); 1296 *join = e; 1297 e = exec(pc); 1298 } 1299 1300 set_long(pc, e); 1301 e->inst[0] |= 0x10000002; 1302 if (pred >= 0) 1303 set_pred(pc, cc, pred, e); 1304 emit(pc, e); 1305} 1306 1307static void 1308emit_nop(struct nv50_pc *pc) 1309{ 1310 struct nv50_program_exec *e = exec(pc); 1311 1312 e->inst[0] = 0xf0000000; 1313 set_long(pc, e); 1314 e->inst[1] = 0xe0000000; 1315 emit(pc, e); 1316} 1317 1318static void 1319emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1320{ 1321 struct nv50_program_exec *e = exec(pc); 1322 1323 assert(src->type == P_TEMP); 1324 1325 e->inst[0] = 0xc0140000; 1326 e->inst[1] = 0x89800000; 1327 set_long(pc, e); 1328 set_dst(pc, dst, e); 1329 set_src_0(pc, src, e); 1330 set_src_2(pc, src, e); 1331 1332 emit(pc, e); 1333} 1334 1335static void 1336emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1337{ 1338 struct nv50_program_exec *e = exec(pc); 1339 1340 assert(src->type == P_TEMP); 1341 1342 if (!src->neg) /* ! 
/* Turn an already encoded short instruction into its long form.
 *
 * Some fields sit at different bit positions in the two encodings, so for
 * each opcode class (top nibble of inst[0]) we compute
 *   m - mask of the inst[0] bits that are kept, and
 *   q - bits to OR into inst[1]
 * from the short encoding, then apply both after set_long().
 * exec_size is incremented by one for the converted instruction.
 *
 * e must not be long already; unknown opcode classes trip an assert.
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV: clear bit 15 of inst[0], set a fixed pattern in inst[1] */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits):
		 * inst[0] bits 24-25 -> inst[1] bits 16-17,
		 * inst[0] bit 8      -> inst[1] bit 18
		 */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP: nothing to relocate */
		break;
	case 0xB:
		/* ADD: inst[0] bits 16-22 -> inst[1] bits 14-20 */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL: inst[0] bit 15 -> inst[1] bit 27 */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst): inst[0] bits 2-8 are copied (not
		 * cleared, m stays ~0) to inst[1] bits 14-20
		 */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	/* q was derived from the short encoding above, before set_long()
	 * touches the instruction words
	 */
	set_long(pc, e);
	pc->p->exec_size++;

	e->inst[0] &= m;
	e->inst[1] |= q;
}
1429 */ 1430 for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) { 1431 if (s == i) 1432 continue; 1433 if ((insn->FullSrcRegisters[s].SrcRegister.Index == 1434 insn->FullSrcRegisters[i].SrcRegister.Index) && 1435 (insn->FullSrcRegisters[s].SrcRegister.File == 1436 insn->FullSrcRegisters[i].SrcRegister.File)) 1437 return FALSE; 1438 } 1439 1440 return TRUE; 1441} 1442 1443/* Return a read mask for source registers deduced from opcode & write mask. */ 1444static unsigned 1445nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1446{ 1447 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; 1448 1449 switch (insn->Instruction.Opcode) { 1450 case TGSI_OPCODE_COS: 1451 case TGSI_OPCODE_SIN: 1452 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1453 case TGSI_OPCODE_DP3: 1454 return 0x7; 1455 case TGSI_OPCODE_DP4: 1456 case TGSI_OPCODE_DPH: 1457 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1458 return 0xf; 1459 case TGSI_OPCODE_DST: 1460 return mask & (c ? 0xa : 0x6); 1461 case TGSI_OPCODE_EX2: 1462 case TGSI_OPCODE_LG2: 1463 case TGSI_OPCODE_POW: 1464 case TGSI_OPCODE_RCP: 1465 case TGSI_OPCODE_RSQ: 1466 case TGSI_OPCODE_SCS: 1467 return 0x1; 1468 case TGSI_OPCODE_LIT: 1469 return 0xb; 1470 case TGSI_OPCODE_TEX: 1471 case TGSI_OPCODE_TXP: 1472 { 1473 const struct tgsi_instruction_ext_texture *tex; 1474 1475 assert(insn->Instruction.Extended); 1476 tex = &insn->InstructionExtTexture; 1477 1478 mask = 0x7; 1479 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) 1480 mask |= 0x8; 1481 1482 switch (tex->Texture) { 1483 case TGSI_TEXTURE_1D: 1484 mask &= 0x9; 1485 break; 1486 case TGSI_TEXTURE_2D: 1487 mask &= 0xb; 1488 break; 1489 default: 1490 break; 1491 } 1492 } 1493 return mask; 1494 case TGSI_OPCODE_XPD: 1495 x = 0; 1496 if (mask & 1) x |= 0x6; 1497 if (mask & 2) x |= 0x5; 1498 if (mask & 4) x |= 0x3; 1499 return x; 1500 default: 1501 break; 1502 } 1503 1504 return mask; 1505} 1506 1507static struct nv50_reg * 1508tgsi_dst(struct nv50_pc 
/* Translate component chan of a TGSI source operand into an nv50_reg.
 *
 * pc:   translation context
 * chan: destination-side component (0..3); the operand's extended swizzle
 *       selects which register component is actually fetched
 * src:  TGSI source operand
 * neg:  TRUE if the consuming instruction can negate this operand through
 *       nv50_reg::neg; otherwise a negation is emitted into a temp_temp
 *
 * Returns the register to read.  That is either an entry of the per-file
 * tables in pc, a temp_temp holding the sign-adjusted value, an immediate
 * obtained from alloc_immd(), or a heap allocated P_CONST reg with
 * acc == -1 for indirectly addressed constants.  For TGSI_FILE_SAMPLER no
 * register is looked up and r stays NULL.
 *
 * nv50_program_tx_insn() resets ->neg on the regs it got from here and
 * FREEs the indirect-constant regs (and P_IMMD regs with index == -1)
 * once the instruction has been emitted.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c, swz;

	/* indirect addressing is only handled for the constant file */
	if (src->SrcRegister.File != TGSI_FILE_CONSTANT)
		assert(!src->SrcRegister.Indirect);

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
	switch (c) {
	case TGSI_EXTSWIZZLE_X:
	case TGSI_EXTSWIZZLE_Y:
	case TGSI_EXTSWIZZLE_Z:
	case TGSI_EXTSWIZZLE_W:
		/* the per-file tables hold 4 regs per TGSI register */
		switch (src->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			if (!src->SrcRegister.Indirect) {
				r = &pc->param[src->SrcRegister.Index * 4 + c];
				break;
			}
			/* Indicate indirection by setting r->acc < 0 and
			 * use the index field to select the address reg.
			 */
			/* NOTE(review): the MALLOC_STRUCT result is used
			 * without a NULL check.
			 */
			r = MALLOC_STRUCT(nv50_reg);
			swz = tgsi_util_get_src_register_swizzle(
				&src->SrcRegisterInd, 0);
			ctor_reg(r, P_CONST,
				 src->SrcRegisterInd.Index * 4 + swz, c);
			r->acc = -1;
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			/* no register; r stays NULL */
			break;
		case TGSI_FILE_ADDRESS:
			/* pc->addr holds pointers, filled in by tgsi_dst() */
			r = pc->addr[src->SrcRegister.Index * 4 + c];
			assert(r);
			break;
		default:
			assert(0);
			break;
		}
		break;
	case TGSI_EXTSWIZZLE_ZERO:
		/* constant 0.0; the sign mode is not applied */
		r = alloc_immd(pc, 0.0);
		return r;
	case TGSI_EXTSWIZZLE_ONE:
		/* constant 1.0 with the sign folded into the immediate */
		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
			return alloc_immd(pc, -1.0);
		return alloc_immd(pc, 1.0);
	default:
		assert(0);
		break;
	}

	/* apply the sign mode to the register selected above */
	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR:
		/* |r| */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		/* -r: flag it on the reg if the op can negate, else copy */
		if (neg)
			r->neg = 1;
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET:
		/* -|r|: the abs always needs a temp, the negation may not */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		if (neg)
			temp->neg = 1;
		else
			emit_neg(pc, temp, temp);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}
1658 return FALSE; 1659 } 1660} 1661 1662/* Returns a bitmask indicating which dst components depend 1663 * on source s, component c (reverse of nv50_tgsi_src_mask). 1664 */ 1665static unsigned 1666nv50_tgsi_dst_revdep(unsigned op, int s, int c) 1667{ 1668 if (is_scalar_op(op)) 1669 return 0x1; 1670 1671 switch (op) { 1672 case TGSI_OPCODE_DST: 1673 return (1 << c) & (s ? 0xa : 0x6); 1674 case TGSI_OPCODE_XPD: 1675 switch (c) { 1676 case 0: return 0x6; 1677 case 1: return 0x5; 1678 case 2: return 0x3; 1679 case 3: return 0x0; 1680 default: 1681 assert(0); 1682 return 0x0; 1683 } 1684 case TGSI_OPCODE_LIT: 1685 case TGSI_OPCODE_SCS: 1686 case TGSI_OPCODE_TEX: 1687 case TGSI_OPCODE_TXP: 1688 /* these take care of dangerous swizzles themselves */ 1689 return 0x0; 1690 case TGSI_OPCODE_IF: 1691 case TGSI_OPCODE_KIL: 1692 /* don't call this function for these ops */ 1693 assert(0); 1694 return 0; 1695 default: 1696 /* linear vector instruction */ 1697 return (1 << c); 1698 } 1699} 1700 1701static INLINE boolean 1702has_pred(struct nv50_program_exec *e, unsigned cc) 1703{ 1704 if (!is_long(e) || is_immd(e)) 1705 return FALSE; 1706 return ((e->inst[1] & 0x780) == (cc << 7)); 1707} 1708 1709/* on ENDIF see if we can do "@p0.neu single_op" instead of: 1710 * join_at ENDIF 1711 * @p0.eq bra ENDIF 1712 * single_op 1713 * ENDIF: nop.join 1714 */ 1715static boolean 1716nv50_kill_branch(struct nv50_pc *pc) 1717{ 1718 int lvl = pc->if_lvl; 1719 1720 if (pc->if_insn[lvl]->next != pc->p->exec_tail) 1721 return FALSE; 1722 1723 /* if ccode == 'true', the BRA is from an ELSE and the predicate 1724 * reg may no longer be valid, since we currently always use $p0 1725 */ 1726 if (has_pred(pc->if_insn[lvl], 0xf)) 1727 return FALSE; 1728 assert(pc->if_insn[lvl] && pc->br_join[lvl]); 1729 1730 /* We'll use the exec allocated for JOIN_AT (as we can't easily 1731 * update prev's next); if exec_tail is BRK, update the pointer. 
1732 */ 1733 if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail) 1734 pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl]; 1735 1736 pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ 1737 1738 *pc->br_join[lvl] = *pc->p->exec_tail; 1739 1740 FREE(pc->if_insn[lvl]); 1741 FREE(pc->p->exec_tail); 1742 1743 pc->p->exec_tail = pc->br_join[lvl]; 1744 pc->p->exec_tail->next = NULL; 1745 set_pred(pc, 0xd, 0, pc->p->exec_tail); 1746 1747 return TRUE; 1748} 1749 1750static boolean 1751nv50_program_tx_insn(struct nv50_pc *pc, 1752 const struct tgsi_full_instruction *inst) 1753{ 1754 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 1755 unsigned mask, sat, unit; 1756 int i, c; 1757 1758 mask = inst->FullDstRegisters[0].DstRegister.WriteMask; 1759 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 1760 1761 memset(src, 0, sizeof(src)); 1762 1763 for (c = 0; c < 4; c++) { 1764 if ((mask & (1 << c)) && !pc->r_dst[c]) 1765 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); 1766 else 1767 dst[c] = pc->r_dst[c]; 1768 rdst[c] = dst[c]; 1769 } 1770 1771 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1772 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; 1773 unsigned src_mask; 1774 boolean neg_supp; 1775 1776 src_mask = nv50_tgsi_src_mask(inst, i); 1777 neg_supp = negate_supported(inst, i); 1778 1779 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) 1780 unit = fs->SrcRegister.Index; 1781 1782 for (c = 0; c < 4; c++) 1783 if (src_mask & (1 << c)) 1784 src[i][c] = tgsi_src(pc, c, fs, neg_supp); 1785 } 1786 1787 brdc = temp = pc->r_brdc; 1788 if (brdc && brdc->type != P_TEMP) { 1789 temp = temp_temp(pc); 1790 if (sat) 1791 brdc = temp; 1792 } else 1793 if (sat) { 1794 for (c = 0; c < 4; c++) { 1795 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 1796 continue; 1797 rdst[c] = dst[c]; 1798 dst[c] = temp_temp(pc); 1799 } 1800 } 1801 1802 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 1803 1804 switch 
(inst->Instruction.Opcode) { 1805 case TGSI_OPCODE_ABS: 1806 for (c = 0; c < 4; c++) { 1807 if (!(mask & (1 << c))) 1808 continue; 1809 emit_abs(pc, dst[c], src[0][c]); 1810 } 1811 break; 1812 case TGSI_OPCODE_ADD: 1813 for (c = 0; c < 4; c++) { 1814 if (!(mask & (1 << c))) 1815 continue; 1816 emit_add(pc, dst[c], src[0][c], src[1][c]); 1817 } 1818 break; 1819 case TGSI_OPCODE_ARL: 1820 assert(src[0][0]); 1821 temp = temp_temp(pc); 1822 emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32); 1823 emit_arl(pc, dst[0], temp, 4); 1824 break; 1825 case TGSI_OPCODE_BGNLOOP: 1826 pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; 1827 terminate_mbb(pc); 1828 break; 1829 case TGSI_OPCODE_BRK: 1830 emit_branch(pc, -1, 0, NULL); 1831 assert(pc->loop_lvl > 0); 1832 pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail; 1833 break; 1834 case TGSI_OPCODE_CEIL: 1835 for (c = 0; c < 4; c++) { 1836 if (!(mask & (1 << c))) 1837 continue; 1838 emit_cvt(pc, dst[c], src[0][c], -1, 1839 CVTOP_CEIL, CVT_F32_F32 | CVT_RI); 1840 } 1841 break; 1842 case TGSI_OPCODE_CMP: 1843 pc->allow32 = FALSE; 1844 for (c = 0; c < 4; c++) { 1845 if (!(mask & (1 << c))) 1846 continue; 1847 emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32); 1848 emit_mov(pc, dst[c], src[1][c]); 1849 set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ 1850 emit_mov(pc, dst[c], src[2][c]); 1851 set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ 1852 } 1853 break; 1854 case TGSI_OPCODE_COS: 1855 if (mask & 8) { 1856 emit_precossin(pc, temp, src[0][3]); 1857 emit_flop(pc, 5, dst[3], temp); 1858 if (!(mask &= 7)) 1859 break; 1860 if (temp == dst[3]) 1861 temp = brdc = temp_temp(pc); 1862 } 1863 emit_precossin(pc, temp, src[0][0]); 1864 emit_flop(pc, 5, brdc, temp); 1865 break; 1866 case TGSI_OPCODE_DDX: 1867 for (c = 0; c < 4; c++) { 1868 if (!(mask & (1 << c))) 1869 continue; 1870 emit_ddx(pc, dst[c], src[0][c]); 1871 } 1872 break; 1873 case TGSI_OPCODE_DDY: 1874 for (c = 0; c < 4; c++) { 1875 if (!(mask & (1 << c))) 1876 
continue; 1877 emit_ddy(pc, dst[c], src[0][c]); 1878 } 1879 break; 1880 case TGSI_OPCODE_DP3: 1881 emit_mul(pc, temp, src[0][0], src[1][0]); 1882 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1883 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 1884 break; 1885 case TGSI_OPCODE_DP4: 1886 emit_mul(pc, temp, src[0][0], src[1][0]); 1887 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1888 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1889 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 1890 break; 1891 case TGSI_OPCODE_DPH: 1892 emit_mul(pc, temp, src[0][0], src[1][0]); 1893 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1894 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1895 emit_add(pc, brdc, src[1][3], temp); 1896 break; 1897 case TGSI_OPCODE_DST: 1898 if (mask & (1 << 1)) 1899 emit_mul(pc, dst[1], src[0][1], src[1][1]); 1900 if (mask & (1 << 2)) 1901 emit_mov(pc, dst[2], src[0][2]); 1902 if (mask & (1 << 3)) 1903 emit_mov(pc, dst[3], src[1][3]); 1904 if (mask & (1 << 0)) 1905 emit_mov_immdval(pc, dst[0], 1.0f); 1906 break; 1907 case TGSI_OPCODE_ELSE: 1908 emit_branch(pc, -1, 0, NULL); 1909 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 1910 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; 1911 terminate_mbb(pc); 1912 break; 1913 case TGSI_OPCODE_ENDIF: 1914 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 1915 1916 /* try to replace branch over 1 insn with a predicated insn */ 1917 if (nv50_kill_branch(pc) == TRUE) 1918 break; 1919 1920 if (pc->br_join[pc->if_lvl]) { 1921 pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size; 1922 pc->br_join[pc->if_lvl] = NULL; 1923 } 1924 terminate_mbb(pc); 1925 /* emit a NOP as join point, we could set it on the next 1926 * one, but would have to make sure it is long and !immd 1927 */ 1928 emit_nop(pc); 1929 pc->p->exec_tail->inst[1] |= 2; 1930 break; 1931 case TGSI_OPCODE_ENDLOOP: 1932 emit_branch(pc, -1, 0, NULL); 1933 pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl]; 1934 
pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size; 1935 terminate_mbb(pc); 1936 break; 1937 case TGSI_OPCODE_EX2: 1938 emit_preex2(pc, temp, src[0][0]); 1939 emit_flop(pc, 6, brdc, temp); 1940 break; 1941 case TGSI_OPCODE_FLR: 1942 for (c = 0; c < 4; c++) { 1943 if (!(mask & (1 << c))) 1944 continue; 1945 emit_flr(pc, dst[c], src[0][c]); 1946 } 1947 break; 1948 case TGSI_OPCODE_FRC: 1949 temp = temp_temp(pc); 1950 for (c = 0; c < 4; c++) { 1951 if (!(mask & (1 << c))) 1952 continue; 1953 emit_flr(pc, temp, src[0][c]); 1954 emit_sub(pc, dst[c], src[0][c], temp); 1955 } 1956 break; 1957 case TGSI_OPCODE_IF: 1958 /* emitting a join_at may not be necessary */ 1959 assert(pc->if_lvl < MAX_IF_DEPTH); 1960 set_pred_wr(pc, 1, 0, pc->if_cond); 1961 emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]); 1962 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; 1963 terminate_mbb(pc); 1964 break; 1965 case TGSI_OPCODE_KIL: 1966 emit_kil(pc, src[0][0]); 1967 emit_kil(pc, src[0][1]); 1968 emit_kil(pc, src[0][2]); 1969 emit_kil(pc, src[0][3]); 1970 break; 1971 case TGSI_OPCODE_LIT: 1972 emit_lit(pc, &dst[0], mask, &src[0][0]); 1973 break; 1974 case TGSI_OPCODE_LG2: 1975 emit_flop(pc, 3, brdc, src[0][0]); 1976 break; 1977 case TGSI_OPCODE_LRP: 1978 temp = temp_temp(pc); 1979 for (c = 0; c < 4; c++) { 1980 if (!(mask & (1 << c))) 1981 continue; 1982 emit_sub(pc, temp, src[1][c], src[2][c]); 1983 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 1984 } 1985 break; 1986 case TGSI_OPCODE_MAD: 1987 for (c = 0; c < 4; c++) { 1988 if (!(mask & (1 << c))) 1989 continue; 1990 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 1991 } 1992 break; 1993 case TGSI_OPCODE_MAX: 1994 for (c = 0; c < 4; c++) { 1995 if (!(mask & (1 << c))) 1996 continue; 1997 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 1998 } 1999 break; 2000 case TGSI_OPCODE_MIN: 2001 for (c = 0; c < 4; c++) { 2002 if (!(mask & (1 << c))) 2003 continue; 2004 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 2005 } 2006 break; 
2007 case TGSI_OPCODE_MOV: 2008 case TGSI_OPCODE_SWZ: 2009 for (c = 0; c < 4; c++) { 2010 if (!(mask & (1 << c))) 2011 continue; 2012 emit_mov(pc, dst[c], src[0][c]); 2013 } 2014 break; 2015 case TGSI_OPCODE_MUL: 2016 for (c = 0; c < 4; c++) { 2017 if (!(mask & (1 << c))) 2018 continue; 2019 emit_mul(pc, dst[c], src[0][c], src[1][c]); 2020 } 2021 break; 2022 case TGSI_OPCODE_POW: 2023 emit_pow(pc, brdc, src[0][0], src[1][0]); 2024 break; 2025 case TGSI_OPCODE_RCP: 2026 emit_flop(pc, 0, brdc, src[0][0]); 2027 break; 2028 case TGSI_OPCODE_RSQ: 2029 emit_flop(pc, 2, brdc, src[0][0]); 2030 break; 2031 case TGSI_OPCODE_SCS: 2032 temp = temp_temp(pc); 2033 if (mask & 3) 2034 emit_precossin(pc, temp, src[0][0]); 2035 if (mask & (1 << 0)) 2036 emit_flop(pc, 5, dst[0], temp); 2037 if (mask & (1 << 1)) 2038 emit_flop(pc, 4, dst[1], temp); 2039 if (mask & (1 << 2)) 2040 emit_mov_immdval(pc, dst[2], 0.0); 2041 if (mask & (1 << 3)) 2042 emit_mov_immdval(pc, dst[3], 1.0); 2043 break; 2044 case TGSI_OPCODE_SIN: 2045 if (mask & 8) { 2046 emit_precossin(pc, temp, src[0][3]); 2047 emit_flop(pc, 4, dst[3], temp); 2048 if (!(mask &= 7)) 2049 break; 2050 if (temp == dst[3]) 2051 temp = brdc = temp_temp(pc); 2052 } 2053 emit_precossin(pc, temp, src[0][0]); 2054 emit_flop(pc, 4, brdc, temp); 2055 break; 2056 case TGSI_OPCODE_SLT: 2057 case TGSI_OPCODE_SGE: 2058 case TGSI_OPCODE_SEQ: 2059 case TGSI_OPCODE_SGT: 2060 case TGSI_OPCODE_SLE: 2061 case TGSI_OPCODE_SNE: 2062 i = map_tgsi_setop_cc(inst->Instruction.Opcode); 2063 for (c = 0; c < 4; c++) { 2064 if (!(mask & (1 << c))) 2065 continue; 2066 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]); 2067 } 2068 break; 2069 case TGSI_OPCODE_SUB: 2070 for (c = 0; c < 4; c++) { 2071 if (!(mask & (1 << c))) 2072 continue; 2073 emit_sub(pc, dst[c], src[0][c], src[1][c]); 2074 } 2075 break; 2076 case TGSI_OPCODE_TEX: 2077 emit_tex(pc, dst, mask, src[0], unit, 2078 inst->InstructionExtTexture.Texture, FALSE); 2079 break; 2080 case TGSI_OPCODE_TXP: 
2081 emit_tex(pc, dst, mask, src[0], unit, 2082 inst->InstructionExtTexture.Texture, TRUE); 2083 break; 2084 case TGSI_OPCODE_TRUNC: 2085 for (c = 0; c < 4; c++) { 2086 if (!(mask & (1 << c))) 2087 continue; 2088 emit_cvt(pc, dst[c], src[0][c], -1, 2089 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI); 2090 } 2091 break; 2092 case TGSI_OPCODE_XPD: 2093 temp = temp_temp(pc); 2094 if (mask & (1 << 0)) { 2095 emit_mul(pc, temp, src[0][2], src[1][1]); 2096 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 2097 } 2098 if (mask & (1 << 1)) { 2099 emit_mul(pc, temp, src[0][0], src[1][2]); 2100 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 2101 } 2102 if (mask & (1 << 2)) { 2103 emit_mul(pc, temp, src[0][1], src[1][0]); 2104 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 2105 } 2106 if (mask & (1 << 3)) 2107 emit_mov_immdval(pc, dst[3], 1.0); 2108 break; 2109 case TGSI_OPCODE_END: 2110 break; 2111 default: 2112 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 2113 return FALSE; 2114 } 2115 2116 if (brdc) { 2117 if (sat) 2118 emit_sat(pc, brdc, brdc); 2119 for (c = 0; c < 4; c++) 2120 if ((mask & (1 << c)) && dst[c] != brdc) 2121 emit_mov(pc, dst[c], brdc); 2122 } else 2123 if (sat) { 2124 for (c = 0; c < 4; c++) { 2125 if (!(mask & (1 << c))) 2126 continue; 2127 /* in this case we saturate later */ 2128 if (dst[c]->type == P_TEMP && dst[c]->index < 0) 2129 continue; 2130 emit_sat(pc, rdst[c], dst[c]); 2131 } 2132 } 2133 2134 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 2135 for (c = 0; c < 4; c++) { 2136 if (!src[i][c]) 2137 continue; 2138 src[i][c]->neg = 0; 2139 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) 2140 FREE(src[i][c]); 2141 else 2142 if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST) 2143 FREE(src[i][c]); /* indirect constant */ 2144 } 2145 } 2146 2147 kill_temp_temp(pc); 2148 return TRUE; 2149} 2150 2151static void 2152prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) 2153{ 2154 struct nv50_reg *reg = 
/* Determine an order for writing the dst components that avoids
 * clobbering sources that are still needed, and report the components
 * for which no such order was found.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of the dst components that require dst[c]
 *               as a source
 *
 * Returns a bitmask with bit i set if the component written in the i-th
 * position (i.e. m[i], not component i) is still read by a component
 * written later; the caller has to redirect those writes to temporaries.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is only ever OR-ed into below, so it must start at 0 */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart; the loop increment makes the scan resume at
		 * position 1
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}
2272 */ 2273static unsigned 2274nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, 2275 unsigned rdep[4]) 2276{ 2277 const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0]; 2278 const struct tgsi_full_src_register *fs; 2279 unsigned i, deqs = 0; 2280 2281 for (i = 0; i < 4; ++i) 2282 rdep[i] = 0; 2283 2284 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 2285 unsigned chn, mask = nv50_tgsi_src_mask(insn, i); 2286 boolean neg_supp = negate_supported(insn, i); 2287 2288 fs = &insn->FullSrcRegisters[i]; 2289 if (fs->SrcRegister.File != fd->DstRegister.File || 2290 fs->SrcRegister.Index != fd->DstRegister.Index) 2291 continue; 2292 2293 for (chn = 0; chn < 4; ++chn) { 2294 unsigned s, c; 2295 2296 if (!(mask & (1 << chn))) /* src is not read */ 2297 continue; 2298 c = tgsi_util_get_full_src_register_extswizzle(fs, chn); 2299 s = tgsi_util_get_full_src_register_sign_mode(fs, chn); 2300 2301 if (c > TGSI_EXTSWIZZLE_W || 2302 !(fd->DstRegister.WriteMask & (1 << c))) 2303 continue; 2304 2305 /* no danger if src is copied to TEMP first */ 2306 if ((s != TGSI_UTIL_SIGN_KEEP) && 2307 (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp)) 2308 continue; 2309 2310 rdep[c] |= nv50_tgsi_dst_revdep( 2311 insn->Instruction.Opcode, i, chn); 2312 deqs |= (1 << c); 2313 } 2314 } 2315 2316 return deqs; 2317} 2318 2319static boolean 2320nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) 2321{ 2322 struct tgsi_full_instruction insn = tok->FullInstruction; 2323 const struct tgsi_full_dst_register *fd; 2324 unsigned i, deqs, rdep[4], m[4]; 2325 2326 fd = &tok->FullInstruction.FullDstRegisters[0]; 2327 deqs = nv50_tgsi_scan_swizzle(&insn, rdep); 2328 2329 if (is_scalar_op(insn.Instruction.Opcode)) { 2330 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); 2331 if (!pc->r_brdc) 2332 pc->r_brdc = temp_temp(pc); 2333 return nv50_program_tx_insn(pc, &insn); 2334 } 2335 pc->r_brdc = NULL; 2336 2337 if (!deqs) 2338 return nv50_program_tx_insn(pc, &insn); 2339 2340 
/* Emit the interpolation of fragment program input reg.
 *
 * The interpolation mode is looked up in pc->interp_mode[] by reg->index.
 * For perspective modes a helper value is needed; it is computed once per
 * sampling location and cached in pc->iv_c (centroid) or pc->iv_p
 * (otherwise).  nv50_program_tx_prep() releases both temps when it is
 * done.
 */
static void
load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *iv, **ppiv;
	unsigned mode = pc->interp_mode[reg->index];

	/* pick the cache slot matching the sampling location */
	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
	iv = *ppiv;

	if ((mode & INTERP_PERSPECTIVE) && !iv) {
		iv = *ppiv = alloc_temp(pc, NULL);
		/* last of the inputs enabled in cfg.regs[1] bits 24-27
		 * (presumably position.w - TODO confirm)
		 */
		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;

		/* interpolate it and take the reciprocal (flop 0 == RCP) */
		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
		emit_flop(pc, 0, iv, iv);

		/* XXX: when loading interpolants dynamically, move these
		 * to the program head, or make sure it can't be skipped.
		 */
	}

	/* iv may be NULL here if no perspective helper was needed so far */
	emit_interp(pc, reg, iv, mode);
}
p->cfg.regs[1] |= 0x08 << 24; 2474 break; 2475 default: 2476 mode = INTERP_LINEAR; 2477 break; 2478 } 2479 if (d->Declaration.Centroid) 2480 mode |= INTERP_CENTROID; 2481 2482 assert(last < 32); 2483 for (i = first; i <= last; i++) 2484 pc->interp_mode[i] = mode; 2485 } 2486 break; 2487 case TGSI_FILE_ADDRESS: 2488 case TGSI_FILE_CONSTANT: 2489 case TGSI_FILE_SAMPLER: 2490 break; 2491 default: 2492 NOUVEAU_ERR("bad decl file %d\n", 2493 d->Declaration.File); 2494 goto out_err; 2495 } 2496 } 2497 break; 2498 case TGSI_TOKEN_TYPE_INSTRUCTION: 2499 pc->insn_nr++; 2500 prep_inspect_insn(pc, &tok->FullInstruction); 2501 break; 2502 default: 2503 break; 2504 } 2505 } 2506 2507 if (p->type == PIPE_SHADER_VERTEX) { 2508 int rid = 0; 2509 2510 for (i = 0; i < pc->attr_nr * 4; ++i) { 2511 if (pc->attr[i].acc) { 2512 pc->attr[i].hw = rid++; 2513 p->cfg.attr[i / 32] |= 1 << (i % 32); 2514 } 2515 } 2516 2517 for (i = 0, rid = 0; i < pc->result_nr; ++i) { 2518 p->cfg.io[i].hw = rid; 2519 p->cfg.io[i].id_vp = i; 2520 2521 for (c = 0; c < 4; ++c) { 2522 int n = i * 4 + c; 2523 if (!pc->result[n].acc) 2524 continue; 2525 pc->result[n].hw = rid++; 2526 p->cfg.io[i].mask |= 1 << c; 2527 } 2528 } 2529 2530 for (c = 0; c < 2; ++c) 2531 if (p->cfg.two_side[c].hw < 0x40) 2532 p->cfg.two_side[c] = p->cfg.io[ 2533 p->cfg.two_side[c].hw]; 2534 2535 if (p->cfg.psiz < 0x40) 2536 p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw; 2537 } else 2538 if (p->type == PIPE_SHADER_FRAGMENT) { 2539 int rid, aid; 2540 unsigned n = 0, m = pc->attr_nr - flat_nr; 2541 2542 int base = (TGSI_SEMANTIC_POSITION == 2543 p->info.input_semantic_name[0]) ? 
0 : 1; 2544 2545 /* non-flat interpolants have to be mapped to 2546 * the lower hardware IDs, so sort them: 2547 */ 2548 for (i = 0; i < pc->attr_nr; i++) { 2549 if (pc->interp_mode[i] == INTERP_FLAT) { 2550 p->cfg.io[m].id_vp = i + base; 2551 p->cfg.io[m++].id_fp = i; 2552 } else { 2553 if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) 2554 p->cfg.io[n].linear = TRUE; 2555 p->cfg.io[n].id_vp = i + base; 2556 p->cfg.io[n++].id_fp = i; 2557 } 2558 } 2559 2560 if (!base) /* set w-coordinate mask from perspective interp */ 2561 p->cfg.io[0].mask |= p->cfg.regs[1] >> 24; 2562 2563 aid = popcnt4( /* if fcrd isn't contained in cfg.io */ 2564 base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask); 2565 2566 for (n = 0; n < pc->attr_nr; ++n) { 2567 p->cfg.io[n].hw = rid = aid; 2568 i = p->cfg.io[n].id_fp; 2569 2570 for (c = 0; c < 4; ++c) { 2571 if (!pc->attr[i * 4 + c].acc) 2572 continue; 2573 pc->attr[i * 4 + c].rhw = rid++; 2574 p->cfg.io[n].mask |= 1 << c; 2575 2576 load_interpolant(pc, &pc->attr[i * 4 + c]); 2577 } 2578 aid += popcnt4(p->cfg.io[n].mask); 2579 } 2580 2581 if (!base) 2582 p->cfg.regs[1] |= p->cfg.io[0].mask << 24; 2583 2584 m = popcnt4(p->cfg.regs[1] >> 24); 2585 2586 /* set count of non-position inputs and of non-flat 2587 * non-position inputs for FP_INTERPOLANT_CTRL 2588 */ 2589 p->cfg.regs[1] |= aid - m; 2590 2591 if (flat_nr) { 2592 i = p->cfg.io[pc->attr_nr - flat_nr].hw; 2593 p->cfg.regs[1] |= (i - m) << 16; 2594 } else 2595 p->cfg.regs[1] |= p->cfg.regs[1] << 16; 2596 2597 /* mark color semantic for light-twoside */ 2598 n = 0x40; 2599 for (i = 0; i < pc->attr_nr; i++) { 2600 ubyte si, sn; 2601 2602 sn = p->info.input_semantic_name[p->cfg.io[i].id_fp]; 2603 si = p->info.input_semantic_index[p->cfg.io[i].id_fp]; 2604 2605 if (sn == TGSI_SEMANTIC_COLOR) { 2606 p->cfg.two_side[si] = p->cfg.io[i]; 2607 2608 /* increase colour count */ 2609 p->cfg.regs[0] += popcnt4( 2610 p->cfg.two_side[si].mask) << 16; 2611 2612 n = MIN2(n, p->cfg.io[i].hw - m); 2613 } 
2614 } 2615 if (n < 0x40) 2616 p->cfg.regs[0] += n; 2617 2618 /* Initialize FP results: 2619 * FragDepth is always first TGSI and last hw output 2620 */ 2621 i = p->info.writes_z ? 4 : 0; 2622 for (rid = 0; i < pc->result_nr * 4; i++) 2623 pc->result[i].rhw = rid++; 2624 if (p->info.writes_z) 2625 pc->result[2].rhw = rid; 2626 2627 p->cfg.high_result = rid; 2628 } 2629 2630 if (pc->immd_nr) { 2631 int rid = 0; 2632 2633 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); 2634 if (!pc->immd) 2635 goto out_err; 2636 2637 for (i = 0; i < pc->immd_nr; i++) { 2638 for (c = 0; c < 4; c++, rid++) 2639 ctor_reg(&pc->immd[rid], P_IMMD, i, rid); 2640 } 2641 } 2642 2643 ret = TRUE; 2644out_err: 2645 if (pc->iv_p) 2646 free_temp(pc, pc->iv_p); 2647 if (pc->iv_c) 2648 free_temp(pc, pc->iv_c); 2649 2650 tgsi_parse_free(&tp); 2651 return ret; 2652} 2653 2654static void 2655free_nv50_pc(struct nv50_pc *pc) 2656{ 2657 if (pc->immd) 2658 FREE(pc->immd); 2659 if (pc->param) 2660 FREE(pc->param); 2661 if (pc->result) 2662 FREE(pc->result); 2663 if (pc->attr) 2664 FREE(pc->attr); 2665 if (pc->temp) 2666 FREE(pc->temp); 2667 2668 FREE(pc); 2669} 2670 2671static boolean 2672ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) 2673{ 2674 int i, c; 2675 unsigned rtype[2] = { P_ATTR, P_RESULT }; 2676 2677 pc->p = p; 2678 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; 2679 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; 2680 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; 2681 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; 2682 pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; 2683 assert(pc->addr_nr <= 2); 2684 2685 p->cfg.high_temp = 4; 2686 2687 p->cfg.two_side[0].hw = 0x40; 2688 p->cfg.two_side[1].hw = 0x40; 2689 2690 switch (p->type) { 2691 case PIPE_SHADER_VERTEX: 2692 p->cfg.psiz = 0x40; 2693 p->cfg.clpd = 0x40; 2694 p->cfg.io_nr = pc->result_nr; 2695 break; 2696 case PIPE_SHADER_FRAGMENT: 2697 rtype[0] = rtype[1] = P_TEMP; 2698 2699 
p->cfg.regs[0] = 0x01000004; 2700 p->cfg.io_nr = pc->attr_nr; 2701 2702 if (p->info.writes_z) { 2703 p->cfg.regs[2] |= 0x00000100; 2704 p->cfg.regs[3] |= 0x00000011; 2705 } 2706 if (p->info.uses_kill) 2707 p->cfg.regs[2] |= 0x00100000; 2708 break; 2709 } 2710 2711 if (pc->temp_nr) { 2712 pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); 2713 if (!pc->temp) 2714 return FALSE; 2715 2716 for (i = 0; i < pc->temp_nr * 4; ++i) 2717 ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); 2718 } 2719 2720 if (pc->attr_nr) { 2721 pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); 2722 if (!pc->attr) 2723 return FALSE; 2724 2725 for (i = 0; i < pc->attr_nr * 4; ++i) 2726 ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); 2727 } 2728 2729 if (pc->result_nr) { 2730 unsigned nr = pc->result_nr * 4; 2731 2732 pc->result = MALLOC(nr * sizeof(struct nv50_reg)); 2733 if (!pc->result) 2734 return FALSE; 2735 2736 for (i = 0; i < nr; ++i) 2737 ctor_reg(&pc->result[i], rtype[1], i / 4, -1); 2738 } 2739 2740 if (pc->param_nr) { 2741 int rid = 0; 2742 2743 pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); 2744 if (!pc->param) 2745 return FALSE; 2746 2747 for (i = 0; i < pc->param_nr; ++i) 2748 for (c = 0; c < 4; ++c, ++rid) 2749 ctor_reg(&pc->param[rid], P_CONST, i, rid); 2750 } 2751 2752 if (pc->addr_nr) { 2753 pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); 2754 if (!pc->addr) 2755 return FALSE; 2756 } 2757 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 2758 ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1); 2759 2760 return TRUE; 2761} 2762 2763static void 2764nv50_fp_move_results(struct nv50_pc *pc) 2765{ 2766 struct nv50_reg reg; 2767 unsigned i; 2768 2769 ctor_reg(®, P_TEMP, -1, -1); 2770 2771 for (i = 0; i < pc->result_nr * 4; ++i) { 2772 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) 2773 continue; 2774 if (pc->result[i].rhw != pc->result[i].hw) { 2775 reg.hw = pc->result[i].rhw; 2776 emit_mov(pc, ®, &pc->result[i]); 2777 } 2778 } 2779} 2780 2781static 
void 2782nv50_program_fixup_insns(struct nv50_pc *pc) 2783{ 2784 struct nv50_program_exec *e, *prev = NULL, **bra_list; 2785 unsigned i, n, pos; 2786 2787 bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); 2788 2789 /* Collect branch instructions, we need to adjust their offsets 2790 * when converting 32 bit instructions to 64 bit ones 2791 */ 2792 for (n = 0, e = pc->p->exec_head; e; e = e->next) 2793 if (e->param.index >= 0 && !e->param.mask) 2794 bra_list[n++] = e; 2795 2796 /* Make sure we don't have any single 32 bit instructions. */ 2797 for (e = pc->p->exec_head, pos = 0; e; e = e->next) { 2798 pos += is_long(e) ? 2 : 1; 2799 2800 if ((pos & 1) && (!e->next || is_long(e->next))) { 2801 for (i = 0; i < n; ++i) 2802 if (bra_list[i]->param.index >= pos) 2803 bra_list[i]->param.index += 1; 2804 convert_to_long(pc, e); 2805 ++pos; 2806 } 2807 if (e->next) 2808 prev = e; 2809 } 2810 2811 assert(!is_immd(pc->p->exec_head)); 2812 assert(!is_immd(pc->p->exec_tail)); 2813 2814 /* last instruction must be long so it can have the end bit set */ 2815 if (!is_long(pc->p->exec_tail)) { 2816 convert_to_long(pc, pc->p->exec_tail); 2817 if (prev) 2818 convert_to_long(pc, prev); 2819 } 2820 assert(!(pc->p->exec_tail->inst[1] & 2)); 2821 /* set the end-bit */ 2822 pc->p->exec_tail->inst[1] |= 1; 2823 2824 FREE(bra_list); 2825} 2826 2827static boolean 2828nv50_program_tx(struct nv50_program *p) 2829{ 2830 struct tgsi_parse_context parse; 2831 struct nv50_pc *pc; 2832 boolean ret; 2833 2834 pc = CALLOC_STRUCT(nv50_pc); 2835 if (!pc) 2836 return FALSE; 2837 2838 ret = ctor_nv50_pc(pc, p); 2839 if (ret == FALSE) 2840 goto out_cleanup; 2841 2842 ret = nv50_program_tx_prep(pc); 2843 if (ret == FALSE) 2844 goto out_cleanup; 2845 2846 tgsi_parse_init(&parse, pc->p->pipe.tokens); 2847 while (!tgsi_parse_end_of_tokens(&parse)) { 2848 const union tgsi_full_token *tok = &parse.FullToken; 2849 2850 /* don't allow half insn/immd on first and last instruction */ 2851 
pc->allow32 = TRUE; 2852 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) 2853 pc->allow32 = FALSE; 2854 2855 tgsi_parse_token(&parse); 2856 2857 switch (tok->Token.Type) { 2858 case TGSI_TOKEN_TYPE_INSTRUCTION: 2859 ++pc->insn_cur; 2860 ret = nv50_tgsi_insn(pc, tok); 2861 if (ret == FALSE) 2862 goto out_err; 2863 break; 2864 default: 2865 break; 2866 } 2867 } 2868 2869 if (pc->p->type == PIPE_SHADER_FRAGMENT) 2870 nv50_fp_move_results(pc); 2871 2872 nv50_program_fixup_insns(pc); 2873 2874 p->param_nr = pc->param_nr * 4; 2875 p->immd_nr = pc->immd_nr * 4; 2876 p->immd = pc->immd_buf; 2877 2878out_err: 2879 tgsi_parse_free(&parse); 2880 2881out_cleanup: 2882 free_nv50_pc(pc); 2883 return ret; 2884} 2885 2886static void 2887nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) 2888{ 2889 if (nv50_program_tx(p) == FALSE) 2890 assert(0); 2891 p->translated = TRUE; 2892} 2893 2894static void 2895nv50_program_upload_data(struct nv50_context *nv50, float *map, 2896 unsigned start, unsigned count, unsigned cbuf) 2897{ 2898 struct nouveau_channel *chan = nv50->screen->base.channel; 2899 struct nouveau_grobj *tesla = nv50->screen->tesla; 2900 2901 while (count) { 2902 unsigned nr = count > 2047 ? 
2047 : count; 2903 2904 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2905 OUT_RING (chan, (cbuf << 0) | (start << 8)); 2906 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2907 OUT_RINGp (chan, map, nr); 2908 2909 map += nr; 2910 start += nr; 2911 count -= nr; 2912 } 2913} 2914 2915static void 2916nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 2917{ 2918 struct pipe_screen *pscreen = nv50->pipe.screen; 2919 2920 if (!p->data[0] && p->immd_nr) { 2921 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 2922 2923 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 2924 while (heap->next && heap->size < p->immd_nr) { 2925 struct nv50_program *evict = heap->next->priv; 2926 nouveau_resource_free(&evict->data[0]); 2927 } 2928 2929 if (nouveau_resource_alloc(heap, p->immd_nr, p, 2930 &p->data[0])) 2931 assert(0); 2932 } 2933 2934 /* immediates only need to be uploaded again when freed */ 2935 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 2936 p->immd_nr, NV50_CB_PMISC); 2937 } 2938 2939 assert(p->param_nr <= 512); 2940 2941 if (p->param_nr) { 2942 unsigned cb; 2943 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 2944 PIPE_BUFFER_USAGE_CPU_READ); 2945 2946 if (p->type == PIPE_SHADER_VERTEX) 2947 cb = NV50_CB_PVP; 2948 else 2949 cb = NV50_CB_PFP; 2950 2951 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); 2952 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 2953 } 2954} 2955 2956static void 2957nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 2958{ 2959 struct nouveau_channel *chan = nv50->screen->base.channel; 2960 struct nouveau_grobj *tesla = nv50->screen->tesla; 2961 struct nv50_program_exec *e; 2962 struct nouveau_stateobj *so; 2963 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; 2964 unsigned start, count, *up, *ptr; 2965 boolean upload = FALSE; 2966 2967 if (!p->bo) { 2968 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 
2969 p->exec_size * 4, &p->bo); 2970 upload = TRUE; 2971 } 2972 2973 if (p->data[0] && p->data[0]->start != p->data_start[0]) 2974 upload = TRUE; 2975 2976 if (!upload) 2977 return; 2978 2979 for (e = p->exec_head; e; e = e->next) { 2980 unsigned ei, ci, bs; 2981 2982 if (e->param.index < 0) 2983 continue; 2984 2985 if (e->param.mask == 0) { 2986 assert(!(e->param.index & 1)); 2987 /* seem to be 8 byte steps */ 2988 ei = (e->param.index >> 1) + 0 /* START_ID */; 2989 2990 e->inst[0] &= 0xf0000fff; 2991 e->inst[0] |= ei << 12; 2992 continue; 2993 } 2994 2995 bs = (e->inst[1] >> 22) & 0x07; 2996 assert(bs < 2); 2997 ei = e->param.shift >> 5; 2998 ci = e->param.index; 2999 if (bs == 0) 3000 ci += p->data[bs]->start; 3001 3002 e->inst[ei] &= ~e->param.mask; 3003 e->inst[ei] |= (ci << e->param.shift); 3004 } 3005 3006 if (p->data[0]) 3007 p->data_start[0] = p->data[0]->start; 3008 3009#ifdef NV50_PROGRAM_DUMP 3010 NOUVEAU_ERR("-------\n"); 3011 for (e = p->exec_head; e; e = e->next) { 3012 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 3013 if (is_long(e)) 3014 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 3015 } 3016#endif 3017 3018 up = ptr = MALLOC(p->exec_size * 4); 3019 for (e = p->exec_head; e; e = e->next) { 3020 *(ptr++) = e->inst[0]; 3021 if (is_long(e)) 3022 *(ptr++) = e->inst[1]; 3023 } 3024 3025 so = so_new(4,2); 3026 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); 3027 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); 3028 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); 3029 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); 3030 3031 start = 0; count = p->exec_size; 3032 while (count) { 3033 struct nouveau_channel *chan = nv50->screen->base.channel; 3034 unsigned nr; 3035 3036 so_emit(chan, so); 3037 3038 nr = MIN2(count, 2047); 3039 nr = MIN2(chan->pushbuf->remaining, nr); 3040 if (chan->pushbuf->remaining < (nr + 3)) { 3041 FIRE_RING(chan); 3042 continue; 3043 } 3044 3045 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 3046 
OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); 3047 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 3048 OUT_RINGp (chan, up + start, nr); 3049 3050 start += nr; 3051 count -= nr; 3052 } 3053 3054 FREE(up); 3055 so_ref(NULL, &so); 3056} 3057 3058void 3059nv50_vertprog_validate(struct nv50_context *nv50) 3060{ 3061 struct nouveau_grobj *tesla = nv50->screen->tesla; 3062 struct nv50_program *p = nv50->vertprog; 3063 struct nouveau_stateobj *so; 3064 3065 if (!p->translated) { 3066 nv50_program_validate(nv50, p); 3067 if (!p->translated) 3068 assert(0); 3069 } 3070 3071 nv50_program_validate_data(nv50, p); 3072 nv50_program_validate_code(nv50, p); 3073 3074 so = so_new(13, 2); 3075 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 3076 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3077 NOUVEAU_BO_HIGH, 0, 0); 3078 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3079 NOUVEAU_BO_LOW, 0, 0); 3080 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 3081 so_data (so, p->cfg.attr[0]); 3082 so_data (so, p->cfg.attr[1]); 3083 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 3084 so_data (so, p->cfg.high_result); 3085 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 3086 so_data (so, p->cfg.high_result); //8); 3087 so_data (so, p->cfg.high_temp); 3088 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 3089 so_data (so, 0); /* program start offset */ 3090 so_ref(so, &nv50->state.vertprog); 3091 so_ref(NULL, &so); 3092} 3093 3094void 3095nv50_fragprog_validate(struct nv50_context *nv50) 3096{ 3097 struct nouveau_grobj *tesla = nv50->screen->tesla; 3098 struct nv50_program *p = nv50->fragprog; 3099 struct nouveau_stateobj *so; 3100 3101 if (!p->translated) { 3102 nv50_program_validate(nv50, p); 3103 if (!p->translated) 3104 assert(0); 3105 } 3106 3107 nv50_program_validate_data(nv50, p); 3108 nv50_program_validate_code(nv50, p); 3109 3110 so = so_new(64, 2); 3111 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 3112 so_reloc (so, p->bo, 0, 
NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3113 NOUVEAU_BO_HIGH, 0, 0); 3114 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3115 NOUVEAU_BO_LOW, 0, 0); 3116 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); 3117 so_data (so, p->cfg.high_temp); 3118 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 3119 so_data (so, p->cfg.high_result); 3120 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1); 3121 so_data (so, p->cfg.regs[2]); 3122 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 3123 so_data (so, p->cfg.regs[3]); 3124 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 3125 so_data (so, 0); /* program start offset */ 3126 so_ref(so, &nv50->state.fragprog); 3127 so_ref(NULL, &so); 3128} 3129 3130static void 3131nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) 3132{ 3133 struct nv50_program *fp = nv50->fragprog; 3134 struct nv50_program *vp = nv50->vertprog; 3135 unsigned i, c, m = base; 3136 3137 /* XXX: This can't work correctly in all cases yet, we either 3138 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has 3139 * to be per FP input instead of per VP output 3140 */ 3141 memset(pntc, 0, 8 * sizeof(uint32_t)); 3142 3143 for (i = 0; i < fp->cfg.io_nr; i++) { 3144 uint8_t sn, si; 3145 uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp; 3146 unsigned n = popcnt4(fp->cfg.io[i].mask); 3147 3148 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { 3149 m += n; 3150 continue; 3151 } 3152 3153 sn = vp->info.input_semantic_name[j]; 3154 si = vp->info.input_semantic_index[j]; 3155 3156 if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) { 3157 ubyte mode = 3158 nv50->rasterizer->pipe.sprite_coord_mode[si]; 3159 3160 if (mode == PIPE_SPRITE_COORD_NONE) { 3161 m += n; 3162 continue; 3163 } 3164 } 3165 3166 /* this is either PointCoord or replaced by sprite coords */ 3167 for (c = 0; c < 4; c++) { 3168 if (!(fp->cfg.io[i].mask & (1 << c))) 3169 continue; 3170 pntc[m / 8] |= (c + 1) << ((m % 8) * 4); 3171 ++m; 3172 } 3173 
} 3174} 3175 3176static int 3177nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4], 3178 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) 3179{ 3180 int c; 3181 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; 3182 uint8_t *map = (uint8_t *)p_map; 3183 3184 for (c = 0; c < 4; ++c) { 3185 if (mf & 1) { 3186 if (fpi->linear == TRUE) 3187 lin[mid / 32] |= 1 << (mid % 32); 3188 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40); 3189 } 3190 3191 oid += mv & 1; 3192 mf >>= 1; 3193 mv >>= 1; 3194 } 3195 3196 return mid; 3197} 3198 3199void 3200nv50_linkage_validate(struct nv50_context *nv50) 3201{ 3202 struct nouveau_grobj *tesla = nv50->screen->tesla; 3203 struct nv50_program *vp = nv50->vertprog; 3204 struct nv50_program *fp = nv50->fragprog; 3205 struct nouveau_stateobj *so; 3206 struct nv50_sreg4 dummy, *vpo; 3207 int i, n, c, m = 0; 3208 uint32_t map[16], lin[4], reg[5], pcrd[8]; 3209 3210 memset(map, 0, sizeof(map)); 3211 memset(lin, 0, sizeof(lin)); 3212 3213 reg[1] = 0x00000004; /* low and high clip distance map ids */ 3214 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ 3215 reg[3] = 0x00000000; /* point size map id & enable */ 3216 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ 3217 reg[4] = fp->cfg.regs[1]; /* interpolant info */ 3218 3219 dummy.linear = FALSE; 3220 dummy.mask = 0xf; /* map all components of HPOS */ 3221 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); 3222 3223 dummy.mask = 0x0; 3224 3225 if (vp->cfg.clpd < 0x40) { 3226 for (c = 0; c < vp->cfg.clpd_nr; ++c) 3227 map[m++] = vp->cfg.clpd + c; 3228 reg[1] = (m << 8); 3229 } 3230 3231 reg[0] |= m << 8; /* adjust BFC0 id */ 3232 3233 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ 3234 if (nv50->rasterizer->pipe.light_twoside) { 3235 vpo = &vp->cfg.two_side[0]; 3236 3237 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); 3238 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); 3239 } 3240 3241 reg[0] += m - 
4; /* adjust FFC0 id */ 3242 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ 3243 3244 i = 0; 3245 if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) 3246 i = 1; 3247 for (; i < fp->cfg.io_nr; i++) { 3248 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp]; 3249 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp]; 3250 3251 n = fp->cfg.io[i].id_vp; 3252 if (n >= vp->cfg.io_nr || 3253 vp->info.output_semantic_name[n] != sn || 3254 vp->info.output_semantic_index[n] != si) 3255 vpo = &dummy; 3256 else 3257 vpo = &vp->cfg.io[n]; 3258 3259 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); 3260 } 3261 3262 if (nv50->rasterizer->pipe.point_size_per_vertex) { 3263 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); 3264 reg[3] = (m++ << 4) | 1; 3265 } 3266 3267 /* now fill the stateobj */ 3268 so = so_new(64, 0); 3269 3270 n = (m + 3) / 4; 3271 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); 3272 so_data (so, m); 3273 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); 3274 so_datap (so, map, n); 3275 3276 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 3277 so_datap (so, reg, 4); 3278 3279 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); 3280 so_data (so, reg[4]); 3281 3282 so_method(so, tesla, 0x1540, 4); 3283 so_datap (so, lin, 4); 3284 3285 if (nv50->rasterizer->pipe.point_sprite) { 3286 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff); 3287 3288 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); 3289 so_datap (so, pcrd, 8); 3290 } 3291 3292 so_ref(so, &nv50->state.programs); 3293 so_ref(NULL, &so); 3294} 3295 3296void 3297nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 3298{ 3299 while (p->exec_head) { 3300 struct nv50_program_exec *e = p->exec_head; 3301 3302 p->exec_head = e->next; 3303 FREE(e); 3304 } 3305 p->exec_tail = NULL; 3306 p->exec_size = 0; 3307 3308 nouveau_bo_ref(NULL, &p->bo); 3309 3310 nouveau_resource_free(&p->data[0]); 3311 3312 p->translated = 0; 3313} 3314