nv50_program.c revision ccc7d0cb7afdac3bca985b7326b53e5c8bf83b3a
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 127 35#define NV50_SU_MAX_ADDR 4 36//#define NV50_PROGRAM_DUMP 37 38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ 39 40/* ARL - gallium craps itself on progs/vp/arl.txt 41 * 42 * MSB - Like MAD, but MUL+SUB 43 * - Fuck it off, introduce a way to negate args for ops that 44 * support it. 45 * 46 * Look into inlining IMMD for ops other than MOV (make it general?) 47 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 48 * but can emit to P_TEMP first - then MOV later. 
NVIDIA does this 49 * 50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 51 * case, if the emit_src() causes the inst to suddenly become long. 52 * 53 * Verify half-insns work where expected - and force disable them where they 54 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 55 * 56 * FUCK! watch dst==src vectors, can overwrite components that are needed. 57 * ie. SUB R0, R0.yzxw, R0 58 * 59 * Things to check with renouveau: 60 * FP attr/result assignment - how? 61 * attrib 62 * - 0x16bc maps vp output onto fp hpos 63 * - 0x16c0 maps vp output onto fp col0 64 * result 65 * - colr always 0-3 66 * - depr always 4 67 * 0x16bc->0x16e8 --> some binding between vp/fp regs 68 * 0x16b8 --> VP output count 69 * 70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 71 * "MOV rcol.x, fcol.y" = 0x00000004 72 * 0x19a8 --> as above but 0x00000100 and 0x00000000 73 * - 0x00100000 used when KIL used 74 * 0x196c --> as above but 0x00000011 and 0x00000000 75 * 76 * 0x1988 --> 0xXXNNNNNN 77 * - XX == FP high something 78 */ 79struct nv50_reg { 80 enum { 81 P_TEMP, 82 P_ATTR, 83 P_RESULT, 84 P_CONST, 85 P_IMMD, 86 P_ADDR 87 } type; 88 int index; 89 90 int hw; 91 int mod; 92 93 int rhw; /* result hw for FP outputs, or interpolant index */ 94 int acc; /* instruction where this reg is last read (first insn == 1) */ 95}; 96 97#define NV50_MOD_NEG 1 98#define NV50_MOD_ABS 2 99#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) 100#define NV50_MOD_SAT 4 101#define NV50_MOD_I32 8 102 103/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ 104 105/* STACK: Conditionals and loops have to use the (per warp) stack. 106 * Stack entries consist of an entry type (divergent path, join at), 107 * a mask indicating the active threads of the warp, and an address. 108 * MPs can store 12 stack entries internally, if we need more (and 109 * we probably do), we have to create a stack buffer in VRAM. 
110 */ 111/* impose low limits for now */ 112#define NV50_MAX_COND_NESTING 4 113#define NV50_MAX_LOOP_NESTING 3 114 115#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 116 117struct nv50_pc { 118 struct nv50_program *p; 119 120 /* hw resources */ 121 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 122 struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; 123 124 /* tgsi resources */ 125 struct nv50_reg *temp; 126 int temp_nr; 127 struct nv50_reg *attr; 128 int attr_nr; 129 struct nv50_reg *result; 130 int result_nr; 131 struct nv50_reg *param; 132 int param_nr; 133 struct nv50_reg *immd; 134 uint32_t *immd_buf; 135 int immd_nr; 136 struct nv50_reg **addr; 137 int addr_nr; 138 uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */ 139 140 struct nv50_reg *temp_temp[16]; 141 unsigned temp_temp_nr; 142 143 /* broadcast and destination replacement regs */ 144 struct nv50_reg *r_brdc; 145 struct nv50_reg *r_dst[4]; 146 147 struct nv50_reg reg_instances[16]; 148 unsigned reg_instance_nr; 149 150 unsigned interp_mode[32]; 151 /* perspective interpolation registers */ 152 struct nv50_reg *iv_p; 153 struct nv50_reg *iv_c; 154 155 struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; 156 struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; 157 struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; 158 int if_lvl, loop_lvl; 159 unsigned loop_pos[NV50_MAX_LOOP_NESTING]; 160 161 unsigned *insn_pos; /* actual program offset of each TGSI insn */ 162 boolean in_subroutine; 163 164 /* current instruction and total number of insns */ 165 unsigned insn_cur; 166 unsigned insn_nr; 167 168 boolean allow32; 169 170 uint8_t edgeflag_out; 171}; 172 173static INLINE void 174ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) 175{ 176 reg->type = type; 177 reg->index = index; 178 reg->hw = hw; 179 reg->mod = 0; 180 reg->rhw = -1; 181 reg->acc = 0; 182} 183 184static INLINE unsigned 185popcnt4(uint32_t val) 186{ 187 static const unsigned cnt[16] 188 = { 0, 1, 1, 
2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 189 return cnt[val & 0xf]; 190} 191 192static void 193terminate_mbb(struct nv50_pc *pc) 194{ 195 int i; 196 197 /* remove records of temporary address register values */ 198 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 199 pc->r_addr[i].rhw = -1; 200} 201 202static void 203alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 204{ 205 int i = 0; 206 207 if (reg->type == P_RESULT) { 208 if (pc->p->cfg.high_result < (reg->hw + 1)) 209 pc->p->cfg.high_result = reg->hw + 1; 210 } 211 212 if (reg->type != P_TEMP) 213 return; 214 215 if (reg->hw >= 0) { 216 /*XXX: do this here too to catch FP temp-as-attr usage.. 217 * not clean, but works */ 218 if (pc->p->cfg.high_temp < (reg->hw + 1)) 219 pc->p->cfg.high_temp = reg->hw + 1; 220 return; 221 } 222 223 if (reg->rhw != -1) { 224 /* try to allocate temporary with index rhw first */ 225 if (!(pc->r_temp[reg->rhw])) { 226 pc->r_temp[reg->rhw] = reg; 227 reg->hw = reg->rhw; 228 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 229 pc->p->cfg.high_temp = reg->rhw + 1; 230 return; 231 } 232 /* make sure we don't get things like $r0 needs to go 233 * in $r1 and $r1 in $r0 234 */ 235 i = pc->result_nr * 4; 236 } 237 238 for (; i < NV50_SU_MAX_TEMP; i++) { 239 if (!(pc->r_temp[i])) { 240 pc->r_temp[i] = reg; 241 reg->hw = i; 242 if (pc->p->cfg.high_temp < (i + 1)) 243 pc->p->cfg.high_temp = i + 1; 244 return; 245 } 246 } 247 248 assert(0); 249} 250 251static INLINE struct nv50_reg * 252reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) 253{ 254 struct nv50_reg *ri; 255 256 assert(pc->reg_instance_nr < 16); 257 ri = &pc->reg_instances[pc->reg_instance_nr++]; 258 if (reg) { 259 alloc_reg(pc, reg); 260 *ri = *reg; 261 reg->mod = 0; 262 } 263 return ri; 264} 265 266/* XXX: For shaders that aren't executed linearly (e.g. shaders that 267 * contain loops), we need to assign all hw regs to TGSI TEMPs early, 268 * lest we risk temp_temps overwriting regs alloc'd "later". 
269 */ 270static struct nv50_reg * 271alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 272{ 273 struct nv50_reg *r; 274 int i; 275 276 if (dst && dst->type == P_TEMP && dst->hw == -1) 277 return dst; 278 279 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 280 if (!pc->r_temp[i]) { 281 r = MALLOC_STRUCT(nv50_reg); 282 ctor_reg(r, P_TEMP, -1, i); 283 pc->r_temp[i] = r; 284 return r; 285 } 286 } 287 288 assert(0); 289 return NULL; 290} 291 292/* release the hardware resource held by r */ 293static void 294release_hw(struct nv50_pc *pc, struct nv50_reg *r) 295{ 296 assert(r->type == P_TEMP); 297 if (r->hw == -1) 298 return; 299 300 assert(pc->r_temp[r->hw] == r); 301 pc->r_temp[r->hw] = NULL; 302 303 r->acc = 0; 304 if (r->index == -1) 305 FREE(r); 306} 307 308static void 309free_temp(struct nv50_pc *pc, struct nv50_reg *r) 310{ 311 if (r->index == -1) { 312 unsigned hw = r->hw; 313 314 FREE(pc->r_temp[hw]); 315 pc->r_temp[hw] = NULL; 316 } 317} 318 319static int 320alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 321{ 322 int i; 323 324 if ((idx + 4) >= NV50_SU_MAX_TEMP) 325 return 1; 326 327 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 328 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 329 return alloc_temp4(pc, dst, idx + 4); 330 331 for (i = 0; i < 4; i++) { 332 dst[i] = MALLOC_STRUCT(nv50_reg); 333 ctor_reg(dst[i], P_TEMP, -1, idx + i); 334 pc->r_temp[idx + i] = dst[i]; 335 } 336 337 return 0; 338} 339 340static void 341free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 342{ 343 int i; 344 345 for (i = 0; i < 4; i++) 346 free_temp(pc, reg[i]); 347} 348 349static struct nv50_reg * 350temp_temp(struct nv50_pc *pc) 351{ 352 if (pc->temp_temp_nr >= 16) 353 assert(0); 354 355 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 356 return pc->temp_temp[pc->temp_temp_nr++]; 357} 358 359static void 360kill_temp_temp(struct nv50_pc *pc) 361{ 362 int i; 363 364 for (i = 0; i < pc->temp_temp_nr; i++) 365 free_temp(pc, pc->temp_temp[i]); 366 
pc->temp_temp_nr = 0; 367} 368 369static int 370ctor_immd_4u32(struct nv50_pc *pc, 371 uint32_t x, uint32_t y, uint32_t z, uint32_t w) 372{ 373 unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); 374 375 pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); 376 377 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 378 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 379 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 380 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 381 382 return pc->immd_nr++; 383} 384 385static INLINE int 386ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) 387{ 388 return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); 389} 390 391static struct nv50_reg * 392alloc_immd(struct nv50_pc *pc, float f) 393{ 394 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 395 unsigned hw; 396 397 for (hw = 0; hw < pc->immd_nr * 4; hw++) 398 if (pc->immd_buf[hw] == fui(f)) 399 break; 400 401 if (hw == pc->immd_nr * 4) 402 hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; 403 404 ctor_reg(r, P_IMMD, -1, hw); 405 return r; 406} 407 408static struct nv50_program_exec * 409exec(struct nv50_pc *pc) 410{ 411 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 412 413 e->param.index = -1; 414 return e; 415} 416 417static void 418emit(struct nv50_pc *pc, struct nv50_program_exec *e) 419{ 420 struct nv50_program *p = pc->p; 421 422 if (p->exec_tail) 423 p->exec_tail->next = e; 424 if (!p->exec_head) 425 p->exec_head = e; 426 p->exec_tail = e; 427 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 428} 429 430static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 431 432static boolean 433is_long(struct nv50_program_exec *e) 434{ 435 if (e->inst[0] & 1) 436 return TRUE; 437 return FALSE; 438} 439 440static boolean 441is_immd(struct nv50_program_exec *e) 442{ 443 if (is_long(e) && (e->inst[1] & 3) == 3) 444 return TRUE; 445 return FALSE; 446} 447 448static boolean 449is_join(struct nv50_program_exec *e) 450{ 451 if (is_long(e) && (e->inst[1] & 3) == 2) 452 return TRUE; 453 return FALSE; 454} 455 456static INLINE void 457set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 458 struct nv50_program_exec *e) 459{ 460 assert(!is_immd(e)); 461 set_long(pc, e); 462 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 463 e->inst[1] |= (pred << 7) | (idx << 12); 464} 465 466static INLINE void 467set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 468 struct nv50_program_exec *e) 469{ 470 set_long(pc, e); 471 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 472 e->inst[1] |= (idx << 4) | (on << 6); 473} 474 475static INLINE void 476set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 477{ 478 if (is_long(e)) 479 return; 480 481 e->inst[0] |= 1; 482 set_pred(pc, 0xf, 0, e); 483 set_pred_wr(pc, 0, 0, e); 484} 485 486static INLINE void 487set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 488{ 489 if (dst->type == P_RESULT) { 490 set_long(pc, e); 491 e->inst[1] |= 0x00000008; 492 } 493 494 alloc_reg(pc, dst); 495 if (dst->hw > 63) 496 set_long(pc, e); 497 e->inst[0] |= (dst->hw << 2); 498} 499 500static INLINE void 501set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 502{ 503 set_long(pc, e); 504 /* XXX: can't be predicated - bits overlap; cases where both 505 * are required should be avoided by using pc->allow32 */ 506 set_pred(pc, 0, 0, e); 507 set_pred_wr(pc, 0, 0, e); 508 509 e->inst[1] |= 0x00000002 | 0x00000001; 510 e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; 511 e->inst[1] 
|= (pc->immd_buf[imm->hw] >> 6) << 2; 512} 513 514static INLINE void 515set_addr(struct nv50_program_exec *e, struct nv50_reg *a) 516{ 517 assert(!(e->inst[0] & 0x0c000000)); 518 assert(!(e->inst[1] & 0x00000004)); 519 520 e->inst[0] |= (a->hw & 3) << 26; 521 e->inst[1] |= (a->hw >> 2) << 2; 522} 523 524static void 525emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, 526 struct nv50_reg *src0, uint16_t src1_val) 527{ 528 struct nv50_program_exec *e = exec(pc); 529 530 e->inst[0] = 0xd0000000 | (src1_val << 9); 531 e->inst[1] = 0x20000000; 532 set_long(pc, e); 533 e->inst[0] |= dst->hw << 2; 534 if (src0) /* otherwise will add to $a0, which is always 0 */ 535 set_addr(e, src0); 536 537 emit(pc, e); 538} 539 540static struct nv50_reg * 541alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref) 542{ 543 struct nv50_reg *a_tgsi = NULL, *a = NULL; 544 int i; 545 uint8_t avail = ~pc->addr_alloc; 546 547 if (!ref) { 548 /* allocate for TGSI_FILE_ADDRESS */ 549 while (avail) { 550 i = ffs(avail) - 1; 551 552 if (pc->r_addr[i].rhw < 0 || 553 pc->r_addr[i].acc != pc->insn_cur) { 554 pc->addr_alloc |= (1 << i); 555 556 pc->r_addr[i].rhw = -1; 557 pc->r_addr[i].index = i; 558 return &pc->r_addr[i]; 559 } 560 avail &= ~(1 << i); 561 } 562 assert(0); 563 return NULL; 564 } 565 566 /* Allocate and set an address reg so we can access 'ref'. 567 * 568 * If and r_addr->index will be -1 or the hw index the value 569 * value in rhw is relative to. If rhw < 0, the reg has not 570 * been initialized or is in use for TGSI_FILE_ADDRESS. 
571 */ 572 while (avail) { /* only consider regs that are not TGSI */ 573 i = ffs(avail) - 1; 574 avail &= ~(1 << i); 575 576 if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) { 577 /* prefer an usused reg with low hw index */ 578 a = &pc->r_addr[i]; 579 continue; 580 } 581 if (!a && pc->r_addr[i].acc != pc->insn_cur) 582 a = &pc->r_addr[i]; 583 584 if (ref->hw - pc->r_addr[i].rhw >= 128) 585 continue; 586 587 if ((ref->acc >= 0 && pc->r_addr[i].index < 0) || 588 (ref->acc < 0 && pc->r_addr[i].index == ref->index)) { 589 pc->r_addr[i].acc = pc->insn_cur; 590 return &pc->r_addr[i]; 591 } 592 } 593 assert(a); 594 595 if (ref->acc < 0) 596 a_tgsi = pc->addr[ref->index]; 597 598 emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4); 599 600 a->rhw = ref->hw & ~0x7f; 601 a->acc = pc->insn_cur; 602 a->index = a_tgsi ? ref->index : -1; 603 return a; 604} 605 606#define INTERP_LINEAR 0 607#define INTERP_FLAT 1 608#define INTERP_PERSPECTIVE 2 609#define INTERP_CENTROID 4 610 611/* interpolant index has been stored in dst->rhw */ 612static void 613emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 614 unsigned mode) 615{ 616 assert(dst->rhw != -1); 617 struct nv50_program_exec *e = exec(pc); 618 619 e->inst[0] |= 0x80000000; 620 set_dst(pc, dst, e); 621 e->inst[0] |= (dst->rhw << 16); 622 623 if (mode & INTERP_FLAT) { 624 e->inst[0] |= (1 << 8); 625 } else { 626 if (mode & INTERP_PERSPECTIVE) { 627 e->inst[0] |= (1 << 25); 628 alloc_reg(pc, iv); 629 e->inst[0] |= (iv->hw << 9); 630 } 631 632 if (mode & INTERP_CENTROID) 633 e->inst[0] |= (1 << 24); 634 } 635 636 emit(pc, e); 637} 638 639static void 640set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 641 struct nv50_program_exec *e) 642{ 643 set_long(pc, e); 644 645 e->param.index = src->hw & 127; 646 e->param.shift = s; 647 e->param.mask = m << (s % 32); 648 649 if (src->hw > 127) 650 set_addr(e, alloc_addr(pc, src)); 651 else 652 if (src->acc < 0) { 653 assert(src->type == 
P_CONST); 654 set_addr(e, pc->addr[src->index]); 655 } 656 657 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 658} 659 660/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ 661static void 662emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 663{ 664 struct nv50_program_exec *e = exec(pc); 665 666 e->inst[0] = 0x10000000; 667 if (!pc->allow32) 668 set_long(pc, e); 669 670 set_dst(pc, dst, e); 671 672 if (!is_long(e) && src->type == P_IMMD) { 673 set_immd(pc, src, e); 674 /*XXX: 32-bit, but steals part of "half" reg space - need to 675 * catch and handle this case if/when we do half-regs 676 */ 677 } else 678 if (src->type == P_IMMD || src->type == P_CONST) { 679 set_long(pc, e); 680 set_data(pc, src, 0x7f, 9, e); 681 e->inst[1] |= 0x20000000; /* mov from c[] */ 682 } else { 683 if (src->type == P_ATTR) { 684 set_long(pc, e); 685 e->inst[1] |= 0x00200000; 686 } 687 688 alloc_reg(pc, src); 689 if (src->hw > 63) 690 set_long(pc, e); 691 e->inst[0] |= (src->hw << 9); 692 } 693 694 if (is_long(e) && !is_immd(e)) { 695 e->inst[1] |= 0x04000000; /* 32-bit */ 696 e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ 697 if (!(e->inst[1] & 0x20000000)) 698 e->inst[1] |= 0x00030000; /* lane mask 2:3 */ 699 } else 700 e->inst[0] |= 0x00008000; 701 702 emit(pc, e); 703} 704 705static INLINE void 706emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 707{ 708 struct nv50_reg *imm = alloc_immd(pc, f); 709 emit_mov(pc, dst, imm); 710 FREE(imm); 711} 712 713/* Assign the hw of the discarded temporary register src 714 * to the tgsi register dst and free src. 
715 */ 716static void 717assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 718{ 719 assert(src->index == -1 && src->hw != -1); 720 721 if (pc->if_lvl || pc->loop_lvl || 722 (dst->type != P_TEMP) || 723 (src->hw < pc->result_nr * 4 && 724 pc->p->type == PIPE_SHADER_FRAGMENT) || 725 pc->p->info.opcode_count[TGSI_OPCODE_CAL] || 726 pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { 727 728 emit_mov(pc, dst, src); 729 free_temp(pc, src); 730 return; 731 } 732 733 if (dst->hw != -1) 734 pc->r_temp[dst->hw] = NULL; 735 pc->r_temp[src->hw] = dst; 736 dst->hw = src->hw; 737 738 FREE(src); 739} 740 741static void 742emit_nop(struct nv50_pc *pc) 743{ 744 struct nv50_program_exec *e = exec(pc); 745 746 e->inst[0] = 0xf0000000; 747 set_long(pc, e); 748 e->inst[1] = 0xe0000000; 749 emit(pc, e); 750} 751 752static boolean 753check_swap_src_0_1(struct nv50_pc *pc, 754 struct nv50_reg **s0, struct nv50_reg **s1) 755{ 756 struct nv50_reg *src0 = *s0, *src1 = *s1; 757 758 if (src0->type == P_CONST) { 759 if (src1->type != P_CONST) { 760 *s0 = src1; 761 *s1 = src0; 762 return TRUE; 763 } 764 } else 765 if (src1->type == P_ATTR) { 766 if (src0->type != P_ATTR) { 767 *s0 = src1; 768 *s1 = src0; 769 return TRUE; 770 } 771 } 772 773 return FALSE; 774} 775 776static void 777set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, 778 struct nv50_program_exec *e) 779{ 780 struct nv50_reg *temp; 781 782 if (src->type != P_TEMP) { 783 temp = temp_temp(pc); 784 emit_mov(pc, temp, src); 785 src = temp; 786 } 787 788 alloc_reg(pc, src); 789 if (src->hw > 63) 790 set_long(pc, e); 791 e->inst[0] |= (src->hw << 9); 792} 793 794static void 795set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 796{ 797 if (src->type == P_ATTR) { 798 set_long(pc, e); 799 e->inst[1] |= 0x00200000; 800 } else 801 if (src->type == P_CONST || src->type == P_IMMD) { 802 struct nv50_reg *temp = temp_temp(pc); 803 804 emit_mov(pc, temp, src); 805 src = temp; 806 } 
807 808 alloc_reg(pc, src); 809 if (src->hw > 63) 810 set_long(pc, e); 811 e->inst[0] |= (src->hw << 9); 812} 813 814static void 815set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 816{ 817 if (src->type == P_ATTR) { 818 struct nv50_reg *temp = temp_temp(pc); 819 820 emit_mov(pc, temp, src); 821 src = temp; 822 } else 823 if (src->type == P_CONST || src->type == P_IMMD) { 824 assert(!(e->inst[0] & 0x00800000)); 825 if (e->inst[0] & 0x01000000) { 826 struct nv50_reg *temp = temp_temp(pc); 827 828 emit_mov(pc, temp, src); 829 src = temp; 830 } else { 831 set_data(pc, src, 0x7f, 16, e); 832 e->inst[0] |= 0x00800000; 833 } 834 } 835 836 alloc_reg(pc, src); 837 if (src->hw > 63) 838 set_long(pc, e); 839 e->inst[0] |= ((src->hw & 127) << 16); 840} 841 842static void 843set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 844{ 845 set_long(pc, e); 846 847 if (src->type == P_ATTR) { 848 struct nv50_reg *temp = temp_temp(pc); 849 850 emit_mov(pc, temp, src); 851 src = temp; 852 } else 853 if (src->type == P_CONST || src->type == P_IMMD) { 854 assert(!(e->inst[0] & 0x01000000)); 855 if (e->inst[0] & 0x00800000) { 856 struct nv50_reg *temp = temp_temp(pc); 857 858 emit_mov(pc, temp, src); 859 src = temp; 860 } else { 861 set_data(pc, src, 0x7f, 32+14, e); 862 e->inst[0] |= 0x01000000; 863 } 864 } 865 866 alloc_reg(pc, src); 867 e->inst[1] |= ((src->hw & 127) << 14); 868} 869 870static void 871emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) 872{ 873 struct nv50_program_exec *e = exec(pc); 874 875 assert(dst->type == P_TEMP); 876 e->inst[1] = 0x20000000 | (pred << 12); 877 set_long(pc, e); 878 set_dst(pc, dst, e); 879 880 emit(pc, e); 881} 882 883static void 884emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) 885{ 886 struct nv50_program_exec *e = exec(pc); 887 888 e->inst[0] = 0x000001fc; 889 e->inst[1] = 0xa0000008; 890 set_long(pc, e); 891 set_pred_wr(pc, 1, pred, e); 892 
set_src_0_restricted(pc, src, e); 893 894 emit(pc, e); 895} 896 897static void 898emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 899 struct nv50_reg *src1) 900{ 901 struct nv50_program_exec *e = exec(pc); 902 903 e->inst[0] |= 0xc0000000; 904 905 if (!pc->allow32) 906 set_long(pc, e); 907 908 check_swap_src_0_1(pc, &src0, &src1); 909 set_dst(pc, dst, e); 910 set_src_0(pc, src0, e); 911 if (src1->type == P_IMMD && !is_long(e)) { 912 if (src0->mod ^ src1->mod) 913 e->inst[0] |= 0x00008000; 914 set_immd(pc, src1, e); 915 } else { 916 set_src_1(pc, src1, e); 917 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { 918 if (is_long(e)) 919 e->inst[1] |= 0x08000000; 920 else 921 e->inst[0] |= 0x00008000; 922 } 923 } 924 925 emit(pc, e); 926} 927 928static void 929emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 930 struct nv50_reg *src0, struct nv50_reg *src1) 931{ 932 struct nv50_program_exec *e = exec(pc); 933 934 e->inst[0] = 0xb0000000; 935 936 alloc_reg(pc, src1); 937 check_swap_src_0_1(pc, &src0, &src1); 938 939 if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { 940 set_long(pc, e); 941 e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | 942 ((src1->mod & NV50_MOD_NEG) << 27); 943 } 944 945 set_dst(pc, dst, e); 946 set_src_0(pc, src0, e); 947 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 948 set_src_2(pc, src1, e); 949 else 950 if (src1->type == P_IMMD) 951 set_immd(pc, src1, e); 952 else 953 set_src_1(pc, src1, e); 954 955 emit(pc, e); 956} 957 958static void 959emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 960 uint8_t s) 961{ 962 struct nv50_program_exec *e = exec(pc); 963 964 set_long(pc, e); 965 e->inst[1] |= 0xc0000000; 966 967 e->inst[0] |= dst->hw << 2; 968 e->inst[0] |= s << 16; /* shift left */ 969 set_src_0_restricted(pc, src, e); 970 971 emit(pc, e); 972} 973 974static void 975emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 976 struct nv50_reg *src0, struct 
nv50_reg *src1) 977{ 978 struct nv50_program_exec *e = exec(pc); 979 980 set_long(pc, e); 981 e->inst[0] |= 0xb0000000; 982 e->inst[1] |= (sub << 29); 983 984 check_swap_src_0_1(pc, &src0, &src1); 985 set_dst(pc, dst, e); 986 set_src_0(pc, src0, e); 987 set_src_1(pc, src1, e); 988 989 if (src0->mod & NV50_MOD_ABS) 990 e->inst[1] |= 0x00100000; 991 if (src1->mod & NV50_MOD_ABS) 992 e->inst[1] |= 0x00080000; 993 994 emit(pc, e); 995} 996 997static INLINE void 998emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 999 struct nv50_reg *src1) 1000{ 1001 src1->mod ^= NV50_MOD_NEG; 1002 emit_add(pc, dst, src0, src1); 1003 src1->mod ^= NV50_MOD_NEG; 1004} 1005 1006static void 1007emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1008 struct nv50_reg *src1, unsigned op) 1009{ 1010 struct nv50_program_exec *e = exec(pc); 1011 1012 e->inst[0] = 0xd0000000; 1013 set_long(pc, e); 1014 1015 check_swap_src_0_1(pc, &src0, &src1); 1016 set_dst(pc, dst, e); 1017 set_src_0(pc, src0, e); 1018 1019 if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && 1020 op != TGSI_OPCODE_XOR) 1021 assert(!"invalid bit op"); 1022 1023 assert(!(src0->mod | src1->mod)); 1024 1025 if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { 1026 set_immd(pc, src1, e); 1027 if (op == TGSI_OPCODE_OR) 1028 e->inst[0] |= 0x0100; 1029 else 1030 if (op == TGSI_OPCODE_XOR) 1031 e->inst[0] |= 0x8000; 1032 } else { 1033 set_src_1(pc, src1, e); 1034 e->inst[1] |= 0x04000000; /* 32 bit */ 1035 if (op == TGSI_OPCODE_OR) 1036 e->inst[1] |= 0x4000; 1037 else 1038 if (op == TGSI_OPCODE_XOR) 1039 e->inst[1] |= 0x8000; 1040 } 1041 1042 emit(pc, e); 1043} 1044 1045static void 1046emit_shift(struct nv50_pc *pc, struct nv50_reg *dst, 1047 struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir) 1048{ 1049 struct nv50_program_exec *e = exec(pc); 1050 1051 e->inst[0] = 0x30000000; 1052 e->inst[1] = 0xc4000000; 1053 1054 set_long(pc, e); 1055 set_dst(pc, dst, e); 1056 
set_src_0(pc, src0, e); 1057 1058 if (src1->type == P_IMMD) { 1059 e->inst[1] |= (1 << 20); 1060 e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; 1061 } else 1062 set_src_1(pc, src1, e); 1063 1064 if (dir != TGSI_OPCODE_SHL) 1065 e->inst[1] |= (1 << 29); 1066 1067 if (dir == TGSI_OPCODE_ISHR) 1068 e->inst[1] |= (1 << 27); 1069 1070 emit(pc, e); 1071} 1072 1073static void 1074emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1075 struct nv50_reg *src1, struct nv50_reg *src2) 1076{ 1077 struct nv50_program_exec *e = exec(pc); 1078 1079 e->inst[0] |= 0xe0000000; 1080 1081 check_swap_src_0_1(pc, &src0, &src1); 1082 set_dst(pc, dst, e); 1083 set_src_0(pc, src0, e); 1084 set_src_1(pc, src1, e); 1085 set_src_2(pc, src2, e); 1086 1087 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) 1088 e->inst[1] |= 0x04000000; 1089 if (src2->mod & NV50_MOD_NEG) 1090 e->inst[1] |= 0x08000000; 1091 1092 emit(pc, e); 1093} 1094 1095static INLINE void 1096emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1097 struct nv50_reg *src1, struct nv50_reg *src2) 1098{ 1099 src2->mod ^= NV50_MOD_NEG; 1100 emit_mad(pc, dst, src0, src1, src2); 1101 src2->mod ^= NV50_MOD_NEG; 1102} 1103 1104#define NV50_FLOP_RCP 0 1105#define NV50_FLOP_RSQ 2 1106#define NV50_FLOP_LG2 3 1107#define NV50_FLOP_SIN 4 1108#define NV50_FLOP_COS 5 1109#define NV50_FLOP_EX2 6 1110 1111/* rcp, rsqrt, lg2 support neg and abs */ 1112static void 1113emit_flop(struct nv50_pc *pc, unsigned sub, 1114 struct nv50_reg *dst, struct nv50_reg *src) 1115{ 1116 struct nv50_program_exec *e = exec(pc); 1117 1118 e->inst[0] |= 0x90000000; 1119 if (sub || src->mod) { 1120 set_long(pc, e); 1121 e->inst[1] |= (sub << 29); 1122 } 1123 1124 set_dst(pc, dst, e); 1125 set_src_0_restricted(pc, src, e); 1126 1127 assert(!src->mod || sub < 4); 1128 1129 if (src->mod & NV50_MOD_NEG) 1130 e->inst[1] |= 0x04000000; 1131 if (src->mod & NV50_MOD_ABS) 1132 e->inst[1] |= 0x00100000; 1133 1134 emit(pc, e); 1135} 
1136 1137static void 1138emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1139{ 1140 struct nv50_program_exec *e = exec(pc); 1141 1142 e->inst[0] |= 0xb0000000; 1143 1144 set_dst(pc, dst, e); 1145 set_src_0(pc, src, e); 1146 set_long(pc, e); 1147 e->inst[1] |= (6 << 29) | 0x00004000; 1148 1149 if (src->mod & NV50_MOD_NEG) 1150 e->inst[1] |= 0x04000000; 1151 if (src->mod & NV50_MOD_ABS) 1152 e->inst[1] |= 0x00100000; 1153 1154 emit(pc, e); 1155} 1156 1157static void 1158emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1159{ 1160 struct nv50_program_exec *e = exec(pc); 1161 1162 e->inst[0] |= 0xb0000000; 1163 1164 set_dst(pc, dst, e); 1165 set_src_0(pc, src, e); 1166 set_long(pc, e); 1167 e->inst[1] |= (6 << 29); 1168 1169 if (src->mod & NV50_MOD_NEG) 1170 e->inst[1] |= 0x04000000; 1171 if (src->mod & NV50_MOD_ABS) 1172 e->inst[1] |= 0x00100000; 1173 1174 emit(pc, e); 1175} 1176 1177#define CVT_RN (0x00 << 16) 1178#define CVT_FLOOR (0x02 << 16) 1179#define CVT_CEIL (0x04 << 16) 1180#define CVT_TRUNC (0x06 << 16) 1181#define CVT_SAT (0x08 << 16) 1182#define CVT_ABS (0x10 << 16) 1183 1184#define CVT_X32_X32 0x04004000 1185#define CVT_X32_S32 0x04014000 1186#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) 1187#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) 1188#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) 1189#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) 1190#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) 1191#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) 1192#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) 1193#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) 1194 1195#define CVT_NEG 0x20000000 1196#define CVT_RI 0x08000000 1197 1198static void 1199emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 1200 int wp, uint32_t cvn) 1201{ 1202 struct nv50_program_exec *e; 1203 1204 e = exec(pc); 1205 1206 if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; 1207 if (src->mod & NV50_MOD_ABS) cvn |= 
CVT_ABS; 1208 1209 e->inst[0] = 0xa0000000; 1210 e->inst[1] = cvn; 1211 set_long(pc, e); 1212 set_src_0(pc, src, e); 1213 1214 if (wp >= 0) 1215 set_pred_wr(pc, 1, wp, e); 1216 1217 if (dst) 1218 set_dst(pc, dst, e); 1219 else { 1220 e->inst[0] |= 0x000001fc; 1221 e->inst[1] |= 0x00000008; 1222 } 1223 1224 emit(pc, e); 1225} 1226 1227/* nv50 Condition codes: 1228 * 0x1 = LT 1229 * 0x2 = EQ 1230 * 0x3 = LE 1231 * 0x4 = GT 1232 * 0x5 = NE 1233 * 0x6 = GE 1234 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge) 1235 * 0x8 = unordered bit (allows NaN) 1236 */ 1237static void 1238emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, 1239 struct nv50_reg *src0, struct nv50_reg *src1) 1240{ 1241 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 1242 1243 struct nv50_program_exec *e = exec(pc); 1244 struct nv50_reg *rdst; 1245 1246 assert(ccode < 16); 1247 if (check_swap_src_0_1(pc, &src0, &src1)) 1248 ccode = cc_swapped[ccode & 7] | (ccode & 8); 1249 1250 rdst = dst; 1251 if (dst && dst->type != P_TEMP) 1252 dst = alloc_temp(pc, NULL); 1253 1254 /* set.u32 */ 1255 set_long(pc, e); 1256 e->inst[0] |= 0xb0000000; 1257 e->inst[1] |= 0x60000000 | (ccode << 14); 1258 1259 /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but 1260 * that doesn't seem to match what the hw actually does 1261 e->inst[1] |= 0x04000000; << breaks things, u32 by default ? 1262 */ 1263 1264 if (wp >= 0) 1265 set_pred_wr(pc, 1, wp, e); 1266 if (dst) 1267 set_dst(pc, dst, e); 1268 else { 1269 e->inst[0] |= 0x000001fc; 1270 e->inst[1] |= 0x00000008; 1271 } 1272 1273 set_src_0(pc, src0, e); 1274 set_src_1(pc, src1, e); 1275 1276 emit(pc, e); 1277 1278 /* cvt.f32.u32/s32 (?) 
if we didn't only write the predicate */ 1279 if (rdst) 1280 emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); 1281 if (rdst && rdst != dst) 1282 free_temp(pc, dst); 1283} 1284 1285static INLINE unsigned 1286map_tgsi_setop_cc(unsigned op) 1287{ 1288 switch (op) { 1289 case TGSI_OPCODE_SLT: return 0x1; 1290 case TGSI_OPCODE_SGE: return 0x6; 1291 case TGSI_OPCODE_SEQ: return 0x2; 1292 case TGSI_OPCODE_SGT: return 0x4; 1293 case TGSI_OPCODE_SLE: return 0x3; 1294 case TGSI_OPCODE_SNE: return 0xd; 1295 default: 1296 assert(0); 1297 return 0; 1298 } 1299} 1300 1301static INLINE void 1302emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1303{ 1304 emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); 1305} 1306 1307static void 1308emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 1309 struct nv50_reg *v, struct nv50_reg *e) 1310{ 1311 struct nv50_reg *temp = alloc_temp(pc, NULL); 1312 1313 emit_flop(pc, NV50_FLOP_LG2, temp, v); 1314 emit_mul(pc, temp, temp, e); 1315 emit_preex2(pc, temp, temp); 1316 emit_flop(pc, NV50_FLOP_EX2, dst, temp); 1317 1318 free_temp(pc, temp); 1319} 1320 1321static INLINE void 1322emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1323{ 1324 emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); 1325} 1326 1327static void 1328emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1329 struct nv50_reg **src) 1330{ 1331 struct nv50_reg *one = alloc_immd(pc, 1.0); 1332 struct nv50_reg *zero = alloc_immd(pc, 0.0); 1333 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 1334 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 1335 struct nv50_reg *tmp[4]; 1336 boolean allow32 = pc->allow32; 1337 1338 pc->allow32 = FALSE; 1339 1340 if (mask & (3 << 1)) { 1341 tmp[0] = alloc_temp(pc, NULL); 1342 emit_minmax(pc, 4, tmp[0], src[0], zero); 1343 } 1344 1345 if (mask & (1 << 2)) { 1346 set_pred_wr(pc, 1, 0, pc->p->exec_tail); 1347 1348 tmp[1] = temp_temp(pc); 1349 emit_minmax(pc, 4, 
tmp[1], src[1], zero); 1350 1351 tmp[3] = temp_temp(pc); 1352 emit_minmax(pc, 4, tmp[3], src[3], neg128); 1353 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 1354 1355 emit_pow(pc, dst[2], tmp[1], tmp[3]); 1356 emit_mov(pc, dst[2], zero); 1357 set_pred(pc, 3, 0, pc->p->exec_tail); 1358 } 1359 1360 if (mask & (1 << 1)) 1361 assimilate_temp(pc, dst[1], tmp[0]); 1362 else 1363 if (mask & (1 << 2)) 1364 free_temp(pc, tmp[0]); 1365 1366 pc->allow32 = allow32; 1367 1368 /* do this last, in case src[i,j] == dst[0,3] */ 1369 if (mask & (1 << 0)) 1370 emit_mov(pc, dst[0], one); 1371 1372 if (mask & (1 << 3)) 1373 emit_mov(pc, dst[3], one); 1374 1375 FREE(pos128); 1376 FREE(neg128); 1377 FREE(zero); 1378 FREE(one); 1379} 1380 1381static void 1382emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 1383{ 1384 struct nv50_program_exec *e; 1385 const int r_pred = 1; 1386 1387 e = exec(pc); 1388 e->inst[0] = 0x00000002; /* discard */ 1389 set_long(pc, e); /* sets cond code to ALWAYS */ 1390 1391 if (src) { 1392 set_pred(pc, 0x1 /* cc = LT */, r_pred, e); 1393 /* write to predicate reg */ 1394 emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32); 1395 } 1396 1397 emit(pc, e); 1398} 1399 1400static struct nv50_program_exec * 1401emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc) 1402{ 1403 struct nv50_program_exec *e = exec(pc); 1404 1405 e->inst[0] = (op << 28) | 2; 1406 set_long(pc, e); 1407 if (pred >= 0) 1408 set_pred(pc, cc, pred, e); 1409 1410 emit(pc, e); 1411 return e; 1412} 1413 1414static INLINE struct nv50_program_exec * 1415emit_breakaddr(struct nv50_pc *pc) 1416{ 1417 return emit_control_flow(pc, 0x4, -1, 0); 1418} 1419 1420static INLINE void 1421emit_break(struct nv50_pc *pc, int pred, unsigned cc) 1422{ 1423 emit_control_flow(pc, 0x5, pred, cc); 1424} 1425 1426static INLINE struct nv50_program_exec * 1427emit_joinat(struct nv50_pc *pc) 1428{ 1429 return emit_control_flow(pc, 0xa, -1, 0); 1430} 1431 1432static INLINE struct nv50_program_exec * 
1433emit_branch(struct nv50_pc *pc, int pred, unsigned cc) 1434{ 1435 return emit_control_flow(pc, 0x1, pred, cc); 1436} 1437 1438static INLINE struct nv50_program_exec * 1439emit_call(struct nv50_pc *pc, int pred, unsigned cc) 1440{ 1441 return emit_control_flow(pc, 0x2, pred, cc); 1442} 1443 1444static INLINE void 1445emit_ret(struct nv50_pc *pc, int pred, unsigned cc) 1446{ 1447 emit_control_flow(pc, 0x3, pred, cc); 1448} 1449 1450#define QOP_ADD 0 1451#define QOP_SUBR 1 1452#define QOP_SUB 2 1453#define QOP_MOV_SRC1 3 1454 1455/* For a quad of threads / top left, top right, bottom left, bottom right 1456 * pixels, do a different operation, and take src0 from a specific thread. 1457 */ 1458static void 1459emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0, 1460 struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop) 1461{ 1462 struct nv50_program_exec *e = exec(pc); 1463 1464 e->inst[0] = 0xc0000000; 1465 e->inst[1] = 0x80000000; 1466 set_long(pc, e); 1467 e->inst[0] |= lane_src0 << 16; 1468 set_src_0(pc, src0, e); 1469 set_src_2(pc, src1, e); 1470 1471 if (wp >= 0) 1472 set_pred_wr(pc, 1, wp, e); 1473 1474 if (dst) 1475 set_dst(pc, dst, e); 1476 else { 1477 e->inst[0] |= 0x000001fc; 1478 e->inst[1] |= 0x00000008; 1479 } 1480 1481 e->inst[0] |= (qop & 3) << 20; 1482 e->inst[1] |= (qop >> 2) << 22; 1483 1484 emit(pc, e); 1485} 1486 1487static void 1488load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], 1489 struct nv50_reg **src, unsigned arg, boolean proj) 1490{ 1491 int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; 1492 1493 src[0]->mod |= NV50_MOD_ABS; 1494 src[1]->mod |= NV50_MOD_ABS; 1495 src[2]->mod |= NV50_MOD_ABS; 1496 1497 emit_minmax(pc, 4, t[2], src[0], src[1]); 1498 emit_minmax(pc, 4, t[2], src[2], t[2]); 1499 1500 src[0]->mod = mod[0]; 1501 src[1]->mod = mod[1]; 1502 src[2]->mod = mod[2]; 1503 1504 if (proj && 0 /* looks more correct without this */) 1505 emit_mul(pc, t[2], t[2], src[3]); 1506 else 1507 if 
(arg == 4) /* there is no textureProj(samplerCubeShadow) */ 1508 emit_mov(pc, t[3], src[3]); 1509 1510 emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]); 1511 1512 emit_mul(pc, t[0], src[0], t[2]); 1513 emit_mul(pc, t[1], src[1], t[2]); 1514 emit_mul(pc, t[2], src[2], t[2]); 1515} 1516 1517static void 1518load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], 1519 struct nv50_reg **src, unsigned dim, unsigned arg) 1520{ 1521 unsigned c, mode; 1522 1523 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1524 mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE; 1525 1526 t[3]->rhw = src[3]->rhw; 1527 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1528 emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]); 1529 1530 for (c = 0; c < dim; ++c) { 1531 t[c]->rhw = src[c]->rhw; 1532 emit_interp(pc, t[c], t[3], mode); 1533 } 1534 if (arg != dim) { /* depth reference value */ 1535 t[dim]->rhw = src[2]->rhw; 1536 emit_interp(pc, t[dim], t[3], mode); 1537 } 1538 } else { 1539 /* XXX: for some reason the blob sometimes uses MAD 1540 * (mad f32 $rX $rY $rZ neg $r63) 1541 */ 1542 emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]); 1543 for (c = 0; c < dim; ++c) 1544 emit_mul(pc, t[c], src[c], t[3]); 1545 if (arg != dim) /* depth reference value */ 1546 emit_mul(pc, t[dim], src[2], t[3]); 1547 } 1548} 1549 1550static INLINE void 1551get_tex_dim(unsigned type, unsigned *dim, unsigned *arg) 1552{ 1553 switch (type) { 1554 case TGSI_TEXTURE_1D: 1555 *arg = *dim = 1; 1556 break; 1557 case TGSI_TEXTURE_SHADOW1D: 1558 *dim = 1; 1559 *arg = 2; 1560 break; 1561 case TGSI_TEXTURE_UNKNOWN: 1562 case TGSI_TEXTURE_2D: 1563 case TGSI_TEXTURE_RECT: 1564 *arg = *dim = 2; 1565 break; 1566 case TGSI_TEXTURE_SHADOW2D: 1567 case TGSI_TEXTURE_SHADOWRECT: 1568 *dim = 2; 1569 *arg = 3; 1570 break; 1571 case TGSI_TEXTURE_3D: 1572 case TGSI_TEXTURE_CUBE: 1573 *dim = *arg = 3; 1574 break; 1575 default: 1576 assert(0); 1577 break; 1578 } 1579} 1580 1581/* We shouldn't execute TEXLOD if any of the pixels in a 
quad have 1582 * different LOD values, so branch off groups of equal LOD. 1583 */ 1584static void 1585emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod, 1586 struct nv50_reg *src, struct nv50_program_exec *tex) 1587{ 1588 struct nv50_program_exec *join_at; 1589 unsigned i, target = pc->p->exec_size + 9 * 2; 1590 1591 if (pc->p->type != PIPE_SHADER_FRAGMENT) { 1592 emit(pc, tex); 1593 return; 1594 } 1595 pc->allow32 = FALSE; 1596 1597 /* Subtract lod of each pixel from lod of top left pixel, jump 1598 * texlod insn if result is 0, then repeat for 2 other pixels. 1599 */ 1600 join_at = emit_joinat(pc); 1601 emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55); 1602 emit_branch(pc, 0, 2)->param.index = target; 1603 1604 for (i = 1; i < 4; ++i) { 1605 emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55); 1606 emit_branch(pc, 0, 2)->param.index = target; 1607 } 1608 1609 emit_mov(pc, tlod, src); /* target */ 1610 emit(pc, tex); /* texlod */ 1611 1612 join_at->param.index = target + 2 * 2; 1613 JOIN_ON(emit_nop(pc)); /* join _after_ tex */ 1614} 1615 1616static void 1617emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg, 1618 struct nv50_program_exec *tex) 1619{ 1620 struct nv50_program_exec *e; 1621 struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL); 1622 int r_pred = 0; 1623 unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 }; 1624 1625 pc->allow32 = FALSE; 1626 ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4); 1627 1628 /* Subtract bias value of thread i from bias values of each thread, 1629 * store result in r_pred, and set bit i in r_bits if result was 0. 
1630 */ 1631 assert(arg < 4); 1632 for (i = 0; i < 4; ++i, ++imm_1248.hw) { 1633 emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55); 1634 emit_mov(pc, r_bits, &imm_1248); 1635 set_pred(pc, 2, r_pred, pc->p->exec_tail); 1636 } 1637 emit_mov_to_pred(pc, r_pred, r_bits); 1638 1639 /* The lanes of a quad are now grouped by the bit in r_pred they have 1640 * set. Put the input values for TEX into a new register set for each 1641 * group and execute TEX only for a specific group. 1642 * We cannot use the same register set for each group because we need 1643 * the derivatives, which are implicitly calculated, to be correct. 1644 */ 1645 for (i = 1; i < 4; ++i) { 1646 alloc_temp4(pc, t123[i], 0); 1647 1648 for (c = 0; c <= arg; ++c) 1649 emit_mov(pc, t123[i][c], t[c]); 1650 1651 *(e = exec(pc)) = *(tex); 1652 e->inst[0] &= ~0x01fc; 1653 set_dst(pc, t123[i][0], e); 1654 set_pred(pc, cc[i], r_pred, e); 1655 emit(pc, e); 1656 } 1657 /* finally TEX on the original regs (where we kept the input) */ 1658 set_pred(pc, cc[0], r_pred, tex); 1659 emit(pc, tex); 1660 1661 /* put the 3 * n other results into regs for lane 0 */ 1662 n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc)); 1663 for (i = 1; i < 4; ++i) { 1664 for (c = 0; c < n; ++c) { 1665 emit_mov(pc, t[c], t123[i][c]); 1666 set_pred(pc, cc[i], r_pred, pc->p->exec_tail); 1667 } 1668 free_temp4(pc, t123[i]); 1669 } 1670 1671 emit_nop(pc); 1672 free_temp(pc, r_bits); 1673} 1674 1675static void 1676emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1677 struct nv50_reg **src, unsigned unit, unsigned type, 1678 boolean proj, int bias_lod) 1679{ 1680 struct nv50_reg *t[4]; 1681 struct nv50_program_exec *e; 1682 unsigned c, dim, arg; 1683 1684 /* t[i] must be within a single 128 bit super-reg */ 1685 alloc_temp4(pc, t, 0); 1686 1687 e = exec(pc); 1688 e->inst[0] = 0xf0000000; 1689 set_long(pc, e); 1690 set_dst(pc, t[0], e); 1691 1692 /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED 
= TRUE): */ 1693 e->inst[0] |= (unit << 9) /* | (unit << 17) */; 1694 1695 /* live flag (don't set if TEX results affect input to another TEX): */ 1696 /* e->inst[0] |= 0x00000004; */ 1697 1698 get_tex_dim(type, &dim, &arg); 1699 1700 if (type == TGSI_TEXTURE_CUBE) { 1701 e->inst[0] |= 0x08000000; 1702 load_cube_tex_coords(pc, t, src, arg, proj); 1703 } else 1704 if (proj) 1705 load_proj_tex_coords(pc, t, src, dim, arg); 1706 else { 1707 for (c = 0; c < dim; c++) 1708 emit_mov(pc, t[c], src[c]); 1709 if (arg != dim) /* depth reference value (always src.z here) */ 1710 emit_mov(pc, t[dim], src[2]); 1711 } 1712 1713 e->inst[0] |= (mask & 0x3) << 25; 1714 e->inst[1] |= (mask & 0xc) << 12; 1715 1716 if (!bias_lod) { 1717 e->inst[0] |= (arg - 1) << 22; 1718 emit(pc, e); 1719 } else 1720 if (bias_lod < 0) { 1721 assert(pc->p->type == PIPE_SHADER_FRAGMENT); 1722 e->inst[0] |= arg << 22; 1723 e->inst[1] |= 0x20000000; /* texbias */ 1724 emit_mov(pc, t[arg], src[3]); 1725 emit_texbias_sequence(pc, t, arg, e); 1726 } else { 1727 e->inst[0] |= arg << 22; 1728 e->inst[1] |= 0x40000000; /* texlod */ 1729 emit_mov(pc, t[arg], src[3]); 1730 emit_texlod_sequence(pc, t[arg], src[3], e); 1731 } 1732 1733#if 1 1734 c = 0; 1735 if (mask & 1) emit_mov(pc, dst[0], t[c++]); 1736 if (mask & 2) emit_mov(pc, dst[1], t[c++]); 1737 if (mask & 4) emit_mov(pc, dst[2], t[c++]); 1738 if (mask & 8) emit_mov(pc, dst[3], t[c]); 1739 1740 free_temp4(pc, t); 1741#else 1742 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1743 * the texture coordinates, not the fetched values: latency ? */ 1744 1745 for (c = 0; c < 4; c++) { 1746 if (mask & (1 << c)) 1747 assimilate_temp(pc, dst[c], t[c]); 1748 else 1749 free_temp(pc, t[c]); 1750 } 1751#endif 1752} 1753 1754static void 1755emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1756{ 1757 struct nv50_program_exec *e = exec(pc); 1758 1759 assert(src->type == P_TEMP); 1760 1761 e->inst[0] = (src->mod & NV50_MOD_NEG) ? 
0xc0240000 : 0xc0140000; 1762 e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000; 1763 set_long(pc, e); 1764 set_dst(pc, dst, e); 1765 set_src_0(pc, src, e); 1766 set_src_2(pc, src, e); 1767 1768 emit(pc, e); 1769} 1770 1771static void 1772emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1773{ 1774 struct nv50_program_exec *e = exec(pc); 1775 1776 assert(src->type == P_TEMP); 1777 1778 e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000; 1779 e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000; 1780 set_long(pc, e); 1781 set_dst(pc, dst, e); 1782 set_src_0(pc, src, e); 1783 set_src_2(pc, src, e); 1784 1785 emit(pc, e); 1786} 1787 1788static void 1789convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) 1790{ 1791 unsigned q = 0, m = ~0; 1792 1793 assert(!is_long(e)); 1794 1795 switch (e->inst[0] >> 28) { 1796 case 0x1: 1797 /* MOV */ 1798 q = 0x0403c000; 1799 m = 0xffff7fff; 1800 break; 1801 case 0x8: 1802 /* INTERP (move centroid, perspective and flat bits) */ 1803 m = ~0x03000100; 1804 q = (e->inst[0] & (3 << 24)) >> (24 - 16); 1805 q |= (e->inst[0] & (1 << 8)) << (18 - 8); 1806 break; 1807 case 0x9: 1808 /* RCP */ 1809 break; 1810 case 0xB: 1811 /* ADD */ 1812 m = ~(127 << 16); 1813 q = ((e->inst[0] & (~m)) >> 2); 1814 break; 1815 case 0xC: 1816 /* MUL */ 1817 m = ~0x00008000; 1818 q = ((e->inst[0] & (~m)) << 12); 1819 break; 1820 case 0xE: 1821 /* MAD (if src2 == dst) */ 1822 q = ((e->inst[0] & 0x1fc) << 12); 1823 break; 1824 default: 1825 assert(0); 1826 break; 1827 } 1828 1829 set_long(pc, e); 1830 pc->p->exec_size++; 1831 1832 e->inst[0] &= m; 1833 e->inst[1] |= q; 1834} 1835 1836/* Some operations support an optional negation flag. 
*/ 1837static int 1838get_supported_mods(const struct tgsi_full_instruction *insn, int i) 1839{ 1840 switch (insn->Instruction.Opcode) { 1841 case TGSI_OPCODE_ADD: 1842 case TGSI_OPCODE_COS: 1843 case TGSI_OPCODE_DDX: 1844 case TGSI_OPCODE_DDY: 1845 case TGSI_OPCODE_DP3: 1846 case TGSI_OPCODE_DP4: 1847 case TGSI_OPCODE_EX2: 1848 case TGSI_OPCODE_KIL: 1849 case TGSI_OPCODE_LG2: 1850 case TGSI_OPCODE_MAD: 1851 case TGSI_OPCODE_MUL: 1852 case TGSI_OPCODE_POW: 1853 case TGSI_OPCODE_RCP: 1854 case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */ 1855 case TGSI_OPCODE_SCS: 1856 case TGSI_OPCODE_SIN: 1857 case TGSI_OPCODE_SUB: 1858 return NV50_MOD_NEG; 1859 case TGSI_OPCODE_MAX: 1860 case TGSI_OPCODE_MIN: 1861 case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */ 1862 return NV50_MOD_ABS; 1863 case TGSI_OPCODE_CEIL: 1864 case TGSI_OPCODE_FLR: 1865 case TGSI_OPCODE_TRUNC: 1866 return NV50_MOD_NEG | NV50_MOD_ABS; 1867 case TGSI_OPCODE_F2I: 1868 case TGSI_OPCODE_F2U: 1869 case TGSI_OPCODE_I2F: 1870 case TGSI_OPCODE_U2F: 1871 return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32; 1872 case TGSI_OPCODE_SHL: 1873 case TGSI_OPCODE_ISHR: 1874 case TGSI_OPCODE_USHR: 1875 return NV50_MOD_I32; 1876 default: 1877 return 0; 1878 } 1879} 1880 1881/* Return a read mask for source registers deduced from opcode & write mask. */ 1882static unsigned 1883nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1884{ 1885 unsigned x, mask = insn->Dst[0].Register.WriteMask; 1886 1887 switch (insn->Instruction.Opcode) { 1888 case TGSI_OPCODE_COS: 1889 case TGSI_OPCODE_SIN: 1890 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1891 case TGSI_OPCODE_DP3: 1892 return 0x7; 1893 case TGSI_OPCODE_DP4: 1894 case TGSI_OPCODE_DPH: 1895 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1896 return 0xf; 1897 case TGSI_OPCODE_DST: 1898 return mask & (c ? 
0xa : 0x6); 1899 case TGSI_OPCODE_EX2: 1900 case TGSI_OPCODE_EXP: 1901 case TGSI_OPCODE_LG2: 1902 case TGSI_OPCODE_LOG: 1903 case TGSI_OPCODE_POW: 1904 case TGSI_OPCODE_RCP: 1905 case TGSI_OPCODE_RSQ: 1906 case TGSI_OPCODE_SCS: 1907 return 0x1; 1908 case TGSI_OPCODE_IF: 1909 return 0x1; 1910 case TGSI_OPCODE_LIT: 1911 return 0xb; 1912 case TGSI_OPCODE_TEX: 1913 case TGSI_OPCODE_TXB: 1914 case TGSI_OPCODE_TXL: 1915 case TGSI_OPCODE_TXP: 1916 { 1917 const struct tgsi_instruction_texture *tex; 1918 1919 assert(insn->Instruction.Texture); 1920 tex = &insn->Texture; 1921 1922 mask = 0x7; 1923 if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && 1924 insn->Instruction.Opcode != TGSI_OPCODE_TXD) 1925 mask |= 0x8; /* bias, lod or proj */ 1926 1927 switch (tex->Texture) { 1928 case TGSI_TEXTURE_1D: 1929 mask &= 0x9; 1930 break; 1931 case TGSI_TEXTURE_SHADOW1D: 1932 mask &= 0x5; 1933 break; 1934 case TGSI_TEXTURE_2D: 1935 mask &= 0xb; 1936 break; 1937 default: 1938 break; 1939 } 1940 } 1941 return mask; 1942 case TGSI_OPCODE_XPD: 1943 x = 0; 1944 if (mask & 1) x |= 0x6; 1945 if (mask & 2) x |= 0x5; 1946 if (mask & 4) x |= 0x3; 1947 return x; 1948 default: 1949 break; 1950 } 1951 1952 return mask; 1953} 1954 1955static struct nv50_reg * 1956tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) 1957{ 1958 switch (dst->Register.File) { 1959 case TGSI_FILE_TEMPORARY: 1960 return &pc->temp[dst->Register.Index * 4 + c]; 1961 case TGSI_FILE_OUTPUT: 1962 return &pc->result[dst->Register.Index * 4 + c]; 1963 case TGSI_FILE_ADDRESS: 1964 { 1965 struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; 1966 if (!r) { 1967 r = alloc_addr(pc, NULL); 1968 pc->addr[dst->Register.Index * 4 + c] = r; 1969 } 1970 assert(r); 1971 return r; 1972 } 1973 case TGSI_FILE_NULL: 1974 return NULL; 1975 default: 1976 break; 1977 } 1978 1979 return NULL; 1980} 1981 1982static struct nv50_reg * 1983tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, 
1984 int mod) 1985{ 1986 struct nv50_reg *r = NULL; 1987 struct nv50_reg *temp = NULL; 1988 unsigned sgn, c, swz, cvn; 1989 1990 if (src->Register.File != TGSI_FILE_CONSTANT) 1991 assert(!src->Register.Indirect); 1992 1993 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); 1994 1995 c = tgsi_util_get_full_src_register_swizzle(src, chan); 1996 switch (c) { 1997 case TGSI_SWIZZLE_X: 1998 case TGSI_SWIZZLE_Y: 1999 case TGSI_SWIZZLE_Z: 2000 case TGSI_SWIZZLE_W: 2001 switch (src->Register.File) { 2002 case TGSI_FILE_INPUT: 2003 r = &pc->attr[src->Register.Index * 4 + c]; 2004 break; 2005 case TGSI_FILE_TEMPORARY: 2006 r = &pc->temp[src->Register.Index * 4 + c]; 2007 break; 2008 case TGSI_FILE_CONSTANT: 2009 if (!src->Register.Indirect) { 2010 r = &pc->param[src->Register.Index * 4 + c]; 2011 break; 2012 } 2013 /* Indicate indirection by setting r->acc < 0 and 2014 * use the index field to select the address reg. 2015 */ 2016 r = reg_instance(pc, NULL); 2017 swz = tgsi_util_get_src_register_swizzle( 2018 &src->Indirect, 0); 2019 ctor_reg(r, P_CONST, 2020 src->Indirect.Index * 4 + swz, 2021 src->Register.Index * 4 + c); 2022 r->acc = -1; 2023 break; 2024 case TGSI_FILE_IMMEDIATE: 2025 r = &pc->immd[src->Register.Index * 4 + c]; 2026 break; 2027 case TGSI_FILE_SAMPLER: 2028 return NULL; 2029 case TGSI_FILE_ADDRESS: 2030 r = pc->addr[src->Register.Index * 4 + c]; 2031 assert(r); 2032 break; 2033 default: 2034 assert(0); 2035 break; 2036 } 2037 break; 2038 default: 2039 assert(0); 2040 break; 2041 } 2042 2043 cvn = (mod & NV50_MOD_I32) ? 
CVT_S32_S32 : CVT_F32_F32; 2044 2045 switch (sgn) { 2046 case TGSI_UTIL_SIGN_CLEAR: 2047 r->mod = NV50_MOD_ABS; 2048 break; 2049 case TGSI_UTIL_SIGN_SET: 2050 r->mod = NV50_MOD_NEG_ABS; 2051 break; 2052 case TGSI_UTIL_SIGN_TOGGLE: 2053 r->mod = NV50_MOD_NEG; 2054 break; 2055 default: 2056 assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP); 2057 break; 2058 } 2059 2060 if ((r->mod & mod) != r->mod) { 2061 temp = temp_temp(pc); 2062 emit_cvt(pc, temp, r, -1, cvn); 2063 r->mod = 0; 2064 r = temp; 2065 } else 2066 r->mod |= mod & NV50_MOD_I32; 2067 2068 assert(r); 2069 if (r->acc >= 0 && r != temp) 2070 return reg_instance(pc, r); /* will clear r->mod */ 2071 return r; 2072} 2073 2074/* return TRUE for ops that produce only a single result */ 2075static boolean 2076is_scalar_op(unsigned op) 2077{ 2078 switch (op) { 2079 case TGSI_OPCODE_COS: 2080 case TGSI_OPCODE_DP2: 2081 case TGSI_OPCODE_DP3: 2082 case TGSI_OPCODE_DP4: 2083 case TGSI_OPCODE_DPH: 2084 case TGSI_OPCODE_EX2: 2085 case TGSI_OPCODE_LG2: 2086 case TGSI_OPCODE_POW: 2087 case TGSI_OPCODE_RCP: 2088 case TGSI_OPCODE_RSQ: 2089 case TGSI_OPCODE_SIN: 2090 /* 2091 case TGSI_OPCODE_KIL: 2092 case TGSI_OPCODE_LIT: 2093 case TGSI_OPCODE_SCS: 2094 */ 2095 return TRUE; 2096 default: 2097 return FALSE; 2098 } 2099} 2100 2101/* Returns a bitmask indicating which dst components depend 2102 * on source s, component c (reverse of nv50_tgsi_src_mask). 2103 */ 2104static unsigned 2105nv50_tgsi_dst_revdep(unsigned op, int s, int c) 2106{ 2107 if (is_scalar_op(op)) 2108 return 0x1; 2109 2110 switch (op) { 2111 case TGSI_OPCODE_DST: 2112 return (1 << c) & (s ? 
0xa : 0x6); 2113 case TGSI_OPCODE_XPD: 2114 switch (c) { 2115 case 0: return 0x6; 2116 case 1: return 0x5; 2117 case 2: return 0x3; 2118 case 3: return 0x0; 2119 default: 2120 assert(0); 2121 return 0x0; 2122 } 2123 case TGSI_OPCODE_EXP: 2124 case TGSI_OPCODE_LOG: 2125 case TGSI_OPCODE_LIT: 2126 case TGSI_OPCODE_SCS: 2127 case TGSI_OPCODE_TEX: 2128 case TGSI_OPCODE_TXB: 2129 case TGSI_OPCODE_TXL: 2130 case TGSI_OPCODE_TXP: 2131 /* these take care of dangerous swizzles themselves */ 2132 return 0x0; 2133 case TGSI_OPCODE_IF: 2134 case TGSI_OPCODE_KIL: 2135 /* don't call this function for these ops */ 2136 assert(0); 2137 return 0; 2138 default: 2139 /* linear vector instruction */ 2140 return (1 << c); 2141 } 2142} 2143 2144static INLINE boolean 2145has_pred(struct nv50_program_exec *e, unsigned cc) 2146{ 2147 if (!is_long(e) || is_immd(e)) 2148 return FALSE; 2149 return ((e->inst[1] & 0x780) == (cc << 7)); 2150} 2151 2152/* on ENDIF see if we can do "@p0.neu single_op" instead of: 2153 * join_at ENDIF 2154 * @p0.eq bra ENDIF 2155 * single_op 2156 * ENDIF: nop.join 2157 */ 2158static boolean 2159nv50_kill_branch(struct nv50_pc *pc) 2160{ 2161 int lvl = pc->if_lvl; 2162 2163 if (pc->if_insn[lvl]->next != pc->p->exec_tail) 2164 return FALSE; 2165 if (is_immd(pc->p->exec_tail)) 2166 return FALSE; 2167 2168 /* if ccode == 'true', the BRA is from an ELSE and the predicate 2169 * reg may no longer be valid, since we currently always use $p0 2170 */ 2171 if (has_pred(pc->if_insn[lvl], 0xf)) 2172 return FALSE; 2173 assert(pc->if_insn[lvl] && pc->if_join[lvl]); 2174 2175 /* We'll use the exec allocated for JOIN_AT (we can't easily 2176 * access nv50_program_exec's prev). 
2177 */ 2178 pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ 2179 2180 *pc->if_join[lvl] = *pc->p->exec_tail; 2181 2182 FREE(pc->if_insn[lvl]); 2183 FREE(pc->p->exec_tail); 2184 2185 pc->p->exec_tail = pc->if_join[lvl]; 2186 pc->p->exec_tail->next = NULL; 2187 set_pred(pc, 0xd, 0, pc->p->exec_tail); 2188 2189 return TRUE; 2190} 2191 2192static void 2193nv50_fp_move_results(struct nv50_pc *pc) 2194{ 2195 struct nv50_reg reg; 2196 unsigned i; 2197 2198 ctor_reg(®, P_TEMP, -1, -1); 2199 2200 for (i = 0; i < pc->result_nr * 4; ++i) { 2201 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) 2202 continue; 2203 if (pc->result[i].rhw != pc->result[i].hw) { 2204 reg.hw = pc->result[i].rhw; 2205 emit_mov(pc, ®, &pc->result[i]); 2206 } 2207 } 2208} 2209 2210static boolean 2211nv50_program_tx_insn(struct nv50_pc *pc, 2212 const struct tgsi_full_instruction *inst) 2213{ 2214 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 2215 unsigned mask, sat, unit; 2216 int i, c; 2217 2218 mask = inst->Dst[0].Register.WriteMask; 2219 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 2220 2221 memset(src, 0, sizeof(src)); 2222 2223 for (c = 0; c < 4; c++) { 2224 if ((mask & (1 << c)) && !pc->r_dst[c]) 2225 dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); 2226 else 2227 dst[c] = pc->r_dst[c]; 2228 rdst[c] = dst[c]; 2229 } 2230 2231 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 2232 const struct tgsi_full_src_register *fs = &inst->Src[i]; 2233 unsigned src_mask; 2234 int mod_supp; 2235 2236 src_mask = nv50_tgsi_src_mask(inst, i); 2237 mod_supp = get_supported_mods(inst, i); 2238 2239 if (fs->Register.File == TGSI_FILE_SAMPLER) 2240 unit = fs->Register.Index; 2241 2242 for (c = 0; c < 4; c++) 2243 if (src_mask & (1 << c)) 2244 src[i][c] = tgsi_src(pc, c, fs, mod_supp); 2245 } 2246 2247 brdc = temp = pc->r_brdc; 2248 if (brdc && brdc->type != P_TEMP) { 2249 temp = temp_temp(pc); 2250 if (sat) 2251 brdc = temp; 2252 } else 2253 if (sat) { 2254 for (c = 0; c < 4; c++) { 2255 
if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 2256 continue; 2257 /* rdst[c] = dst[c]; */ /* done above */ 2258 dst[c] = temp_temp(pc); 2259 } 2260 } 2261 2262 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 2263 2264 switch (inst->Instruction.Opcode) { 2265 case TGSI_OPCODE_ABS: 2266 for (c = 0; c < 4; c++) { 2267 if (!(mask & (1 << c))) 2268 continue; 2269 emit_cvt(pc, dst[c], src[0][c], -1, 2270 CVT_ABS | CVT_F32_F32); 2271 } 2272 break; 2273 case TGSI_OPCODE_ADD: 2274 for (c = 0; c < 4; c++) { 2275 if (!(mask & (1 << c))) 2276 continue; 2277 emit_add(pc, dst[c], src[0][c], src[1][c]); 2278 } 2279 break; 2280 case TGSI_OPCODE_AND: 2281 case TGSI_OPCODE_XOR: 2282 case TGSI_OPCODE_OR: 2283 for (c = 0; c < 4; c++) { 2284 if (!(mask & (1 << c))) 2285 continue; 2286 emit_bitop2(pc, dst[c], src[0][c], src[1][c], 2287 inst->Instruction.Opcode); 2288 } 2289 break; 2290 case TGSI_OPCODE_ARL: 2291 assert(src[0][0]); 2292 temp = temp_temp(pc); 2293 emit_cvt(pc, temp, src[0][0], -1, CVT_FLOOR | CVT_S32_F32); 2294 emit_arl(pc, dst[0], temp, 4); 2295 break; 2296 case TGSI_OPCODE_BGNLOOP: 2297 pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); 2298 pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; 2299 terminate_mbb(pc); 2300 break; 2301 case TGSI_OPCODE_BGNSUB: 2302 assert(!pc->in_subroutine); 2303 pc->in_subroutine = TRUE; 2304 /* probably not necessary, but align to 8 byte boundary */ 2305 if (!is_long(pc->p->exec_tail)) 2306 convert_to_long(pc, pc->p->exec_tail); 2307 break; 2308 case TGSI_OPCODE_BRK: 2309 assert(pc->loop_lvl > 0); 2310 emit_break(pc, -1, 0); 2311 break; 2312 case TGSI_OPCODE_CAL: 2313 assert(inst->Label.Label < pc->insn_nr); 2314 emit_call(pc, -1, 0)->param.index = inst->Label.Label; 2315 /* replaced by actual offset in nv50_program_fixup_insns */ 2316 break; 2317 case TGSI_OPCODE_CEIL: 2318 for (c = 0; c < 4; c++) { 2319 if (!(mask & (1 << c))) 2320 continue; 2321 emit_cvt(pc, dst[c], src[0][c], -1, 2322 CVT_CEIL | CVT_F32_F32 | CVT_RI); 2323 
} 2324 break; 2325 case TGSI_OPCODE_CMP: 2326 pc->allow32 = FALSE; 2327 for (c = 0; c < 4; c++) { 2328 if (!(mask & (1 << c))) 2329 continue; 2330 emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); 2331 emit_mov(pc, dst[c], src[1][c]); 2332 set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ 2333 emit_mov(pc, dst[c], src[2][c]); 2334 set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ 2335 } 2336 break; 2337 case TGSI_OPCODE_CONT: 2338 assert(pc->loop_lvl > 0); 2339 emit_branch(pc, -1, 0)->param.index = 2340 pc->loop_pos[pc->loop_lvl - 1]; 2341 break; 2342 case TGSI_OPCODE_COS: 2343 if (mask & 8) { 2344 emit_precossin(pc, temp, src[0][3]); 2345 emit_flop(pc, NV50_FLOP_COS, dst[3], temp); 2346 if (!(mask &= 7)) 2347 break; 2348 if (temp == dst[3]) 2349 temp = brdc = temp_temp(pc); 2350 } 2351 emit_precossin(pc, temp, src[0][0]); 2352 emit_flop(pc, NV50_FLOP_COS, brdc, temp); 2353 break; 2354 case TGSI_OPCODE_DDX: 2355 for (c = 0; c < 4; c++) { 2356 if (!(mask & (1 << c))) 2357 continue; 2358 emit_ddx(pc, dst[c], src[0][c]); 2359 } 2360 break; 2361 case TGSI_OPCODE_DDY: 2362 for (c = 0; c < 4; c++) { 2363 if (!(mask & (1 << c))) 2364 continue; 2365 emit_ddy(pc, dst[c], src[0][c]); 2366 } 2367 break; 2368 case TGSI_OPCODE_DP3: 2369 emit_mul(pc, temp, src[0][0], src[1][0]); 2370 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2371 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 2372 break; 2373 case TGSI_OPCODE_DP4: 2374 emit_mul(pc, temp, src[0][0], src[1][0]); 2375 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2376 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2377 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 2378 break; 2379 case TGSI_OPCODE_DPH: 2380 emit_mul(pc, temp, src[0][0], src[1][0]); 2381 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2382 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2383 emit_add(pc, brdc, src[1][3], temp); 2384 break; 2385 case TGSI_OPCODE_DST: 2386 if (mask & (1 << 1)) 2387 emit_mul(pc, dst[1], src[0][1], src[1][1]); 2388 if (mask & 
(1 << 2)) 2389 emit_mov(pc, dst[2], src[0][2]); 2390 if (mask & (1 << 3)) 2391 emit_mov(pc, dst[3], src[1][3]); 2392 if (mask & (1 << 0)) 2393 emit_mov_immdval(pc, dst[0], 1.0f); 2394 break; 2395 case TGSI_OPCODE_ELSE: 2396 emit_branch(pc, -1, 0); 2397 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2398 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; 2399 terminate_mbb(pc); 2400 break; 2401 case TGSI_OPCODE_ENDIF: 2402 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2403 2404 /* try to replace branch over 1 insn with a predicated insn */ 2405 if (nv50_kill_branch(pc) == TRUE) 2406 break; 2407 2408 if (pc->if_join[pc->if_lvl]) { 2409 pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; 2410 pc->if_join[pc->if_lvl] = NULL; 2411 } 2412 terminate_mbb(pc); 2413 /* emit a NOP as join point, we could set it on the next 2414 * one, but would have to make sure it is long and !immd 2415 */ 2416 JOIN_ON(emit_nop(pc)); 2417 break; 2418 case TGSI_OPCODE_ENDLOOP: 2419 emit_branch(pc, -1, 0)->param.index = 2420 pc->loop_pos[--pc->loop_lvl]; 2421 pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; 2422 terminate_mbb(pc); 2423 break; 2424 case TGSI_OPCODE_ENDSUB: 2425 assert(pc->in_subroutine); 2426 pc->in_subroutine = FALSE; 2427 break; 2428 case TGSI_OPCODE_EX2: 2429 emit_preex2(pc, temp, src[0][0]); 2430 emit_flop(pc, NV50_FLOP_EX2, brdc, temp); 2431 break; 2432 case TGSI_OPCODE_EXP: 2433 { 2434 struct nv50_reg *t[2]; 2435 2436 assert(!temp); 2437 t[0] = temp_temp(pc); 2438 t[1] = temp_temp(pc); 2439 2440 if (mask & 0x6) 2441 emit_mov(pc, t[0], src[0][0]); 2442 if (mask & 0x3) 2443 emit_flr(pc, t[1], src[0][0]); 2444 2445 if (mask & (1 << 1)) 2446 emit_sub(pc, dst[1], t[0], t[1]); 2447 if (mask & (1 << 0)) { 2448 emit_preex2(pc, t[1], t[1]); 2449 emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); 2450 } 2451 if (mask & (1 << 2)) { 2452 emit_preex2(pc, t[0], t[0]); 2453 emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); 2454 } 2455 if (mask & (1 << 3)) 2456 
emit_mov_immdval(pc, dst[3], 1.0f); 2457 } 2458 break; 2459 case TGSI_OPCODE_F2I: 2460 for (c = 0; c < 4; c++) { 2461 if (!(mask & (1 << c))) 2462 continue; 2463 emit_cvt(pc, dst[c], src[0][c], -1, 2464 CVT_TRUNC | CVT_S32_F32); 2465 } 2466 break; 2467 case TGSI_OPCODE_F2U: 2468 for (c = 0; c < 4; c++) { 2469 if (!(mask & (1 << c))) 2470 continue; 2471 emit_cvt(pc, dst[c], src[0][c], -1, 2472 CVT_TRUNC | CVT_U32_F32); 2473 } 2474 break; 2475 case TGSI_OPCODE_FLR: 2476 for (c = 0; c < 4; c++) { 2477 if (!(mask & (1 << c))) 2478 continue; 2479 emit_flr(pc, dst[c], src[0][c]); 2480 } 2481 break; 2482 case TGSI_OPCODE_FRC: 2483 temp = temp_temp(pc); 2484 for (c = 0; c < 4; c++) { 2485 if (!(mask & (1 << c))) 2486 continue; 2487 emit_flr(pc, temp, src[0][c]); 2488 emit_sub(pc, dst[c], src[0][c], temp); 2489 } 2490 break; 2491 case TGSI_OPCODE_I2F: 2492 for (c = 0; c < 4; c++) { 2493 if (!(mask & (1 << c))) 2494 continue; 2495 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); 2496 } 2497 break; 2498 case TGSI_OPCODE_IF: 2499 assert(pc->if_lvl < NV50_MAX_COND_NESTING); 2500 emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); 2501 pc->if_join[pc->if_lvl] = emit_joinat(pc); 2502 pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; 2503 terminate_mbb(pc); 2504 break; 2505 case TGSI_OPCODE_INEG: 2506 for (c = 0; c < 4; c++) { 2507 if (!(mask & (1 << c))) 2508 continue; 2509 emit_cvt(pc, dst[c], src[0][c], -1, 2510 CVT_S32_S32 | CVT_NEG); 2511 } 2512 break; 2513 case TGSI_OPCODE_KIL: 2514 assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); 2515 emit_kil(pc, src[0][0]); 2516 emit_kil(pc, src[0][1]); 2517 emit_kil(pc, src[0][2]); 2518 emit_kil(pc, src[0][3]); 2519 break; 2520 case TGSI_OPCODE_KILP: 2521 emit_kil(pc, NULL); 2522 break; 2523 case TGSI_OPCODE_LIT: 2524 emit_lit(pc, &dst[0], mask, &src[0][0]); 2525 break; 2526 case TGSI_OPCODE_LG2: 2527 emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); 2528 break; 2529 case TGSI_OPCODE_LOG: 2530 { 2531 struct nv50_reg *t[2]; 
2532 2533 t[0] = temp_temp(pc); 2534 if (mask & (1 << 1)) 2535 t[1] = temp_temp(pc); 2536 else 2537 t[1] = t[0]; 2538 2539 emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); 2540 emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); 2541 if (mask & (1 << 2)) 2542 emit_mov(pc, dst[2], t[1]); 2543 emit_flr(pc, t[1], t[1]); 2544 if (mask & (1 << 0)) 2545 emit_mov(pc, dst[0], t[1]); 2546 if (mask & (1 << 1)) { 2547 t[1]->mod = NV50_MOD_NEG; 2548 emit_preex2(pc, t[1], t[1]); 2549 t[1]->mod = 0; 2550 emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); 2551 emit_mul(pc, dst[1], t[0], t[1]); 2552 } 2553 if (mask & (1 << 3)) 2554 emit_mov_immdval(pc, dst[3], 1.0f); 2555 } 2556 break; 2557 case TGSI_OPCODE_LRP: 2558 temp = temp_temp(pc); 2559 for (c = 0; c < 4; c++) { 2560 if (!(mask & (1 << c))) 2561 continue; 2562 emit_sub(pc, temp, src[1][c], src[2][c]); 2563 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 2564 } 2565 break; 2566 case TGSI_OPCODE_MAD: 2567 for (c = 0; c < 4; c++) { 2568 if (!(mask & (1 << c))) 2569 continue; 2570 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 2571 } 2572 break; 2573 case TGSI_OPCODE_MAX: 2574 for (c = 0; c < 4; c++) { 2575 if (!(mask & (1 << c))) 2576 continue; 2577 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 2578 } 2579 break; 2580 case TGSI_OPCODE_MIN: 2581 for (c = 0; c < 4; c++) { 2582 if (!(mask & (1 << c))) 2583 continue; 2584 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 2585 } 2586 break; 2587 case TGSI_OPCODE_MOV: 2588 for (c = 0; c < 4; c++) { 2589 if (!(mask & (1 << c))) 2590 continue; 2591 emit_mov(pc, dst[c], src[0][c]); 2592 } 2593 break; 2594 case TGSI_OPCODE_MUL: 2595 for (c = 0; c < 4; c++) { 2596 if (!(mask & (1 << c))) 2597 continue; 2598 emit_mul(pc, dst[c], src[0][c], src[1][c]); 2599 } 2600 break; 2601 case TGSI_OPCODE_POW: 2602 emit_pow(pc, brdc, src[0][0], src[1][0]); 2603 break; 2604 case TGSI_OPCODE_RCP: 2605 emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); 2606 break; 2607 case TGSI_OPCODE_RET: 2608 if 
(pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) 2609 nv50_fp_move_results(pc); 2610 emit_ret(pc, -1, 0); 2611 break; 2612 case TGSI_OPCODE_RSQ: 2613 src[0][0]->mod |= NV50_MOD_ABS; 2614 emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); 2615 break; 2616 case TGSI_OPCODE_SCS: 2617 temp = temp_temp(pc); 2618 if (mask & 3) 2619 emit_precossin(pc, temp, src[0][0]); 2620 if (mask & (1 << 0)) 2621 emit_flop(pc, NV50_FLOP_COS, dst[0], temp); 2622 if (mask & (1 << 1)) 2623 emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); 2624 if (mask & (1 << 2)) 2625 emit_mov_immdval(pc, dst[2], 0.0); 2626 if (mask & (1 << 3)) 2627 emit_mov_immdval(pc, dst[3], 1.0); 2628 break; 2629 case TGSI_OPCODE_SHL: 2630 case TGSI_OPCODE_ISHR: 2631 case TGSI_OPCODE_USHR: 2632 for (c = 0; c < 4; c++) { 2633 if (!(mask & (1 << c))) 2634 continue; 2635 emit_shift(pc, dst[c], src[0][c], src[1][c], 2636 inst->Instruction.Opcode); 2637 } 2638 break; 2639 case TGSI_OPCODE_SIN: 2640 if (mask & 8) { 2641 emit_precossin(pc, temp, src[0][3]); 2642 emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); 2643 if (!(mask &= 7)) 2644 break; 2645 if (temp == dst[3]) 2646 temp = brdc = temp_temp(pc); 2647 } 2648 emit_precossin(pc, temp, src[0][0]); 2649 emit_flop(pc, NV50_FLOP_SIN, brdc, temp); 2650 break; 2651 case TGSI_OPCODE_SLT: 2652 case TGSI_OPCODE_SGE: 2653 case TGSI_OPCODE_SEQ: 2654 case TGSI_OPCODE_SGT: 2655 case TGSI_OPCODE_SLE: 2656 case TGSI_OPCODE_SNE: 2657 i = map_tgsi_setop_cc(inst->Instruction.Opcode); 2658 for (c = 0; c < 4; c++) { 2659 if (!(mask & (1 << c))) 2660 continue; 2661 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]); 2662 } 2663 break; 2664 case TGSI_OPCODE_SUB: 2665 for (c = 0; c < 4; c++) { 2666 if (!(mask & (1 << c))) 2667 continue; 2668 emit_sub(pc, dst[c], src[0][c], src[1][c]); 2669 } 2670 break; 2671 case TGSI_OPCODE_TEX: 2672 emit_tex(pc, dst, mask, src[0], unit, 2673 inst->Texture.Texture, FALSE, 0); 2674 break; 2675 case TGSI_OPCODE_TXB: 2676 emit_tex(pc, dst, mask, src[0], unit, 2677 
inst->Texture.Texture, FALSE, -1); 2678 break; 2679 case TGSI_OPCODE_TXL: 2680 emit_tex(pc, dst, mask, src[0], unit, 2681 inst->Texture.Texture, FALSE, 1); 2682 break; 2683 case TGSI_OPCODE_TXP: 2684 emit_tex(pc, dst, mask, src[0], unit, 2685 inst->Texture.Texture, TRUE, 0); 2686 break; 2687 case TGSI_OPCODE_TRUNC: 2688 for (c = 0; c < 4; c++) { 2689 if (!(mask & (1 << c))) 2690 continue; 2691 emit_cvt(pc, dst[c], src[0][c], -1, 2692 CVT_TRUNC | CVT_F32_F32 | CVT_RI); 2693 } 2694 break; 2695 case TGSI_OPCODE_U2F: 2696 for (c = 0; c < 4; c++) { 2697 if (!(mask & (1 << c))) 2698 continue; 2699 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); 2700 } 2701 break; 2702 case TGSI_OPCODE_XPD: 2703 temp = temp_temp(pc); 2704 if (mask & (1 << 0)) { 2705 emit_mul(pc, temp, src[0][2], src[1][1]); 2706 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 2707 } 2708 if (mask & (1 << 1)) { 2709 emit_mul(pc, temp, src[0][0], src[1][2]); 2710 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 2711 } 2712 if (mask & (1 << 2)) { 2713 emit_mul(pc, temp, src[0][1], src[1][0]); 2714 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 2715 } 2716 if (mask & (1 << 3)) 2717 emit_mov_immdval(pc, dst[3], 1.0); 2718 break; 2719 case TGSI_OPCODE_END: 2720 if (pc->p->type == PIPE_SHADER_FRAGMENT) 2721 nv50_fp_move_results(pc); 2722 2723 /* last insn must be long so it can have the exit bit set */ 2724 if (!is_long(pc->p->exec_tail)) 2725 convert_to_long(pc, pc->p->exec_tail); 2726 else 2727 if (is_immd(pc->p->exec_tail) || is_join(pc->p->exec_tail)) 2728 emit_nop(pc); 2729 2730 pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ 2731 break; 2732 default: 2733 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 2734 return FALSE; 2735 } 2736 2737 if (brdc) { 2738 if (sat) 2739 emit_sat(pc, brdc, brdc); 2740 for (c = 0; c < 4; c++) 2741 if ((mask & (1 << c)) && dst[c] != brdc) 2742 emit_mov(pc, dst[c], brdc); 2743 } else 2744 if (sat) { 2745 for (c = 0; c < 4; c++) { 2746 if (!(mask & (1 << 
c))) 2747 continue; 2748 /* In this case we saturate later, and dst[c] won't 2749 * be another temp_temp (and thus lost), since rdst 2750 * already is TEMP (see above). */ 2751 if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) 2752 continue; 2753 emit_sat(pc, rdst[c], dst[c]); 2754 } 2755 } 2756 2757 kill_temp_temp(pc); 2758 pc->reg_instance_nr = 0; 2759 2760 return TRUE; 2761} 2762 2763static void 2764prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) 2765{ 2766 struct nv50_reg *reg = NULL; 2767 const struct tgsi_full_src_register *src; 2768 const struct tgsi_dst_register *dst; 2769 unsigned i, c, k, mask; 2770 2771 dst = &insn->Dst[0].Register; 2772 mask = dst->WriteMask; 2773 2774 if (dst->File == TGSI_FILE_TEMPORARY) 2775 reg = pc->temp; 2776 else 2777 if (dst->File == TGSI_FILE_OUTPUT) { 2778 reg = pc->result; 2779 2780 if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && 2781 dst->Index == pc->edgeflag_out && 2782 insn->Src[0].Register.File == TGSI_FILE_INPUT) 2783 pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; 2784 } 2785 2786 if (reg) { 2787 for (c = 0; c < 4; c++) { 2788 if (!(mask & (1 << c))) 2789 continue; 2790 reg[dst->Index * 4 + c].acc = pc->insn_nr; 2791 } 2792 } 2793 2794 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 2795 src = &insn->Src[i]; 2796 2797 if (src->Register.File == TGSI_FILE_TEMPORARY) 2798 reg = pc->temp; 2799 else 2800 if (src->Register.File == TGSI_FILE_INPUT) 2801 reg = pc->attr; 2802 else 2803 continue; 2804 2805 mask = nv50_tgsi_src_mask(insn, i); 2806 2807 for (c = 0; c < 4; c++) { 2808 if (!(mask & (1 << c))) 2809 continue; 2810 k = tgsi_util_get_full_src_register_swizzle(src, c); 2811 2812 reg[src->Register.Index * 4 + k].acc = pc->insn_nr; 2813 } 2814 } 2815} 2816 2817/* Returns a bitmask indicating which dst components need to be 2818 * written to temporaries first to avoid 'corrupting' sources. 
/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
 *
 * m[i]    (out) indicate component to write in the i-th position
 * rdep[c] (in)  bitmasks of dst[i] that require dst[c] as source
 *
 * Bit i of the return value refers to the i-th position of the write
 * order (i.e. to component m[i]), not to component i.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* 'unsafe' is only ever OR-ed into, so it has to start out as 0;
	 * without the initialiser the returned mask was indeterminate.
	 */
	unsigned i, c, x, unsafe = 0;

	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart (the loop increment makes the scan resume at
		 * position 1)
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}

/* Select a suitable dst register for broadcasting scalar results,
 * or return NULL if we have to allocate an extra TEMP.
 *
 * If e.g. only 1 component is written, we may also emit the final
 * result to a write-only register.
 */
2867 */ 2868static struct nv50_reg * 2869tgsi_broadcast_dst(struct nv50_pc *pc, 2870 const struct tgsi_full_dst_register *fd, unsigned mask) 2871{ 2872 if (fd->Register.File == TGSI_FILE_TEMPORARY) { 2873 int c = ffs(~mask & fd->Register.WriteMask); 2874 if (c) 2875 return tgsi_dst(pc, c - 1, fd); 2876 } else { 2877 int c = ffs(fd->Register.WriteMask) - 1; 2878 if ((1 << c) == fd->Register.WriteMask) 2879 return tgsi_dst(pc, c, fd); 2880 } 2881 2882 return NULL; 2883} 2884 2885/* Scan source swizzles and return a bitmask indicating dst regs that 2886 * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 2887 */ 2888static unsigned 2889nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, 2890 unsigned rdep[4]) 2891{ 2892 const struct tgsi_full_dst_register *fd = &insn->Dst[0]; 2893 const struct tgsi_full_src_register *fs; 2894 unsigned i, deqs = 0; 2895 2896 for (i = 0; i < 4; ++i) 2897 rdep[i] = 0; 2898 2899 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 2900 unsigned chn, mask = nv50_tgsi_src_mask(insn, i); 2901 int ms = get_supported_mods(insn, i); 2902 2903 fs = &insn->Src[i]; 2904 if (fs->Register.File != fd->Register.File || 2905 fs->Register.Index != fd->Register.Index) 2906 continue; 2907 2908 for (chn = 0; chn < 4; ++chn) { 2909 unsigned s, c; 2910 2911 if (!(mask & (1 << chn))) /* src is not read */ 2912 continue; 2913 c = tgsi_util_get_full_src_register_swizzle(fs, chn); 2914 s = tgsi_util_get_full_src_register_sign_mode(fs, chn); 2915 2916 if (!(fd->Register.WriteMask & (1 << c))) 2917 continue; 2918 2919 if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG)) 2920 continue; 2921 if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS)) 2922 continue; 2923 if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3)) 2924 continue; 2925 2926 rdep[c] |= nv50_tgsi_dst_revdep( 2927 insn->Instruction.Opcode, i, chn); 2928 deqs |= (1 << c); 2929 } 2930 } 2931 2932 return deqs; 2933} 2934 2935static boolean 2936nv50_tgsi_insn(struct nv50_pc 
*pc, const union tgsi_full_token *tok) 2937{ 2938 struct tgsi_full_instruction insn = tok->FullInstruction; 2939 const struct tgsi_full_dst_register *fd; 2940 unsigned i, deqs, rdep[4], m[4]; 2941 2942 fd = &tok->FullInstruction.Dst[0]; 2943 deqs = nv50_tgsi_scan_swizzle(&insn, rdep); 2944 2945 if (is_scalar_op(insn.Instruction.Opcode)) { 2946 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); 2947 if (!pc->r_brdc) 2948 pc->r_brdc = temp_temp(pc); 2949 return nv50_program_tx_insn(pc, &insn); 2950 } 2951 pc->r_brdc = NULL; 2952 2953 if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3])) 2954 return nv50_program_tx_insn(pc, &insn); 2955 2956 deqs = nv50_revdep_reorder(m, rdep); 2957 2958 for (i = 0; i < 4; ++i) { 2959 assert(pc->r_dst[m[i]] == NULL); 2960 2961 insn.Dst[0].Register.WriteMask = 2962 fd->Register.WriteMask & (1 << m[i]); 2963 2964 if (!insn.Dst[0].Register.WriteMask) 2965 continue; 2966 2967 if (deqs & (1 << i)) 2968 pc->r_dst[m[i]] = alloc_temp(pc, NULL); 2969 2970 if (!nv50_program_tx_insn(pc, &insn)) 2971 return FALSE; 2972 } 2973 2974 for (i = 0; i < 4; i++) { 2975 struct nv50_reg *reg = pc->r_dst[i]; 2976 if (!reg) 2977 continue; 2978 pc->r_dst[i] = NULL; 2979 2980 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) 2981 emit_sat(pc, tgsi_dst(pc, i, fd), reg); 2982 else 2983 emit_mov(pc, tgsi_dst(pc, i, fd), reg); 2984 free_temp(pc, reg); 2985 } 2986 2987 return TRUE; 2988} 2989 2990static void 2991load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) 2992{ 2993 struct nv50_reg *iv, **ppiv; 2994 unsigned mode = pc->interp_mode[reg->index]; 2995 2996 ppiv = (mode & INTERP_CENTROID) ? 
&pc->iv_c : &pc->iv_p; 2997 iv = *ppiv; 2998 2999 if ((mode & INTERP_PERSPECTIVE) && !iv) { 3000 iv = *ppiv = alloc_temp(pc, NULL); 3001 iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; 3002 3003 emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); 3004 emit_flop(pc, NV50_FLOP_RCP, iv, iv); 3005 3006 /* XXX: when loading interpolants dynamically, move these 3007 * to the program head, or make sure it can't be skipped. 3008 */ 3009 } 3010 3011 emit_interp(pc, reg, iv, mode); 3012} 3013 3014/* The face input is always at v[255] (varying space), with a 3015 * value of 0 for back-facing, and 0xffffffff for front-facing. 3016 */ 3017static void 3018load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a) 3019{ 3020 struct nv50_reg *one = alloc_immd(pc, 1.0f); 3021 3022 assert(a->rhw == -1); 3023 alloc_reg(pc, a); /* do this before rhw is set */ 3024 a->rhw = 255; 3025 load_interpolant(pc, a); 3026 emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND); 3027 3028 FREE(one); 3029} 3030 3031static boolean 3032nv50_program_tx_prep(struct nv50_pc *pc) 3033{ 3034 struct tgsi_parse_context tp; 3035 struct nv50_program *p = pc->p; 3036 boolean ret = FALSE; 3037 unsigned i, c, flat_nr = 0; 3038 3039 tgsi_parse_init(&tp, pc->p->pipe.tokens); 3040 while (!tgsi_parse_end_of_tokens(&tp)) { 3041 const union tgsi_full_token *tok = &tp.FullToken; 3042 3043 tgsi_parse_token(&tp); 3044 switch (tok->Token.Type) { 3045 case TGSI_TOKEN_TYPE_IMMEDIATE: 3046 { 3047 const struct tgsi_full_immediate *imm = 3048 &tp.FullToken.FullImmediate; 3049 3050 ctor_immd_4f32(pc, imm->u[0].Float, 3051 imm->u[1].Float, 3052 imm->u[2].Float, 3053 imm->u[3].Float); 3054 } 3055 break; 3056 case TGSI_TOKEN_TYPE_DECLARATION: 3057 { 3058 const struct tgsi_full_declaration *d; 3059 unsigned si, last, first, mode; 3060 3061 d = &tp.FullToken.FullDeclaration; 3062 first = d->Range.First; 3063 last = d->Range.Last; 3064 3065 switch (d->Declaration.File) { 3066 case TGSI_FILE_TEMPORARY: 3067 break; 3068 case TGSI_FILE_OUTPUT: 3069 
if (!d->Declaration.Semantic || 3070 p->type == PIPE_SHADER_FRAGMENT) 3071 break; 3072 3073 si = d->Semantic.Index; 3074 switch (d->Semantic.Name) { 3075 case TGSI_SEMANTIC_BCOLOR: 3076 p->cfg.two_side[si].hw = first; 3077 if (p->cfg.io_nr > first) 3078 p->cfg.io_nr = first; 3079 break; 3080 case TGSI_SEMANTIC_PSIZE: 3081 p->cfg.psiz = first; 3082 if (p->cfg.io_nr > first) 3083 p->cfg.io_nr = first; 3084 break; 3085 case TGSI_SEMANTIC_EDGEFLAG: 3086 pc->edgeflag_out = first; 3087 break; 3088 /* 3089 case TGSI_SEMANTIC_CLIP_DISTANCE: 3090 p->cfg.clpd = MIN2(p->cfg.clpd, first); 3091 break; 3092 */ 3093 default: 3094 break; 3095 } 3096 break; 3097 case TGSI_FILE_INPUT: 3098 { 3099 if (p->type != PIPE_SHADER_FRAGMENT) 3100 break; 3101 3102 switch (d->Declaration.Interpolate) { 3103 case TGSI_INTERPOLATE_CONSTANT: 3104 mode = INTERP_FLAT; 3105 flat_nr++; 3106 break; 3107 case TGSI_INTERPOLATE_PERSPECTIVE: 3108 mode = INTERP_PERSPECTIVE; 3109 p->cfg.regs[1] |= 0x08 << 24; 3110 break; 3111 default: 3112 mode = INTERP_LINEAR; 3113 break; 3114 } 3115 if (d->Declaration.Centroid) 3116 mode |= INTERP_CENTROID; 3117 3118 assert(last < 32); 3119 for (i = first; i <= last; i++) 3120 pc->interp_mode[i] = mode; 3121 } 3122 break; 3123 case TGSI_FILE_ADDRESS: 3124 case TGSI_FILE_CONSTANT: 3125 case TGSI_FILE_SAMPLER: 3126 break; 3127 default: 3128 NOUVEAU_ERR("bad decl file %d\n", 3129 d->Declaration.File); 3130 goto out_err; 3131 } 3132 } 3133 break; 3134 case TGSI_TOKEN_TYPE_INSTRUCTION: 3135 pc->insn_nr++; 3136 prep_inspect_insn(pc, &tok->FullInstruction); 3137 break; 3138 default: 3139 break; 3140 } 3141 } 3142 3143 if (p->type == PIPE_SHADER_VERTEX) { 3144 int rid = 0; 3145 3146 for (i = 0; i < pc->attr_nr * 4; ++i) { 3147 if (pc->attr[i].acc) { 3148 pc->attr[i].hw = rid++; 3149 p->cfg.attr[i / 32] |= 1 << (i % 32); 3150 } 3151 } 3152 3153 for (i = 0, rid = 0; i < pc->result_nr; ++i) { 3154 p->cfg.io[i].hw = rid; 3155 p->cfg.io[i].id = i; 3156 3157 for (c = 0; c < 4; ++c) { 
3158 int n = i * 4 + c; 3159 if (!pc->result[n].acc) 3160 continue; 3161 pc->result[n].hw = rid++; 3162 p->cfg.io[i].mask |= 1 << c; 3163 } 3164 } 3165 3166 for (c = 0; c < 2; ++c) 3167 if (p->cfg.two_side[c].hw < 0x40) 3168 p->cfg.two_side[c] = p->cfg.io[ 3169 p->cfg.two_side[c].hw]; 3170 3171 if (p->cfg.psiz < 0x40) 3172 p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw; 3173 } else 3174 if (p->type == PIPE_SHADER_FRAGMENT) { 3175 int rid, aid; 3176 unsigned n = 0, m = pc->attr_nr - flat_nr; 3177 3178 pc->allow32 = TRUE; 3179 3180 int base = (TGSI_SEMANTIC_POSITION == 3181 p->info.input_semantic_name[0]) ? 0 : 1; 3182 3183 /* non-flat interpolants have to be mapped to 3184 * the lower hardware IDs, so sort them: 3185 */ 3186 for (i = 0; i < pc->attr_nr; i++) { 3187 if (pc->interp_mode[i] == INTERP_FLAT) 3188 p->cfg.io[m++].id = i; 3189 else { 3190 if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) 3191 p->cfg.io[n].linear = TRUE; 3192 p->cfg.io[n++].id = i; 3193 } 3194 } 3195 3196 if (!base) /* set w-coordinate mask from perspective interp */ 3197 p->cfg.io[0].mask |= p->cfg.regs[1] >> 24; 3198 3199 aid = popcnt4( /* if fcrd isn't contained in cfg.io */ 3200 base ? 
(p->cfg.regs[1] >> 24) : p->cfg.io[0].mask); 3201 3202 for (n = 0; n < pc->attr_nr; ++n) { 3203 p->cfg.io[n].hw = rid = aid; 3204 i = p->cfg.io[n].id; 3205 3206 if (p->info.input_semantic_name[n] == 3207 TGSI_SEMANTIC_FACE) { 3208 load_frontfacing(pc, &pc->attr[i * 4]); 3209 continue; 3210 } 3211 3212 for (c = 0; c < 4; ++c) { 3213 if (!pc->attr[i * 4 + c].acc) 3214 continue; 3215 pc->attr[i * 4 + c].rhw = rid++; 3216 p->cfg.io[n].mask |= 1 << c; 3217 3218 load_interpolant(pc, &pc->attr[i * 4 + c]); 3219 } 3220 aid += popcnt4(p->cfg.io[n].mask); 3221 } 3222 3223 if (!base) 3224 p->cfg.regs[1] |= p->cfg.io[0].mask << 24; 3225 3226 m = popcnt4(p->cfg.regs[1] >> 24); 3227 3228 /* set count of non-position inputs and of non-flat 3229 * non-position inputs for FP_INTERPOLANT_CTRL 3230 */ 3231 p->cfg.regs[1] |= aid - m; 3232 3233 if (flat_nr) { 3234 i = p->cfg.io[pc->attr_nr - flat_nr].hw; 3235 p->cfg.regs[1] |= (i - m) << 16; 3236 } else 3237 p->cfg.regs[1] |= p->cfg.regs[1] << 16; 3238 3239 /* mark color semantic for light-twoside */ 3240 n = 0x40; 3241 for (i = 0; i < pc->attr_nr; i++) { 3242 ubyte si, sn; 3243 3244 sn = p->info.input_semantic_name[p->cfg.io[i].id]; 3245 si = p->info.input_semantic_index[p->cfg.io[i].id]; 3246 3247 if (sn == TGSI_SEMANTIC_COLOR) { 3248 p->cfg.two_side[si] = p->cfg.io[i]; 3249 3250 /* increase colour count */ 3251 p->cfg.regs[0] += popcnt4( 3252 p->cfg.two_side[si].mask) << 16; 3253 3254 n = MIN2(n, p->cfg.io[i].hw - m); 3255 } 3256 } 3257 if (n < 0x40) 3258 p->cfg.regs[0] += n; 3259 3260 /* Initialize FP results: 3261 * FragDepth is always first TGSI and last hw output 3262 */ 3263 i = p->info.writes_z ? 4 : 0; 3264 for (rid = 0; i < pc->result_nr * 4; i++) 3265 pc->result[i].rhw = rid++; 3266 if (p->info.writes_z) 3267 pc->result[2].rhw = rid; 3268 3269 p->cfg.high_result = rid; 3270 3271 /* separate/different colour results for MRTs ? */ 3272 if (pc->result_nr - (p->info.writes_z ? 
1 : 0) > 1) 3273 p->cfg.regs[2] |= 1; 3274 } 3275 3276 if (pc->immd_nr) { 3277 int rid = 0; 3278 3279 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); 3280 if (!pc->immd) 3281 goto out_err; 3282 3283 for (i = 0; i < pc->immd_nr; i++) { 3284 for (c = 0; c < 4; c++, rid++) 3285 ctor_reg(&pc->immd[rid], P_IMMD, i, rid); 3286 } 3287 } 3288 3289 ret = TRUE; 3290out_err: 3291 if (pc->iv_p) 3292 free_temp(pc, pc->iv_p); 3293 if (pc->iv_c) 3294 free_temp(pc, pc->iv_c); 3295 3296 tgsi_parse_free(&tp); 3297 return ret; 3298} 3299 3300static void 3301free_nv50_pc(struct nv50_pc *pc) 3302{ 3303 if (pc->immd) 3304 FREE(pc->immd); 3305 if (pc->param) 3306 FREE(pc->param); 3307 if (pc->result) 3308 FREE(pc->result); 3309 if (pc->attr) 3310 FREE(pc->attr); 3311 if (pc->temp) 3312 FREE(pc->temp); 3313 3314 FREE(pc); 3315} 3316 3317static boolean 3318ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) 3319{ 3320 int i, c; 3321 unsigned rtype[2] = { P_ATTR, P_RESULT }; 3322 3323 pc->p = p; 3324 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; 3325 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; 3326 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; 3327 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; 3328 pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; 3329 assert(pc->addr_nr <= 2); 3330 3331 p->cfg.high_temp = 4; 3332 3333 p->cfg.two_side[0].hw = 0x40; 3334 p->cfg.two_side[1].hw = 0x40; 3335 3336 p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; 3337 3338 switch (p->type) { 3339 case PIPE_SHADER_VERTEX: 3340 p->cfg.psiz = 0x40; 3341 p->cfg.clpd = 0x40; 3342 p->cfg.io_nr = pc->result_nr; 3343 break; 3344 case PIPE_SHADER_FRAGMENT: 3345 rtype[0] = rtype[1] = P_TEMP; 3346 3347 p->cfg.regs[0] = 0x01000004; 3348 p->cfg.io_nr = pc->attr_nr; 3349 3350 if (p->info.writes_z) { 3351 p->cfg.regs[2] |= 0x00000100; 3352 p->cfg.regs[3] |= 0x00000011; 3353 } 3354 if (p->info.uses_kill) 3355 p->cfg.regs[2] |= 0x00100000; 3356 break; 3357 } 3358 3359 
if (pc->temp_nr) { 3360 pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); 3361 if (!pc->temp) 3362 return FALSE; 3363 3364 for (i = 0; i < pc->temp_nr * 4; ++i) 3365 ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); 3366 } 3367 3368 if (pc->attr_nr) { 3369 pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); 3370 if (!pc->attr) 3371 return FALSE; 3372 3373 for (i = 0; i < pc->attr_nr * 4; ++i) 3374 ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); 3375 } 3376 3377 if (pc->result_nr) { 3378 unsigned nr = pc->result_nr * 4; 3379 3380 pc->result = MALLOC(nr * sizeof(struct nv50_reg)); 3381 if (!pc->result) 3382 return FALSE; 3383 3384 for (i = 0; i < nr; ++i) 3385 ctor_reg(&pc->result[i], rtype[1], i / 4, -1); 3386 } 3387 3388 if (pc->param_nr) { 3389 int rid = 0; 3390 3391 pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); 3392 if (!pc->param) 3393 return FALSE; 3394 3395 for (i = 0; i < pc->param_nr; ++i) 3396 for (c = 0; c < 4; ++c, ++rid) 3397 ctor_reg(&pc->param[rid], P_CONST, i, rid); 3398 } 3399 3400 if (pc->addr_nr) { 3401 pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); 3402 if (!pc->addr) 3403 return FALSE; 3404 } 3405 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 3406 ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1); 3407 3408 return TRUE; 3409} 3410 3411static void 3412nv50_program_fixup_insns(struct nv50_pc *pc) 3413{ 3414 struct nv50_program_exec *e, **bra_list; 3415 unsigned i, n, pos; 3416 3417 bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); 3418 3419 /* Collect branch instructions, we need to adjust their offsets 3420 * when converting 32 bit instructions to 64 bit ones 3421 */ 3422 for (n = 0, e = pc->p->exec_head; e; e = e->next) 3423 if (e->param.index >= 0 && !e->param.mask) 3424 bra_list[n++] = e; 3425 3426 /* Make sure we don't have any single 32 bit instructions. */ 3427 for (e = pc->p->exec_head, pos = 0; e; e = e->next) { 3428 pos += is_long(e) ? 
2 : 1; 3429 3430 if ((pos & 1) && (!e->next || is_long(e->next))) { 3431 for (i = 0; i < n; ++i) 3432 if (bra_list[i]->param.index >= pos) 3433 bra_list[i]->param.index += 1; 3434 for (i = 0; i < pc->insn_nr; ++i) 3435 if (pc->insn_pos[i] >= pos) 3436 pc->insn_pos[i] += 1; 3437 convert_to_long(pc, e); 3438 ++pos; 3439 } 3440 } 3441 3442 FREE(bra_list); 3443 3444 if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) 3445 return; 3446 3447 /* fill in CALL offsets */ 3448 for (e = pc->p->exec_head; e; e = e->next) { 3449 if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) 3450 e->param.index = pc->insn_pos[e->param.index]; 3451 } 3452} 3453 3454static boolean 3455nv50_program_tx(struct nv50_program *p) 3456{ 3457 struct tgsi_parse_context parse; 3458 struct nv50_pc *pc; 3459 boolean ret; 3460 3461 pc = CALLOC_STRUCT(nv50_pc); 3462 if (!pc) 3463 return FALSE; 3464 3465 ret = ctor_nv50_pc(pc, p); 3466 if (ret == FALSE) 3467 goto out_cleanup; 3468 3469 ret = nv50_program_tx_prep(pc); 3470 if (ret == FALSE) 3471 goto out_cleanup; 3472 3473 pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); 3474 3475 tgsi_parse_init(&parse, pc->p->pipe.tokens); 3476 while (!tgsi_parse_end_of_tokens(&parse)) { 3477 const union tgsi_full_token *tok = &parse.FullToken; 3478 3479 /* previously allow32 was FALSE for first & last instruction */ 3480 pc->allow32 = TRUE; 3481 3482 tgsi_parse_token(&parse); 3483 3484 switch (tok->Token.Type) { 3485 case TGSI_TOKEN_TYPE_INSTRUCTION: 3486 pc->insn_pos[pc->insn_cur] = pc->p->exec_size; 3487 ++pc->insn_cur; 3488 ret = nv50_tgsi_insn(pc, tok); 3489 if (ret == FALSE) 3490 goto out_err; 3491 break; 3492 default: 3493 break; 3494 } 3495 } 3496 3497 nv50_program_fixup_insns(pc); 3498 3499 p->param_nr = pc->param_nr * 4; 3500 p->immd_nr = pc->immd_nr * 4; 3501 p->immd = pc->immd_buf; 3502 3503out_err: 3504 tgsi_parse_free(&parse); 3505 3506out_cleanup: 3507 free_nv50_pc(pc); 3508 return ret; 3509} 3510 3511static void 3512nv50_program_validate(struct 
nv50_context *nv50, struct nv50_program *p) 3513{ 3514 if (nv50_program_tx(p) == FALSE) 3515 assert(0); 3516 p->translated = TRUE; 3517} 3518 3519static void 3520nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map, 3521 unsigned start, unsigned count, unsigned cbuf) 3522{ 3523 struct nouveau_channel *chan = nv50->screen->base.channel; 3524 struct nouveau_grobj *tesla = nv50->screen->tesla; 3525 3526 while (count) { 3527 unsigned nr = count > 2047 ? 2047 : count; 3528 3529 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 3530 OUT_RING (chan, (cbuf << 0) | (start << 8)); 3531 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 3532 OUT_RINGp (chan, map, nr); 3533 3534 map += nr; 3535 start += nr; 3536 count -= nr; 3537 } 3538} 3539 3540static void 3541nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 3542{ 3543 struct pipe_screen *pscreen = nv50->pipe.screen; 3544 3545 if (!p->data[0] && p->immd_nr) { 3546 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 3547 3548 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 3549 while (heap->next && heap->size < p->immd_nr) { 3550 struct nv50_program *evict = heap->next->priv; 3551 nouveau_resource_free(&evict->data[0]); 3552 } 3553 3554 if (nouveau_resource_alloc(heap, p->immd_nr, p, 3555 &p->data[0])) 3556 assert(0); 3557 } 3558 3559 /* immediates only need to be uploaded again when freed */ 3560 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 3561 p->immd_nr, NV50_CB_PMISC); 3562 } 3563 3564 assert(p->param_nr <= 512); 3565 3566 if (p->param_nr) { 3567 unsigned cb; 3568 uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 3569 PIPE_BUFFER_USAGE_CPU_READ); 3570 3571 if (p->type == PIPE_SHADER_VERTEX) 3572 cb = NV50_CB_PVP; 3573 else 3574 cb = NV50_CB_PFP; 3575 3576 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); 3577 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 3578 } 3579} 3580 3581static void 
3582nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 3583{ 3584 struct nouveau_channel *chan = nv50->screen->base.channel; 3585 struct nv50_program_exec *e; 3586 uint32_t *up, i; 3587 boolean upload = FALSE; 3588 3589 if (!p->bo) { 3590 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 3591 p->exec_size * 4, &p->bo); 3592 upload = TRUE; 3593 } 3594 3595 if (p->data[0] && p->data[0]->start != p->data_start[0]) 3596 upload = TRUE; 3597 3598 if (!upload) 3599 return; 3600 3601 up = MALLOC(p->exec_size * 4); 3602 3603 for (i = 0, e = p->exec_head; e; e = e->next) { 3604 unsigned ei, ci, bs; 3605 3606 if (e->param.index >= 0 && e->param.mask) { 3607 bs = (e->inst[1] >> 22) & 0x07; 3608 assert(bs < 2); 3609 ei = e->param.shift >> 5; 3610 ci = e->param.index; 3611 if (bs == 0) 3612 ci += p->data[bs]->start; 3613 3614 e->inst[ei] &= ~e->param.mask; 3615 e->inst[ei] |= (ci << e->param.shift); 3616 } else 3617 if (e->param.index >= 0) { 3618 /* zero mask means param is a jump/branch offset */ 3619 assert(!(e->param.index & 1)); 3620 /* seem to be 8 byte steps */ 3621 ei = (e->param.index >> 1) + 0 /* START_ID */; 3622 3623 e->inst[0] &= 0xf0000fff; 3624 e->inst[0] |= ei << 12; 3625 } 3626 3627 up[i++] = e->inst[0]; 3628 if (is_long(e)) 3629 up[i++] = e->inst[1]; 3630 } 3631 assert(i == p->exec_size); 3632 3633 if (p->data[0]) 3634 p->data_start[0] = p->data[0]->start; 3635 3636#ifdef NV50_PROGRAM_DUMP 3637 NOUVEAU_ERR("-------\n"); 3638 for (e = p->exec_head; e; e = e->next) { 3639 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 3640 if (is_long(e)) 3641 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 3642 } 3643#endif 3644 nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM, 3645 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, 3646 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0, 3647 0, 0, p->exec_size * 4, 1, 1); 3648 3649 FREE(up); 3650} 3651 3652void 3653nv50_vertprog_validate(struct nv50_context *nv50) 3654{ 3655 struct nouveau_grobj *tesla = nv50->screen->tesla; 3656 struct 
nv50_program *p = nv50->vertprog; 3657 struct nouveau_stateobj *so; 3658 3659 if (!p->translated) { 3660 nv50_program_validate(nv50, p); 3661 if (!p->translated) 3662 assert(0); 3663 } 3664 3665 nv50_program_validate_data(nv50, p); 3666 nv50_program_validate_code(nv50, p); 3667 3668 so = so_new(5, 8, 2); 3669 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 3670 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3671 NOUVEAU_BO_HIGH, 0, 0); 3672 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3673 NOUVEAU_BO_LOW, 0, 0); 3674 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 3675 so_data (so, p->cfg.attr[0]); 3676 so_data (so, p->cfg.attr[1]); 3677 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 3678 so_data (so, p->cfg.high_result); 3679 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 3680 so_data (so, p->cfg.high_result); //8); 3681 so_data (so, p->cfg.high_temp); 3682 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 3683 so_data (so, 0); /* program start offset */ 3684 so_ref(so, &nv50->state.vertprog); 3685 so_ref(NULL, &so); 3686} 3687 3688void 3689nv50_fragprog_validate(struct nv50_context *nv50) 3690{ 3691 struct nouveau_grobj *tesla = nv50->screen->tesla; 3692 struct nv50_program *p = nv50->fragprog; 3693 struct nouveau_stateobj *so; 3694 3695 if (!p->translated) { 3696 nv50_program_validate(nv50, p); 3697 if (!p->translated) 3698 assert(0); 3699 } 3700 3701 nv50_program_validate_data(nv50, p); 3702 nv50_program_validate_code(nv50, p); 3703 3704 so = so_new(6, 7, 2); 3705 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 3706 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3707 NOUVEAU_BO_HIGH, 0, 0); 3708 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3709 NOUVEAU_BO_LOW, 0, 0); 3710 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); 3711 so_data (so, p->cfg.high_temp); 3712 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 3713 so_data (so, p->cfg.high_result); 3714 so_method(so, tesla, NV50TCL_FP_CONTROL, 1); 
3715 so_data (so, p->cfg.regs[2]); 3716 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 3717 so_data (so, p->cfg.regs[3]); 3718 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 3719 so_data (so, 0); /* program start offset */ 3720 so_ref(so, &nv50->state.fragprog); 3721 so_ref(NULL, &so); 3722} 3723 3724static void 3725nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) 3726{ 3727 struct nv50_program *fp = nv50->fragprog; 3728 struct nv50_program *vp = nv50->vertprog; 3729 unsigned i, c, m = base; 3730 3731 /* XXX: this might not work correctly in all cases yet - we'll 3732 * just assume that an FP generic input that is not written in 3733 * the VP is PointCoord. 3734 */ 3735 memset(pntc, 0, 8 * sizeof(uint32_t)); 3736 3737 for (i = 0; i < fp->cfg.io_nr; i++) { 3738 uint8_t sn, si; 3739 uint8_t j, k = fp->cfg.io[i].id; 3740 unsigned n = popcnt4(fp->cfg.io[i].mask); 3741 3742 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { 3743 m += n; 3744 continue; 3745 } 3746 3747 for (j = 0; j < vp->info.num_outputs; ++j) { 3748 sn = vp->info.output_semantic_name[j]; 3749 si = vp->info.output_semantic_index[j]; 3750 3751 if (sn == fp->info.input_semantic_name[k] && 3752 si == fp->info.input_semantic_index[k]) 3753 break; 3754 } 3755 3756 if (j < vp->info.num_outputs) { 3757 ubyte mode = 3758 nv50->rasterizer->pipe.sprite_coord_mode[si]; 3759 3760 if (mode == PIPE_SPRITE_COORD_NONE) { 3761 m += n; 3762 continue; 3763 } 3764 } 3765 3766 /* this is either PointCoord or replaced by sprite coords */ 3767 for (c = 0; c < 4; c++) { 3768 if (!(fp->cfg.io[i].mask & (1 << c))) 3769 continue; 3770 pntc[m / 8] |= (c + 1) << ((m % 8) * 4); 3771 ++m; 3772 } 3773 } 3774} 3775 3776static int 3777nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4], 3778 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) 3779{ 3780 int c; 3781 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; 3782 uint8_t *map = (uint8_t *)p_map; 3783 3784 for (c = 0; c < 4; ++c) { 
3785 if (mf & 1) { 3786 if (fpi->linear == TRUE) 3787 lin[mid / 32] |= 1 << (mid % 32); 3788 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40); 3789 } 3790 3791 oid += mv & 1; 3792 mf >>= 1; 3793 mv >>= 1; 3794 } 3795 3796 return mid; 3797} 3798 3799void 3800nv50_linkage_validate(struct nv50_context *nv50) 3801{ 3802 struct nouveau_grobj *tesla = nv50->screen->tesla; 3803 struct nv50_program *vp = nv50->vertprog; 3804 struct nv50_program *fp = nv50->fragprog; 3805 struct nouveau_stateobj *so; 3806 struct nv50_sreg4 dummy, *vpo; 3807 int i, n, c, m = 0; 3808 uint32_t map[16], lin[4], reg[5], pcrd[8]; 3809 3810 memset(map, 0, sizeof(map)); 3811 memset(lin, 0, sizeof(lin)); 3812 3813 reg[1] = 0x00000004; /* low and high clip distance map ids */ 3814 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ 3815 reg[3] = 0x00000000; /* point size map id & enable */ 3816 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ 3817 reg[4] = fp->cfg.regs[1]; /* interpolant info */ 3818 3819 dummy.linear = FALSE; 3820 dummy.mask = 0xf; /* map all components of HPOS */ 3821 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); 3822 3823 dummy.mask = 0x0; 3824 3825 if (vp->cfg.clpd < 0x40) { 3826 for (c = 0; c < vp->cfg.clpd_nr; ++c) 3827 map[m++] = vp->cfg.clpd + c; 3828 reg[1] = (m << 8); 3829 } 3830 3831 reg[0] |= m << 8; /* adjust BFC0 id */ 3832 3833 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ 3834 if (nv50->rasterizer->pipe.light_twoside) { 3835 vpo = &vp->cfg.two_side[0]; 3836 3837 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); 3838 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); 3839 } 3840 3841 reg[0] += m - 4; /* adjust FFC0 id */ 3842 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ 3843 3844 for (i = 0; i < fp->cfg.io_nr; i++) { 3845 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id]; 3846 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id]; 3847 3848 /* position 
must be mapped first */ 3849 assert(i == 0 || sn != TGSI_SEMANTIC_POSITION); 3850 3851 /* maybe even remove these from cfg.io */ 3852 if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE) 3853 continue; 3854 3855 /* VP outputs and vp->cfg.io are in the same order */ 3856 for (n = 0; n < vp->info.num_outputs; ++n) { 3857 if (vp->info.output_semantic_name[n] == sn && 3858 vp->info.output_semantic_index[n] == si) 3859 break; 3860 } 3861 vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy; 3862 3863 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); 3864 } 3865 3866 if (nv50->rasterizer->pipe.point_size_per_vertex) { 3867 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); 3868 reg[3] = (m++ << 4) | 1; 3869 } 3870 3871 /* now fill the stateobj */ 3872 so = so_new(6, 58, 0); 3873 3874 n = (m + 3) / 4; 3875 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); 3876 so_data (so, m); 3877 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); 3878 so_datap (so, map, n); 3879 3880 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 3881 so_datap (so, reg, 4); 3882 3883 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); 3884 so_data (so, reg[4]); 3885 3886 so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); 3887 so_datap (so, lin, 4); 3888 3889 if (nv50->rasterizer->pipe.point_sprite) { 3890 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff); 3891 3892 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); 3893 so_datap (so, pcrd, 8); 3894 } 3895 3896 so_ref(so, &nv50->state.programs); 3897 so_ref(NULL, &so); 3898} 3899 3900void 3901nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 3902{ 3903 while (p->exec_head) { 3904 struct nv50_program_exec *e = p->exec_head; 3905 3906 p->exec_head = e->next; 3907 FREE(e); 3908 } 3909 p->exec_tail = NULL; 3910 p->exec_size = 0; 3911 3912 nouveau_bo_ref(NULL, &p->bo); 3913 3914 nouveau_resource_free(&p->data[0]); 3915 3916 p->translated = 0; 3917} 3918