/* nv50_program.c revision 152b3bd6ef70b74e2df50ff555cfacb5423ebf17 */
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 127 35#define NV50_SU_MAX_ADDR 4 36//#define NV50_PROGRAM_DUMP 37 38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ 39 40/* ARL - gallium craps itself on progs/vp/arl.txt 41 * 42 * MSB - Like MAD, but MUL+SUB 43 * - Fuck it off, introduce a way to negate args for ops that 44 * support it. 45 * 46 * Look into inlining IMMD for ops other than MOV (make it general?) 47 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 48 * but can emit to P_TEMP first - then MOV later. 
NVIDIA does this 49 * 50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 51 * case, if the emit_src() causes the inst to suddenly become long. 52 * 53 * Verify half-insns work where expected - and force disable them where they 54 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 55 * 56 * FUCK! watch dst==src vectors, can overwrite components that are needed. 57 * ie. SUB R0, R0.yzxw, R0 58 * 59 * Things to check with renouveau: 60 * FP attr/result assignment - how? 61 * attrib 62 * - 0x16bc maps vp output onto fp hpos 63 * - 0x16c0 maps vp output onto fp col0 64 * result 65 * - colr always 0-3 66 * - depr always 4 67 * 0x16bc->0x16e8 --> some binding between vp/fp regs 68 * 0x16b8 --> VP output count 69 * 70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 71 * "MOV rcol.x, fcol.y" = 0x00000004 72 * 0x19a8 --> as above but 0x00000100 and 0x00000000 73 * - 0x00100000 used when KIL used 74 * 0x196c --> as above but 0x00000011 and 0x00000000 75 * 76 * 0x1988 --> 0xXXNNNNNN 77 * - XX == FP high something 78 */ 79struct nv50_reg { 80 enum { 81 P_TEMP, 82 P_ATTR, 83 P_RESULT, 84 P_CONST, 85 P_IMMD, 86 P_ADDR 87 } type; 88 int index; 89 90 int hw; 91 int mod; 92 93 int rhw; /* result hw for FP outputs, or interpolant index */ 94 int acc; /* instruction where this reg is last read (first insn == 1) */ 95}; 96 97#define NV50_MOD_NEG 1 98#define NV50_MOD_ABS 2 99#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) 100#define NV50_MOD_SAT 4 101#define NV50_MOD_I32 8 102 103/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ 104 105/* STACK: Conditionals and loops have to use the (per warp) stack. 106 * Stack entries consist of an entry type (divergent path, join at), 107 * a mask indicating the active threads of the warp, and an address. 108 * MPs can store 12 stack entries internally, if we need more (and 109 * we probably do), we have to create a stack buffer in VRAM. 
110 */ 111/* impose low limits for now */ 112#define NV50_MAX_COND_NESTING 4 113#define NV50_MAX_LOOP_NESTING 3 114 115#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 116 117struct nv50_pc { 118 struct nv50_program *p; 119 120 /* hw resources */ 121 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 122 struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; 123 124 /* tgsi resources */ 125 struct nv50_reg *temp; 126 int temp_nr; 127 struct nv50_reg *attr; 128 int attr_nr; 129 struct nv50_reg *result; 130 int result_nr; 131 struct nv50_reg *param; 132 int param_nr; 133 struct nv50_reg *immd; 134 uint32_t *immd_buf; 135 int immd_nr; 136 struct nv50_reg **addr; 137 int addr_nr; 138 uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */ 139 140 struct nv50_reg *temp_temp[16]; 141 unsigned temp_temp_nr; 142 143 /* broadcast and destination replacement regs */ 144 struct nv50_reg *r_brdc; 145 struct nv50_reg *r_dst[4]; 146 147 struct nv50_reg reg_instances[16]; 148 unsigned reg_instance_nr; 149 150 unsigned interp_mode[32]; 151 /* perspective interpolation registers */ 152 struct nv50_reg *iv_p; 153 struct nv50_reg *iv_c; 154 155 struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; 156 struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; 157 struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; 158 int if_lvl, loop_lvl; 159 unsigned loop_pos[NV50_MAX_LOOP_NESTING]; 160 161 unsigned *insn_pos; /* actual program offset of each TGSI insn */ 162 boolean in_subroutine; 163 164 /* current instruction and total number of insns */ 165 unsigned insn_cur; 166 unsigned insn_nr; 167 168 boolean allow32; 169 170 uint8_t edgeflag_out; 171}; 172 173static INLINE void 174ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) 175{ 176 reg->type = type; 177 reg->index = index; 178 reg->hw = hw; 179 reg->mod = 0; 180 reg->rhw = -1; 181 reg->acc = 0; 182} 183 184static INLINE unsigned 185popcnt4(uint32_t val) 186{ 187 static const unsigned cnt[16] 188 = { 0, 1, 1, 
2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 189 return cnt[val & 0xf]; 190} 191 192static void 193terminate_mbb(struct nv50_pc *pc) 194{ 195 int i; 196 197 /* remove records of temporary address register values */ 198 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 199 pc->r_addr[i].rhw = -1; 200} 201 202static void 203alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 204{ 205 int i = 0; 206 207 if (reg->type == P_RESULT) { 208 if (pc->p->cfg.high_result < (reg->hw + 1)) 209 pc->p->cfg.high_result = reg->hw + 1; 210 } 211 212 if (reg->type != P_TEMP) 213 return; 214 215 if (reg->hw >= 0) { 216 /*XXX: do this here too to catch FP temp-as-attr usage.. 217 * not clean, but works */ 218 if (pc->p->cfg.high_temp < (reg->hw + 1)) 219 pc->p->cfg.high_temp = reg->hw + 1; 220 return; 221 } 222 223 if (reg->rhw != -1) { 224 /* try to allocate temporary with index rhw first */ 225 if (!(pc->r_temp[reg->rhw])) { 226 pc->r_temp[reg->rhw] = reg; 227 reg->hw = reg->rhw; 228 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 229 pc->p->cfg.high_temp = reg->rhw + 1; 230 return; 231 } 232 /* make sure we don't get things like $r0 needs to go 233 * in $r1 and $r1 in $r0 234 */ 235 i = pc->result_nr * 4; 236 } 237 238 for (; i < NV50_SU_MAX_TEMP; i++) { 239 if (!(pc->r_temp[i])) { 240 pc->r_temp[i] = reg; 241 reg->hw = i; 242 if (pc->p->cfg.high_temp < (i + 1)) 243 pc->p->cfg.high_temp = i + 1; 244 return; 245 } 246 } 247 248 assert(0); 249} 250 251static INLINE struct nv50_reg * 252reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) 253{ 254 struct nv50_reg *ri; 255 256 assert(pc->reg_instance_nr < 16); 257 ri = &pc->reg_instances[pc->reg_instance_nr++]; 258 if (reg) { 259 alloc_reg(pc, reg); 260 *ri = *reg; 261 reg->mod = 0; 262 } 263 return ri; 264} 265 266/* XXX: For shaders that aren't executed linearly (e.g. shaders that 267 * contain loops), we need to assign all hw regs to TGSI TEMPs early, 268 * lest we risk temp_temps overwriting regs alloc'd "later". 
269 */ 270static struct nv50_reg * 271alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 272{ 273 struct nv50_reg *r; 274 int i; 275 276 if (dst && dst->type == P_TEMP && dst->hw == -1) 277 return dst; 278 279 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 280 if (!pc->r_temp[i]) { 281 r = MALLOC_STRUCT(nv50_reg); 282 ctor_reg(r, P_TEMP, -1, i); 283 pc->r_temp[i] = r; 284 return r; 285 } 286 } 287 288 assert(0); 289 return NULL; 290} 291 292/* release the hardware resource held by r */ 293static void 294release_hw(struct nv50_pc *pc, struct nv50_reg *r) 295{ 296 assert(r->type == P_TEMP); 297 if (r->hw == -1) 298 return; 299 300 assert(pc->r_temp[r->hw] == r); 301 pc->r_temp[r->hw] = NULL; 302 303 r->acc = 0; 304 if (r->index == -1) 305 FREE(r); 306} 307 308static void 309free_temp(struct nv50_pc *pc, struct nv50_reg *r) 310{ 311 if (r->index == -1) { 312 unsigned hw = r->hw; 313 314 FREE(pc->r_temp[hw]); 315 pc->r_temp[hw] = NULL; 316 } 317} 318 319static int 320alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 321{ 322 int i; 323 324 if ((idx + 4) >= NV50_SU_MAX_TEMP) 325 return 1; 326 327 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 328 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 329 return alloc_temp4(pc, dst, idx + 4); 330 331 for (i = 0; i < 4; i++) { 332 dst[i] = MALLOC_STRUCT(nv50_reg); 333 ctor_reg(dst[i], P_TEMP, -1, idx + i); 334 pc->r_temp[idx + i] = dst[i]; 335 } 336 337 return 0; 338} 339 340static void 341free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 342{ 343 int i; 344 345 for (i = 0; i < 4; i++) 346 free_temp(pc, reg[i]); 347} 348 349static struct nv50_reg * 350temp_temp(struct nv50_pc *pc) 351{ 352 if (pc->temp_temp_nr >= 16) 353 assert(0); 354 355 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 356 return pc->temp_temp[pc->temp_temp_nr++]; 357} 358 359static void 360kill_temp_temp(struct nv50_pc *pc) 361{ 362 int i; 363 364 for (i = 0; i < pc->temp_temp_nr; i++) 365 free_temp(pc, pc->temp_temp[i]); 366 
pc->temp_temp_nr = 0; 367} 368 369static int 370ctor_immd_4u32(struct nv50_pc *pc, 371 uint32_t x, uint32_t y, uint32_t z, uint32_t w) 372{ 373 unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); 374 375 pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); 376 377 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 378 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 379 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 380 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 381 382 return pc->immd_nr++; 383} 384 385static INLINE int 386ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) 387{ 388 return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); 389} 390 391static struct nv50_reg * 392alloc_immd(struct nv50_pc *pc, float f) 393{ 394 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 395 unsigned hw; 396 397 for (hw = 0; hw < pc->immd_nr * 4; hw++) 398 if (pc->immd_buf[hw] == fui(f)) 399 break; 400 401 if (hw == pc->immd_nr * 4) 402 hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; 403 404 ctor_reg(r, P_IMMD, -1, hw); 405 return r; 406} 407 408static struct nv50_program_exec * 409exec(struct nv50_pc *pc) 410{ 411 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 412 413 e->param.index = -1; 414 return e; 415} 416 417static void 418emit(struct nv50_pc *pc, struct nv50_program_exec *e) 419{ 420 struct nv50_program *p = pc->p; 421 422 if (p->exec_tail) 423 p->exec_tail->next = e; 424 if (!p->exec_head) 425 p->exec_head = e; 426 p->exec_tail = e; 427 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 428} 429 430static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 431 432static boolean 433is_long(struct nv50_program_exec *e) 434{ 435 if (e->inst[0] & 1) 436 return TRUE; 437 return FALSE; 438} 439 440static boolean 441is_immd(struct nv50_program_exec *e) 442{ 443 if (is_long(e) && (e->inst[1] & 3) == 3) 444 return TRUE; 445 return FALSE; 446} 447 448static boolean 449is_join(struct nv50_program_exec *e) 450{ 451 if (is_long(e) && (e->inst[1] & 3) == 2) 452 return TRUE; 453 return FALSE; 454} 455 456static INLINE void 457set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 458 struct nv50_program_exec *e) 459{ 460 assert(!is_immd(e)); 461 set_long(pc, e); 462 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 463 e->inst[1] |= (pred << 7) | (idx << 12); 464} 465 466static INLINE void 467set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 468 struct nv50_program_exec *e) 469{ 470 set_long(pc, e); 471 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 472 e->inst[1] |= (idx << 4) | (on << 6); 473} 474 475static INLINE void 476set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 477{ 478 if (is_long(e)) 479 return; 480 481 e->inst[0] |= 1; 482 set_pred(pc, 0xf, 0, e); 483 set_pred_wr(pc, 0, 0, e); 484} 485 486static INLINE void 487set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 488{ 489 if (dst->type == P_RESULT) { 490 set_long(pc, e); 491 e->inst[1] |= 0x00000008; 492 } 493 494 alloc_reg(pc, dst); 495 if (dst->hw > 63) 496 set_long(pc, e); 497 e->inst[0] |= (dst->hw << 2); 498} 499 500static INLINE void 501set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 502{ 503 set_long(pc, e); 504 /* XXX: can't be predicated - bits overlap; cases where both 505 * are required should be avoided by using pc->allow32 */ 506 set_pred(pc, 0, 0, e); 507 set_pred_wr(pc, 0, 0, e); 508 509 e->inst[1] |= 0x00000002 | 0x00000001; 510 e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; 511 e->inst[1] 
|= (pc->immd_buf[imm->hw] >> 6) << 2; 512} 513 514static INLINE void 515set_addr(struct nv50_program_exec *e, struct nv50_reg *a) 516{ 517 assert(!(e->inst[0] & 0x0c000000)); 518 assert(!(e->inst[1] & 0x00000004)); 519 520 e->inst[0] |= (a->hw & 3) << 26; 521 e->inst[1] |= (a->hw >> 2) << 2; 522} 523 524static void 525emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, 526 struct nv50_reg *src0, uint16_t src1_val) 527{ 528 struct nv50_program_exec *e = exec(pc); 529 530 e->inst[0] = 0xd0000000 | (src1_val << 9); 531 e->inst[1] = 0x20000000; 532 set_long(pc, e); 533 e->inst[0] |= dst->hw << 2; 534 if (src0) /* otherwise will add to $a0, which is always 0 */ 535 set_addr(e, src0); 536 537 emit(pc, e); 538} 539 540static struct nv50_reg * 541alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref) 542{ 543 struct nv50_reg *a_tgsi = NULL, *a = NULL; 544 int i; 545 uint8_t avail = ~pc->addr_alloc; 546 547 if (!ref) { 548 /* allocate for TGSI_FILE_ADDRESS */ 549 while (avail) { 550 i = ffs(avail) - 1; 551 552 if (pc->r_addr[i].rhw < 0 || 553 pc->r_addr[i].acc != pc->insn_cur) { 554 pc->addr_alloc |= (1 << i); 555 556 pc->r_addr[i].rhw = -1; 557 pc->r_addr[i].index = i; 558 return &pc->r_addr[i]; 559 } 560 avail &= ~(1 << i); 561 } 562 assert(0); 563 return NULL; 564 } 565 566 /* Allocate and set an address reg so we can access 'ref'. 567 * 568 * If and r_addr->index will be -1 or the hw index the value 569 * value in rhw is relative to. If rhw < 0, the reg has not 570 * been initialized or is in use for TGSI_FILE_ADDRESS. 
571 */ 572 while (avail) { /* only consider regs that are not TGSI */ 573 i = ffs(avail) - 1; 574 avail &= ~(1 << i); 575 576 if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) { 577 /* prefer an usused reg with low hw index */ 578 a = &pc->r_addr[i]; 579 continue; 580 } 581 if (!a && pc->r_addr[i].acc != pc->insn_cur) 582 a = &pc->r_addr[i]; 583 584 if (ref->hw - pc->r_addr[i].rhw >= 128) 585 continue; 586 587 if ((ref->acc >= 0 && pc->r_addr[i].index < 0) || 588 (ref->acc < 0 && pc->r_addr[i].index == ref->index)) { 589 pc->r_addr[i].acc = pc->insn_cur; 590 return &pc->r_addr[i]; 591 } 592 } 593 assert(a); 594 595 if (ref->acc < 0) 596 a_tgsi = pc->addr[ref->index]; 597 598 emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4); 599 600 a->rhw = ref->hw & ~0x7f; 601 a->acc = pc->insn_cur; 602 a->index = a_tgsi ? ref->index : -1; 603 return a; 604} 605 606#define INTERP_LINEAR 0 607#define INTERP_FLAT 1 608#define INTERP_PERSPECTIVE 2 609#define INTERP_CENTROID 4 610 611/* interpolant index has been stored in dst->rhw */ 612static void 613emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 614 unsigned mode) 615{ 616 assert(dst->rhw != -1); 617 struct nv50_program_exec *e = exec(pc); 618 619 e->inst[0] |= 0x80000000; 620 set_dst(pc, dst, e); 621 e->inst[0] |= (dst->rhw << 16); 622 623 if (mode & INTERP_FLAT) { 624 e->inst[0] |= (1 << 8); 625 } else { 626 if (mode & INTERP_PERSPECTIVE) { 627 e->inst[0] |= (1 << 25); 628 alloc_reg(pc, iv); 629 e->inst[0] |= (iv->hw << 9); 630 } 631 632 if (mode & INTERP_CENTROID) 633 e->inst[0] |= (1 << 24); 634 } 635 636 emit(pc, e); 637} 638 639static void 640set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 641 struct nv50_program_exec *e) 642{ 643 set_long(pc, e); 644 645 e->param.index = src->hw & 127; 646 e->param.shift = s; 647 e->param.mask = m << (s % 32); 648 649 if (src->hw > 127) 650 set_addr(e, alloc_addr(pc, src)); 651 else 652 if (src->acc < 0) { 653 assert(src->type == 
P_CONST); 654 set_addr(e, pc->addr[src->index]); 655 } 656 657 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 658} 659 660/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ 661static void 662emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 663{ 664 struct nv50_program_exec *e = exec(pc); 665 666 e->inst[0] = 0x10000000; 667 if (!pc->allow32) 668 set_long(pc, e); 669 670 set_dst(pc, dst, e); 671 672 if (!is_long(e) && src->type == P_IMMD) { 673 set_immd(pc, src, e); 674 /*XXX: 32-bit, but steals part of "half" reg space - need to 675 * catch and handle this case if/when we do half-regs 676 */ 677 } else 678 if (src->type == P_IMMD || src->type == P_CONST) { 679 set_long(pc, e); 680 set_data(pc, src, 0x7f, 9, e); 681 e->inst[1] |= 0x20000000; /* mov from c[] */ 682 } else { 683 if (src->type == P_ATTR) { 684 set_long(pc, e); 685 e->inst[1] |= 0x00200000; 686 } 687 688 alloc_reg(pc, src); 689 if (src->hw > 63) 690 set_long(pc, e); 691 e->inst[0] |= (src->hw << 9); 692 } 693 694 if (is_long(e) && !is_immd(e)) { 695 e->inst[1] |= 0x04000000; /* 32-bit */ 696 e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ 697 if (!(e->inst[1] & 0x20000000)) 698 e->inst[1] |= 0x00030000; /* lane mask 2:3 */ 699 } else 700 e->inst[0] |= 0x00008000; 701 702 emit(pc, e); 703} 704 705static INLINE void 706emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 707{ 708 struct nv50_reg *imm = alloc_immd(pc, f); 709 emit_mov(pc, dst, imm); 710 FREE(imm); 711} 712 713/* Assign the hw of the discarded temporary register src 714 * to the tgsi register dst and free src. 
715 */ 716static void 717assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 718{ 719 assert(src->index == -1 && src->hw != -1); 720 721 if (pc->if_lvl || pc->loop_lvl || 722 (dst->type != P_TEMP) || 723 (src->hw < pc->result_nr * 4 && 724 pc->p->type == PIPE_SHADER_FRAGMENT) || 725 pc->p->info.opcode_count[TGSI_OPCODE_CAL] || 726 pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { 727 728 emit_mov(pc, dst, src); 729 free_temp(pc, src); 730 return; 731 } 732 733 if (dst->hw != -1) 734 pc->r_temp[dst->hw] = NULL; 735 pc->r_temp[src->hw] = dst; 736 dst->hw = src->hw; 737 738 FREE(src); 739} 740 741static void 742emit_nop(struct nv50_pc *pc) 743{ 744 struct nv50_program_exec *e = exec(pc); 745 746 e->inst[0] = 0xf0000000; 747 set_long(pc, e); 748 e->inst[1] = 0xe0000000; 749 emit(pc, e); 750} 751 752static boolean 753check_swap_src_0_1(struct nv50_pc *pc, 754 struct nv50_reg **s0, struct nv50_reg **s1) 755{ 756 struct nv50_reg *src0 = *s0, *src1 = *s1; 757 758 if (src0->type == P_CONST) { 759 if (src1->type != P_CONST) { 760 *s0 = src1; 761 *s1 = src0; 762 return TRUE; 763 } 764 } else 765 if (src1->type == P_ATTR) { 766 if (src0->type != P_ATTR) { 767 *s0 = src1; 768 *s1 = src0; 769 return TRUE; 770 } 771 } 772 773 return FALSE; 774} 775 776static void 777set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, 778 struct nv50_program_exec *e) 779{ 780 struct nv50_reg *temp; 781 782 if (src->type != P_TEMP) { 783 temp = temp_temp(pc); 784 emit_mov(pc, temp, src); 785 src = temp; 786 } 787 788 alloc_reg(pc, src); 789 if (src->hw > 63) 790 set_long(pc, e); 791 e->inst[0] |= (src->hw << 9); 792} 793 794static void 795set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 796{ 797 if (src->type == P_ATTR) { 798 set_long(pc, e); 799 e->inst[1] |= 0x00200000; 800 } else 801 if (src->type == P_CONST || src->type == P_IMMD) { 802 struct nv50_reg *temp = temp_temp(pc); 803 804 emit_mov(pc, temp, src); 805 src = temp; 806 } 
807 808 alloc_reg(pc, src); 809 if (src->hw > 63) 810 set_long(pc, e); 811 e->inst[0] |= (src->hw << 9); 812} 813 814static void 815set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 816{ 817 if (src->type == P_ATTR) { 818 struct nv50_reg *temp = temp_temp(pc); 819 820 emit_mov(pc, temp, src); 821 src = temp; 822 } else 823 if (src->type == P_CONST || src->type == P_IMMD) { 824 assert(!(e->inst[0] & 0x00800000)); 825 if (e->inst[0] & 0x01000000) { 826 struct nv50_reg *temp = temp_temp(pc); 827 828 emit_mov(pc, temp, src); 829 src = temp; 830 } else { 831 set_data(pc, src, 0x7f, 16, e); 832 e->inst[0] |= 0x00800000; 833 } 834 } 835 836 alloc_reg(pc, src); 837 if (src->hw > 63) 838 set_long(pc, e); 839 e->inst[0] |= ((src->hw & 127) << 16); 840} 841 842static void 843set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 844{ 845 set_long(pc, e); 846 847 if (src->type == P_ATTR) { 848 struct nv50_reg *temp = temp_temp(pc); 849 850 emit_mov(pc, temp, src); 851 src = temp; 852 } else 853 if (src->type == P_CONST || src->type == P_IMMD) { 854 assert(!(e->inst[0] & 0x01000000)); 855 if (e->inst[0] & 0x00800000) { 856 struct nv50_reg *temp = temp_temp(pc); 857 858 emit_mov(pc, temp, src); 859 src = temp; 860 } else { 861 set_data(pc, src, 0x7f, 32+14, e); 862 e->inst[0] |= 0x01000000; 863 } 864 } 865 866 alloc_reg(pc, src); 867 e->inst[1] |= ((src->hw & 127) << 14); 868} 869 870static void 871emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) 872{ 873 struct nv50_program_exec *e = exec(pc); 874 875 assert(dst->type == P_TEMP); 876 e->inst[1] = 0x20000000 | (pred << 12); 877 set_long(pc, e); 878 set_dst(pc, dst, e); 879 880 emit(pc, e); 881} 882 883static void 884emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) 885{ 886 struct nv50_program_exec *e = exec(pc); 887 888 e->inst[0] = 0x000001fc; 889 e->inst[1] = 0xa0000008; 890 set_long(pc, e); 891 set_pred_wr(pc, 1, pred, e); 892 
set_src_0_restricted(pc, src, e); 893 894 emit(pc, e); 895} 896 897static void 898emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 899 struct nv50_reg *src1) 900{ 901 struct nv50_program_exec *e = exec(pc); 902 903 e->inst[0] |= 0xc0000000; 904 905 if (!pc->allow32) 906 set_long(pc, e); 907 908 check_swap_src_0_1(pc, &src0, &src1); 909 set_dst(pc, dst, e); 910 set_src_0(pc, src0, e); 911 if (src1->type == P_IMMD && !is_long(e)) { 912 if (src0->mod ^ src1->mod) 913 e->inst[0] |= 0x00008000; 914 set_immd(pc, src1, e); 915 } else { 916 set_src_1(pc, src1, e); 917 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { 918 if (is_long(e)) 919 e->inst[1] |= 0x08000000; 920 else 921 e->inst[0] |= 0x00008000; 922 } 923 } 924 925 emit(pc, e); 926} 927 928static void 929emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 930 struct nv50_reg *src0, struct nv50_reg *src1) 931{ 932 struct nv50_program_exec *e = exec(pc); 933 934 e->inst[0] = 0xb0000000; 935 936 alloc_reg(pc, src1); 937 check_swap_src_0_1(pc, &src0, &src1); 938 939 if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { 940 set_long(pc, e); 941 e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | 942 ((src1->mod & NV50_MOD_NEG) << 27); 943 } 944 945 set_dst(pc, dst, e); 946 set_src_0(pc, src0, e); 947 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 948 set_src_2(pc, src1, e); 949 else 950 if (src1->type == P_IMMD) 951 set_immd(pc, src1, e); 952 else 953 set_src_1(pc, src1, e); 954 955 emit(pc, e); 956} 957 958static void 959emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 960 uint8_t s) 961{ 962 struct nv50_program_exec *e = exec(pc); 963 964 set_long(pc, e); 965 e->inst[1] |= 0xc0000000; 966 967 e->inst[0] |= dst->hw << 2; 968 e->inst[0] |= s << 16; /* shift left */ 969 set_src_0_restricted(pc, src, e); 970 971 emit(pc, e); 972} 973 974static void 975emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 976 struct nv50_reg *src0, struct 
nv50_reg *src1) 977{ 978 struct nv50_program_exec *e = exec(pc); 979 980 set_long(pc, e); 981 e->inst[0] |= 0xb0000000; 982 e->inst[1] |= (sub << 29); 983 984 check_swap_src_0_1(pc, &src0, &src1); 985 set_dst(pc, dst, e); 986 set_src_0(pc, src0, e); 987 set_src_1(pc, src1, e); 988 989 if (src0->mod & NV50_MOD_ABS) 990 e->inst[1] |= 0x00100000; 991 if (src1->mod & NV50_MOD_ABS) 992 e->inst[1] |= 0x00080000; 993 994 emit(pc, e); 995} 996 997static INLINE void 998emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 999 struct nv50_reg *src1) 1000{ 1001 src1->mod ^= NV50_MOD_NEG; 1002 emit_add(pc, dst, src0, src1); 1003 src1->mod ^= NV50_MOD_NEG; 1004} 1005 1006static void 1007emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1008 struct nv50_reg *src1, unsigned op) 1009{ 1010 struct nv50_program_exec *e = exec(pc); 1011 1012 e->inst[0] = 0xd0000000; 1013 set_long(pc, e); 1014 1015 check_swap_src_0_1(pc, &src0, &src1); 1016 set_dst(pc, dst, e); 1017 set_src_0(pc, src0, e); 1018 1019 if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && 1020 op != TGSI_OPCODE_XOR) 1021 assert(!"invalid bit op"); 1022 1023 assert(!(src0->mod | src1->mod)); 1024 1025 if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { 1026 set_immd(pc, src1, e); 1027 if (op == TGSI_OPCODE_OR) 1028 e->inst[0] |= 0x0100; 1029 else 1030 if (op == TGSI_OPCODE_XOR) 1031 e->inst[0] |= 0x8000; 1032 } else { 1033 set_src_1(pc, src1, e); 1034 e->inst[1] |= 0x04000000; /* 32 bit */ 1035 if (op == TGSI_OPCODE_OR) 1036 e->inst[1] |= 0x4000; 1037 else 1038 if (op == TGSI_OPCODE_XOR) 1039 e->inst[1] |= 0x8000; 1040 } 1041 1042 emit(pc, e); 1043} 1044 1045static void 1046emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1047 struct nv50_reg *src1, struct nv50_reg *src2) 1048{ 1049 struct nv50_program_exec *e = exec(pc); 1050 1051 e->inst[0] |= 0xe0000000; 1052 1053 check_swap_src_0_1(pc, &src0, &src1); 1054 set_dst(pc, dst, e); 1055 
set_src_0(pc, src0, e); 1056 set_src_1(pc, src1, e); 1057 set_src_2(pc, src2, e); 1058 1059 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) 1060 e->inst[1] |= 0x04000000; 1061 if (src2->mod & NV50_MOD_NEG) 1062 e->inst[1] |= 0x08000000; 1063 1064 emit(pc, e); 1065} 1066 1067static INLINE void 1068emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1069 struct nv50_reg *src1, struct nv50_reg *src2) 1070{ 1071 src2->mod ^= NV50_MOD_NEG; 1072 emit_mad(pc, dst, src0, src1, src2); 1073 src2->mod ^= NV50_MOD_NEG; 1074} 1075 1076#define NV50_FLOP_RCP 0 1077#define NV50_FLOP_RSQ 2 1078#define NV50_FLOP_LG2 3 1079#define NV50_FLOP_SIN 4 1080#define NV50_FLOP_COS 5 1081#define NV50_FLOP_EX2 6 1082 1083/* rcp, rsqrt, lg2 support neg and abs */ 1084static void 1085emit_flop(struct nv50_pc *pc, unsigned sub, 1086 struct nv50_reg *dst, struct nv50_reg *src) 1087{ 1088 struct nv50_program_exec *e = exec(pc); 1089 1090 e->inst[0] |= 0x90000000; 1091 if (sub || src->mod) { 1092 set_long(pc, e); 1093 e->inst[1] |= (sub << 29); 1094 } 1095 1096 set_dst(pc, dst, e); 1097 set_src_0_restricted(pc, src, e); 1098 1099 assert(!src->mod || sub < 4); 1100 1101 if (src->mod & NV50_MOD_NEG) 1102 e->inst[1] |= 0x04000000; 1103 if (src->mod & NV50_MOD_ABS) 1104 e->inst[1] |= 0x00100000; 1105 1106 emit(pc, e); 1107} 1108 1109static void 1110emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1111{ 1112 struct nv50_program_exec *e = exec(pc); 1113 1114 e->inst[0] |= 0xb0000000; 1115 1116 set_dst(pc, dst, e); 1117 set_src_0(pc, src, e); 1118 set_long(pc, e); 1119 e->inst[1] |= (6 << 29) | 0x00004000; 1120 1121 if (src->mod & NV50_MOD_NEG) 1122 e->inst[1] |= 0x04000000; 1123 if (src->mod & NV50_MOD_ABS) 1124 e->inst[1] |= 0x00100000; 1125 1126 emit(pc, e); 1127} 1128 1129static void 1130emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1131{ 1132 struct nv50_program_exec *e = exec(pc); 1133 1134 e->inst[0] |= 0xb0000000; 1135 
1136 set_dst(pc, dst, e); 1137 set_src_0(pc, src, e); 1138 set_long(pc, e); 1139 e->inst[1] |= (6 << 29); 1140 1141 if (src->mod & NV50_MOD_NEG) 1142 e->inst[1] |= 0x04000000; 1143 if (src->mod & NV50_MOD_ABS) 1144 e->inst[1] |= 0x00100000; 1145 1146 emit(pc, e); 1147} 1148 1149#define CVT_RN (0x00 << 16) 1150#define CVT_FLOOR (0x02 << 16) 1151#define CVT_CEIL (0x04 << 16) 1152#define CVT_TRUNC (0x06 << 16) 1153#define CVT_SAT (0x08 << 16) 1154#define CVT_ABS (0x10 << 16) 1155 1156#define CVT_X32_X32 0x04004000 1157#define CVT_X32_S32 0x04014000 1158#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) 1159#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) 1160#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) 1161#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) 1162#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) 1163#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) 1164#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) 1165#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) 1166 1167#define CVT_NEG 0x20000000 1168#define CVT_RI 0x08000000 1169 1170static void 1171emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 1172 int wp, uint32_t cvn) 1173{ 1174 struct nv50_program_exec *e; 1175 1176 e = exec(pc); 1177 1178 if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; 1179 if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS; 1180 1181 e->inst[0] = 0xa0000000; 1182 e->inst[1] = cvn; 1183 set_long(pc, e); 1184 set_src_0(pc, src, e); 1185 1186 if (wp >= 0) 1187 set_pred_wr(pc, 1, wp, e); 1188 1189 if (dst) 1190 set_dst(pc, dst, e); 1191 else { 1192 e->inst[0] |= 0x000001fc; 1193 e->inst[1] |= 0x00000008; 1194 } 1195 1196 emit(pc, e); 1197} 1198 1199/* nv50 Condition codes: 1200 * 0x1 = LT 1201 * 0x2 = EQ 1202 * 0x3 = LE 1203 * 0x4 = GT 1204 * 0x5 = NE 1205 * 0x6 = GE 1206 * 0x7 = set condition code ? 
(used before bra.lt/le/gt/ge) 1207 * 0x8 = unordered bit (allows NaN) 1208 */ 1209static void 1210emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, 1211 struct nv50_reg *src0, struct nv50_reg *src1) 1212{ 1213 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 1214 1215 struct nv50_program_exec *e = exec(pc); 1216 struct nv50_reg *rdst; 1217 1218 assert(ccode < 16); 1219 if (check_swap_src_0_1(pc, &src0, &src1)) 1220 ccode = cc_swapped[ccode & 7] | (ccode & 8); 1221 1222 rdst = dst; 1223 if (dst && dst->type != P_TEMP) 1224 dst = alloc_temp(pc, NULL); 1225 1226 /* set.u32 */ 1227 set_long(pc, e); 1228 e->inst[0] |= 0xb0000000; 1229 e->inst[1] |= 0x60000000 | (ccode << 14); 1230 1231 /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but 1232 * that doesn't seem to match what the hw actually does 1233 e->inst[1] |= 0x04000000; << breaks things, u32 by default ? 1234 */ 1235 1236 if (wp >= 0) 1237 set_pred_wr(pc, 1, wp, e); 1238 if (dst) 1239 set_dst(pc, dst, e); 1240 else { 1241 e->inst[0] |= 0x000001fc; 1242 e->inst[1] |= 0x00000008; 1243 } 1244 1245 set_src_0(pc, src0, e); 1246 set_src_1(pc, src1, e); 1247 1248 emit(pc, e); 1249 1250 /* cvt.f32.u32/s32 (?) 
if we didn't only write the predicate */ 1251 if (rdst) 1252 emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); 1253 if (rdst && rdst != dst) 1254 free_temp(pc, dst); 1255} 1256 1257static INLINE unsigned 1258map_tgsi_setop_cc(unsigned op) 1259{ 1260 switch (op) { 1261 case TGSI_OPCODE_SLT: return 0x1; 1262 case TGSI_OPCODE_SGE: return 0x6; 1263 case TGSI_OPCODE_SEQ: return 0x2; 1264 case TGSI_OPCODE_SGT: return 0x4; 1265 case TGSI_OPCODE_SLE: return 0x3; 1266 case TGSI_OPCODE_SNE: return 0xd; 1267 default: 1268 assert(0); 1269 return 0; 1270 } 1271} 1272 1273static INLINE void 1274emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1275{ 1276 emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); 1277} 1278 1279static void 1280emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 1281 struct nv50_reg *v, struct nv50_reg *e) 1282{ 1283 struct nv50_reg *temp = alloc_temp(pc, NULL); 1284 1285 emit_flop(pc, NV50_FLOP_LG2, temp, v); 1286 emit_mul(pc, temp, temp, e); 1287 emit_preex2(pc, temp, temp); 1288 emit_flop(pc, NV50_FLOP_EX2, dst, temp); 1289 1290 free_temp(pc, temp); 1291} 1292 1293static INLINE void 1294emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1295{ 1296 emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); 1297} 1298 1299static void 1300emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1301 struct nv50_reg **src) 1302{ 1303 struct nv50_reg *one = alloc_immd(pc, 1.0); 1304 struct nv50_reg *zero = alloc_immd(pc, 0.0); 1305 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 1306 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 1307 struct nv50_reg *tmp[4]; 1308 boolean allow32 = pc->allow32; 1309 1310 pc->allow32 = FALSE; 1311 1312 if (mask & (3 << 1)) { 1313 tmp[0] = alloc_temp(pc, NULL); 1314 emit_minmax(pc, 4, tmp[0], src[0], zero); 1315 } 1316 1317 if (mask & (1 << 2)) { 1318 set_pred_wr(pc, 1, 0, pc->p->exec_tail); 1319 1320 tmp[1] = temp_temp(pc); 1321 emit_minmax(pc, 4, 
tmp[1], src[1], zero); 1322 1323 tmp[3] = temp_temp(pc); 1324 emit_minmax(pc, 4, tmp[3], src[3], neg128); 1325 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 1326 1327 emit_pow(pc, dst[2], tmp[1], tmp[3]); 1328 emit_mov(pc, dst[2], zero); 1329 set_pred(pc, 3, 0, pc->p->exec_tail); 1330 } 1331 1332 if (mask & (1 << 1)) 1333 assimilate_temp(pc, dst[1], tmp[0]); 1334 else 1335 if (mask & (1 << 2)) 1336 free_temp(pc, tmp[0]); 1337 1338 pc->allow32 = allow32; 1339 1340 /* do this last, in case src[i,j] == dst[0,3] */ 1341 if (mask & (1 << 0)) 1342 emit_mov(pc, dst[0], one); 1343 1344 if (mask & (1 << 3)) 1345 emit_mov(pc, dst[3], one); 1346 1347 FREE(pos128); 1348 FREE(neg128); 1349 FREE(zero); 1350 FREE(one); 1351} 1352 1353static void 1354emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 1355{ 1356 struct nv50_program_exec *e; 1357 const int r_pred = 1; 1358 1359 e = exec(pc); 1360 e->inst[0] = 0x00000002; /* discard */ 1361 set_long(pc, e); /* sets cond code to ALWAYS */ 1362 1363 if (src) { 1364 set_pred(pc, 0x1 /* cc = LT */, r_pred, e); 1365 /* write to predicate reg */ 1366 emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32); 1367 } 1368 1369 emit(pc, e); 1370} 1371 1372static struct nv50_program_exec * 1373emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc) 1374{ 1375 struct nv50_program_exec *e = exec(pc); 1376 1377 e->inst[0] = (op << 28) | 2; 1378 set_long(pc, e); 1379 if (pred >= 0) 1380 set_pred(pc, cc, pred, e); 1381 1382 emit(pc, e); 1383 return e; 1384} 1385 1386static INLINE struct nv50_program_exec * 1387emit_breakaddr(struct nv50_pc *pc) 1388{ 1389 return emit_control_flow(pc, 0x4, -1, 0); 1390} 1391 1392static INLINE void 1393emit_break(struct nv50_pc *pc, int pred, unsigned cc) 1394{ 1395 emit_control_flow(pc, 0x5, pred, cc); 1396} 1397 1398static INLINE struct nv50_program_exec * 1399emit_joinat(struct nv50_pc *pc) 1400{ 1401 return emit_control_flow(pc, 0xa, -1, 0); 1402} 1403 1404static INLINE struct nv50_program_exec * 
1405emit_branch(struct nv50_pc *pc, int pred, unsigned cc) 1406{ 1407 return emit_control_flow(pc, 0x1, pred, cc); 1408} 1409 1410static INLINE struct nv50_program_exec * 1411emit_call(struct nv50_pc *pc, int pred, unsigned cc) 1412{ 1413 return emit_control_flow(pc, 0x2, pred, cc); 1414} 1415 1416static INLINE void 1417emit_ret(struct nv50_pc *pc, int pred, unsigned cc) 1418{ 1419 emit_control_flow(pc, 0x3, pred, cc); 1420} 1421 1422#define QOP_ADD 0 1423#define QOP_SUBR 1 1424#define QOP_SUB 2 1425#define QOP_MOV_SRC1 3 1426 1427/* For a quad of threads / top left, top right, bottom left, bottom right 1428 * pixels, do a different operation, and take src0 from a specific thread. 1429 */ 1430static void 1431emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0, 1432 struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop) 1433{ 1434 struct nv50_program_exec *e = exec(pc); 1435 1436 e->inst[0] = 0xc0000000; 1437 e->inst[1] = 0x80000000; 1438 set_long(pc, e); 1439 e->inst[0] |= lane_src0 << 16; 1440 set_src_0(pc, src0, e); 1441 set_src_2(pc, src1, e); 1442 1443 if (wp >= 0) 1444 set_pred_wr(pc, 1, wp, e); 1445 1446 if (dst) 1447 set_dst(pc, dst, e); 1448 else { 1449 e->inst[0] |= 0x000001fc; 1450 e->inst[1] |= 0x00000008; 1451 } 1452 1453 e->inst[0] |= (qop & 3) << 20; 1454 e->inst[1] |= (qop >> 2) << 22; 1455 1456 emit(pc, e); 1457} 1458 1459static void 1460load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], 1461 struct nv50_reg **src, unsigned arg, boolean proj) 1462{ 1463 int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; 1464 1465 src[0]->mod |= NV50_MOD_ABS; 1466 src[1]->mod |= NV50_MOD_ABS; 1467 src[2]->mod |= NV50_MOD_ABS; 1468 1469 emit_minmax(pc, 4, t[2], src[0], src[1]); 1470 emit_minmax(pc, 4, t[2], src[2], t[2]); 1471 1472 src[0]->mod = mod[0]; 1473 src[1]->mod = mod[1]; 1474 src[2]->mod = mod[2]; 1475 1476 if (proj && 0 /* looks more correct without this */) 1477 emit_mul(pc, t[2], t[2], src[3]); 1478 else 1479 if 
(arg == 4) /* there is no textureProj(samplerCubeShadow) */ 1480 emit_mov(pc, t[3], src[3]); 1481 1482 emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]); 1483 1484 emit_mul(pc, t[0], src[0], t[2]); 1485 emit_mul(pc, t[1], src[1], t[2]); 1486 emit_mul(pc, t[2], src[2], t[2]); 1487} 1488 1489static void 1490load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], 1491 struct nv50_reg **src, unsigned dim, unsigned arg) 1492{ 1493 unsigned c, mode; 1494 1495 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1496 mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE; 1497 1498 t[3]->rhw = src[3]->rhw; 1499 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1500 emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]); 1501 1502 for (c = 0; c < dim; ++c) { 1503 t[c]->rhw = src[c]->rhw; 1504 emit_interp(pc, t[c], t[3], mode); 1505 } 1506 if (arg != dim) { /* depth reference value */ 1507 t[dim]->rhw = src[2]->rhw; 1508 emit_interp(pc, t[dim], t[3], mode); 1509 } 1510 } else { 1511 /* XXX: for some reason the blob sometimes uses MAD 1512 * (mad f32 $rX $rY $rZ neg $r63) 1513 */ 1514 emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]); 1515 for (c = 0; c < dim; ++c) 1516 emit_mul(pc, t[c], src[c], t[3]); 1517 if (arg != dim) /* depth reference value */ 1518 emit_mul(pc, t[dim], src[2], t[3]); 1519 } 1520} 1521 1522static INLINE void 1523get_tex_dim(unsigned type, unsigned *dim, unsigned *arg) 1524{ 1525 switch (type) { 1526 case TGSI_TEXTURE_1D: 1527 *arg = *dim = 1; 1528 break; 1529 case TGSI_TEXTURE_SHADOW1D: 1530 *dim = 1; 1531 *arg = 2; 1532 break; 1533 case TGSI_TEXTURE_UNKNOWN: 1534 case TGSI_TEXTURE_2D: 1535 case TGSI_TEXTURE_RECT: 1536 *arg = *dim = 2; 1537 break; 1538 case TGSI_TEXTURE_SHADOW2D: 1539 case TGSI_TEXTURE_SHADOWRECT: 1540 *dim = 2; 1541 *arg = 3; 1542 break; 1543 case TGSI_TEXTURE_3D: 1544 case TGSI_TEXTURE_CUBE: 1545 *dim = *arg = 3; 1546 break; 1547 default: 1548 assert(0); 1549 break; 1550 } 1551} 1552 1553/* We shouldn't execute TEXLOD if any of the pixels in a 
quad have 1554 * different LOD values, so branch off groups of equal LOD. 1555 */ 1556static void 1557emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod, 1558 struct nv50_reg *src, struct nv50_program_exec *tex) 1559{ 1560 struct nv50_program_exec *join_at; 1561 unsigned i, target = pc->p->exec_size + 9 * 2; 1562 1563 if (pc->p->type != PIPE_SHADER_FRAGMENT) { 1564 emit(pc, tex); 1565 return; 1566 } 1567 pc->allow32 = FALSE; 1568 1569 /* Subtract lod of each pixel from lod of top left pixel, jump 1570 * texlod insn if result is 0, then repeat for 2 other pixels. 1571 */ 1572 join_at = emit_joinat(pc); 1573 emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55); 1574 emit_branch(pc, 0, 2)->param.index = target; 1575 1576 for (i = 1; i < 4; ++i) { 1577 emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55); 1578 emit_branch(pc, 0, 2)->param.index = target; 1579 } 1580 1581 emit_mov(pc, tlod, src); /* target */ 1582 emit(pc, tex); /* texlod */ 1583 1584 join_at->param.index = target + 2 * 2; 1585 JOIN_ON(emit_nop(pc)); /* join _after_ tex */ 1586} 1587 1588static void 1589emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg, 1590 struct nv50_program_exec *tex) 1591{ 1592 struct nv50_program_exec *e; 1593 struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL); 1594 int r_pred = 0; 1595 unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 }; 1596 1597 pc->allow32 = FALSE; 1598 ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4); 1599 1600 /* Subtract bias value of thread i from bias values of each thread, 1601 * store result in r_pred, and set bit i in r_bits if result was 0. 
1602 */ 1603 assert(arg < 4); 1604 for (i = 0; i < 4; ++i, ++imm_1248.hw) { 1605 emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55); 1606 emit_mov(pc, r_bits, &imm_1248); 1607 set_pred(pc, 2, r_pred, pc->p->exec_tail); 1608 } 1609 emit_mov_to_pred(pc, r_pred, r_bits); 1610 1611 /* The lanes of a quad are now grouped by the bit in r_pred they have 1612 * set. Put the input values for TEX into a new register set for each 1613 * group and execute TEX only for a specific group. 1614 * We cannot use the same register set for each group because we need 1615 * the derivatives, which are implicitly calculated, to be correct. 1616 */ 1617 for (i = 1; i < 4; ++i) { 1618 alloc_temp4(pc, t123[i], 0); 1619 1620 for (c = 0; c <= arg; ++c) 1621 emit_mov(pc, t123[i][c], t[c]); 1622 1623 *(e = exec(pc)) = *(tex); 1624 e->inst[0] &= ~0x01fc; 1625 set_dst(pc, t123[i][0], e); 1626 set_pred(pc, cc[i], r_pred, e); 1627 emit(pc, e); 1628 } 1629 /* finally TEX on the original regs (where we kept the input) */ 1630 set_pred(pc, cc[0], r_pred, tex); 1631 emit(pc, tex); 1632 1633 /* put the 3 * n other results into regs for lane 0 */ 1634 n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc)); 1635 for (i = 1; i < 4; ++i) { 1636 for (c = 0; c < n; ++c) { 1637 emit_mov(pc, t[c], t123[i][c]); 1638 set_pred(pc, cc[i], r_pred, pc->p->exec_tail); 1639 } 1640 free_temp4(pc, t123[i]); 1641 } 1642 1643 emit_nop(pc); 1644 free_temp(pc, r_bits); 1645} 1646 1647static void 1648emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1649 struct nv50_reg **src, unsigned unit, unsigned type, 1650 boolean proj, int bias_lod) 1651{ 1652 struct nv50_reg *t[4]; 1653 struct nv50_program_exec *e; 1654 unsigned c, dim, arg; 1655 1656 /* t[i] must be within a single 128 bit super-reg */ 1657 alloc_temp4(pc, t, 0); 1658 1659 e = exec(pc); 1660 e->inst[0] = 0xf0000000; 1661 set_long(pc, e); 1662 set_dst(pc, t[0], e); 1663 1664 /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED 
= TRUE): */ 1665 e->inst[0] |= (unit << 9) /* | (unit << 17) */; 1666 1667 /* live flag (don't set if TEX results affect input to another TEX): */ 1668 /* e->inst[0] |= 0x00000004; */ 1669 1670 get_tex_dim(type, &dim, &arg); 1671 1672 if (type == TGSI_TEXTURE_CUBE) { 1673 e->inst[0] |= 0x08000000; 1674 load_cube_tex_coords(pc, t, src, arg, proj); 1675 } else 1676 if (proj) 1677 load_proj_tex_coords(pc, t, src, dim, arg); 1678 else { 1679 for (c = 0; c < dim; c++) 1680 emit_mov(pc, t[c], src[c]); 1681 if (arg != dim) /* depth reference value (always src.z here) */ 1682 emit_mov(pc, t[dim], src[2]); 1683 } 1684 1685 e->inst[0] |= (mask & 0x3) << 25; 1686 e->inst[1] |= (mask & 0xc) << 12; 1687 1688 if (!bias_lod) { 1689 e->inst[0] |= (arg - 1) << 22; 1690 emit(pc, e); 1691 } else 1692 if (bias_lod < 0) { 1693 assert(pc->p->type == PIPE_SHADER_FRAGMENT); 1694 e->inst[0] |= arg << 22; 1695 e->inst[1] |= 0x20000000; /* texbias */ 1696 emit_mov(pc, t[arg], src[3]); 1697 emit_texbias_sequence(pc, t, arg, e); 1698 } else { 1699 e->inst[0] |= arg << 22; 1700 e->inst[1] |= 0x40000000; /* texlod */ 1701 emit_mov(pc, t[arg], src[3]); 1702 emit_texlod_sequence(pc, t[arg], src[3], e); 1703 } 1704 1705#if 1 1706 c = 0; 1707 if (mask & 1) emit_mov(pc, dst[0], t[c++]); 1708 if (mask & 2) emit_mov(pc, dst[1], t[c++]); 1709 if (mask & 4) emit_mov(pc, dst[2], t[c++]); 1710 if (mask & 8) emit_mov(pc, dst[3], t[c]); 1711 1712 free_temp4(pc, t); 1713#else 1714 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1715 * the texture coordinates, not the fetched values: latency ? */ 1716 1717 for (c = 0; c < 4; c++) { 1718 if (mask & (1 << c)) 1719 assimilate_temp(pc, dst[c], t[c]); 1720 else 1721 free_temp(pc, t[c]); 1722 } 1723#endif 1724} 1725 1726static void 1727emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1728{ 1729 struct nv50_program_exec *e = exec(pc); 1730 1731 assert(src->type == P_TEMP); 1732 1733 e->inst[0] = (src->mod & NV50_MOD_NEG) ? 
0xc0240000 : 0xc0140000; 1734 e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000; 1735 set_long(pc, e); 1736 set_dst(pc, dst, e); 1737 set_src_0(pc, src, e); 1738 set_src_2(pc, src, e); 1739 1740 emit(pc, e); 1741} 1742 1743static void 1744emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1745{ 1746 struct nv50_program_exec *e = exec(pc); 1747 1748 assert(src->type == P_TEMP); 1749 1750 e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000; 1751 e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000; 1752 set_long(pc, e); 1753 set_dst(pc, dst, e); 1754 set_src_0(pc, src, e); 1755 set_src_2(pc, src, e); 1756 1757 emit(pc, e); 1758} 1759 1760static void 1761convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) 1762{ 1763 unsigned q = 0, m = ~0; 1764 1765 assert(!is_long(e)); 1766 1767 switch (e->inst[0] >> 28) { 1768 case 0x1: 1769 /* MOV */ 1770 q = 0x0403c000; 1771 m = 0xffff7fff; 1772 break; 1773 case 0x8: 1774 /* INTERP (move centroid, perspective and flat bits) */ 1775 m = ~0x03000100; 1776 q = (e->inst[0] & (3 << 24)) >> (24 - 16); 1777 q |= (e->inst[0] & (1 << 8)) << (18 - 8); 1778 break; 1779 case 0x9: 1780 /* RCP */ 1781 break; 1782 case 0xB: 1783 /* ADD */ 1784 m = ~(127 << 16); 1785 q = ((e->inst[0] & (~m)) >> 2); 1786 break; 1787 case 0xC: 1788 /* MUL */ 1789 m = ~0x00008000; 1790 q = ((e->inst[0] & (~m)) << 12); 1791 break; 1792 case 0xE: 1793 /* MAD (if src2 == dst) */ 1794 q = ((e->inst[0] & 0x1fc) << 12); 1795 break; 1796 default: 1797 assert(0); 1798 break; 1799 } 1800 1801 set_long(pc, e); 1802 pc->p->exec_size++; 1803 1804 e->inst[0] &= m; 1805 e->inst[1] |= q; 1806} 1807 1808/* Some operations support an optional negation flag. 
*/ 1809static int 1810get_supported_mods(const struct tgsi_full_instruction *insn, int i) 1811{ 1812 switch (insn->Instruction.Opcode) { 1813 case TGSI_OPCODE_ADD: 1814 case TGSI_OPCODE_COS: 1815 case TGSI_OPCODE_DDX: 1816 case TGSI_OPCODE_DDY: 1817 case TGSI_OPCODE_DP3: 1818 case TGSI_OPCODE_DP4: 1819 case TGSI_OPCODE_EX2: 1820 case TGSI_OPCODE_KIL: 1821 case TGSI_OPCODE_LG2: 1822 case TGSI_OPCODE_MAD: 1823 case TGSI_OPCODE_MUL: 1824 case TGSI_OPCODE_POW: 1825 case TGSI_OPCODE_RCP: 1826 case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */ 1827 case TGSI_OPCODE_SCS: 1828 case TGSI_OPCODE_SIN: 1829 case TGSI_OPCODE_SUB: 1830 return NV50_MOD_NEG; 1831 case TGSI_OPCODE_MAX: 1832 case TGSI_OPCODE_MIN: 1833 case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */ 1834 return NV50_MOD_ABS; 1835 case TGSI_OPCODE_CEIL: 1836 case TGSI_OPCODE_FLR: 1837 case TGSI_OPCODE_TRUNC: 1838 return NV50_MOD_NEG | NV50_MOD_ABS; 1839 case TGSI_OPCODE_F2I: 1840 case TGSI_OPCODE_F2U: 1841 case TGSI_OPCODE_I2F: 1842 case TGSI_OPCODE_U2F: 1843 return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32; 1844 default: 1845 return 0; 1846 } 1847} 1848 1849/* Return a read mask for source registers deduced from opcode & write mask. */ 1850static unsigned 1851nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1852{ 1853 unsigned x, mask = insn->Dst[0].Register.WriteMask; 1854 1855 switch (insn->Instruction.Opcode) { 1856 case TGSI_OPCODE_COS: 1857 case TGSI_OPCODE_SIN: 1858 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1859 case TGSI_OPCODE_DP3: 1860 return 0x7; 1861 case TGSI_OPCODE_DP4: 1862 case TGSI_OPCODE_DPH: 1863 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1864 return 0xf; 1865 case TGSI_OPCODE_DST: 1866 return mask & (c ? 
0xa : 0x6); 1867 case TGSI_OPCODE_EX2: 1868 case TGSI_OPCODE_EXP: 1869 case TGSI_OPCODE_LG2: 1870 case TGSI_OPCODE_LOG: 1871 case TGSI_OPCODE_POW: 1872 case TGSI_OPCODE_RCP: 1873 case TGSI_OPCODE_RSQ: 1874 case TGSI_OPCODE_SCS: 1875 return 0x1; 1876 case TGSI_OPCODE_IF: 1877 return 0x1; 1878 case TGSI_OPCODE_LIT: 1879 return 0xb; 1880 case TGSI_OPCODE_TEX: 1881 case TGSI_OPCODE_TXB: 1882 case TGSI_OPCODE_TXL: 1883 case TGSI_OPCODE_TXP: 1884 { 1885 const struct tgsi_instruction_texture *tex; 1886 1887 assert(insn->Instruction.Texture); 1888 tex = &insn->Texture; 1889 1890 mask = 0x7; 1891 if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && 1892 insn->Instruction.Opcode != TGSI_OPCODE_TXD) 1893 mask |= 0x8; /* bias, lod or proj */ 1894 1895 switch (tex->Texture) { 1896 case TGSI_TEXTURE_1D: 1897 mask &= 0x9; 1898 break; 1899 case TGSI_TEXTURE_SHADOW1D: 1900 mask &= 0x5; 1901 break; 1902 case TGSI_TEXTURE_2D: 1903 mask &= 0xb; 1904 break; 1905 default: 1906 break; 1907 } 1908 } 1909 return mask; 1910 case TGSI_OPCODE_XPD: 1911 x = 0; 1912 if (mask & 1) x |= 0x6; 1913 if (mask & 2) x |= 0x5; 1914 if (mask & 4) x |= 0x3; 1915 return x; 1916 default: 1917 break; 1918 } 1919 1920 return mask; 1921} 1922 1923static struct nv50_reg * 1924tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) 1925{ 1926 switch (dst->Register.File) { 1927 case TGSI_FILE_TEMPORARY: 1928 return &pc->temp[dst->Register.Index * 4 + c]; 1929 case TGSI_FILE_OUTPUT: 1930 return &pc->result[dst->Register.Index * 4 + c]; 1931 case TGSI_FILE_ADDRESS: 1932 { 1933 struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; 1934 if (!r) { 1935 r = alloc_addr(pc, NULL); 1936 pc->addr[dst->Register.Index * 4 + c] = r; 1937 } 1938 assert(r); 1939 return r; 1940 } 1941 case TGSI_FILE_NULL: 1942 return NULL; 1943 default: 1944 break; 1945 } 1946 1947 return NULL; 1948} 1949 1950static struct nv50_reg * 1951tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, 
1952 int mod) 1953{ 1954 struct nv50_reg *r = NULL; 1955 struct nv50_reg *temp = NULL; 1956 unsigned sgn, c, swz, cvn; 1957 1958 if (src->Register.File != TGSI_FILE_CONSTANT) 1959 assert(!src->Register.Indirect); 1960 1961 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); 1962 1963 c = tgsi_util_get_full_src_register_swizzle(src, chan); 1964 switch (c) { 1965 case TGSI_SWIZZLE_X: 1966 case TGSI_SWIZZLE_Y: 1967 case TGSI_SWIZZLE_Z: 1968 case TGSI_SWIZZLE_W: 1969 switch (src->Register.File) { 1970 case TGSI_FILE_INPUT: 1971 r = &pc->attr[src->Register.Index * 4 + c]; 1972 break; 1973 case TGSI_FILE_TEMPORARY: 1974 r = &pc->temp[src->Register.Index * 4 + c]; 1975 break; 1976 case TGSI_FILE_CONSTANT: 1977 if (!src->Register.Indirect) { 1978 r = &pc->param[src->Register.Index * 4 + c]; 1979 break; 1980 } 1981 /* Indicate indirection by setting r->acc < 0 and 1982 * use the index field to select the address reg. 1983 */ 1984 r = reg_instance(pc, NULL); 1985 swz = tgsi_util_get_src_register_swizzle( 1986 &src->Indirect, 0); 1987 ctor_reg(r, P_CONST, 1988 src->Indirect.Index * 4 + swz, 1989 src->Register.Index * 4 + c); 1990 r->acc = -1; 1991 break; 1992 case TGSI_FILE_IMMEDIATE: 1993 r = &pc->immd[src->Register.Index * 4 + c]; 1994 break; 1995 case TGSI_FILE_SAMPLER: 1996 return NULL; 1997 case TGSI_FILE_ADDRESS: 1998 r = pc->addr[src->Register.Index * 4 + c]; 1999 assert(r); 2000 break; 2001 default: 2002 assert(0); 2003 break; 2004 } 2005 break; 2006 default: 2007 assert(0); 2008 break; 2009 } 2010 2011 cvn = (mod & NV50_MOD_I32) ? 
CVT_S32_S32 : CVT_F32_F32; 2012 2013 switch (sgn) { 2014 case TGSI_UTIL_SIGN_CLEAR: 2015 r->mod = NV50_MOD_ABS; 2016 break; 2017 case TGSI_UTIL_SIGN_SET: 2018 r->mod = NV50_MOD_NEG_ABS; 2019 break; 2020 case TGSI_UTIL_SIGN_TOGGLE: 2021 r->mod = NV50_MOD_NEG; 2022 break; 2023 default: 2024 assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP); 2025 break; 2026 } 2027 2028 if ((r->mod & mod) != r->mod) { 2029 temp = temp_temp(pc); 2030 emit_cvt(pc, temp, r, -1, cvn); 2031 r->mod = 0; 2032 r = temp; 2033 } else 2034 r->mod |= mod & NV50_MOD_I32; 2035 2036 assert(r); 2037 if (r->acc >= 0 && r != temp) 2038 return reg_instance(pc, r); /* will clear r->mod */ 2039 return r; 2040} 2041 2042/* return TRUE for ops that produce only a single result */ 2043static boolean 2044is_scalar_op(unsigned op) 2045{ 2046 switch (op) { 2047 case TGSI_OPCODE_COS: 2048 case TGSI_OPCODE_DP2: 2049 case TGSI_OPCODE_DP3: 2050 case TGSI_OPCODE_DP4: 2051 case TGSI_OPCODE_DPH: 2052 case TGSI_OPCODE_EX2: 2053 case TGSI_OPCODE_LG2: 2054 case TGSI_OPCODE_POW: 2055 case TGSI_OPCODE_RCP: 2056 case TGSI_OPCODE_RSQ: 2057 case TGSI_OPCODE_SIN: 2058 /* 2059 case TGSI_OPCODE_KIL: 2060 case TGSI_OPCODE_LIT: 2061 case TGSI_OPCODE_SCS: 2062 */ 2063 return TRUE; 2064 default: 2065 return FALSE; 2066 } 2067} 2068 2069/* Returns a bitmask indicating which dst components depend 2070 * on source s, component c (reverse of nv50_tgsi_src_mask). 2071 */ 2072static unsigned 2073nv50_tgsi_dst_revdep(unsigned op, int s, int c) 2074{ 2075 if (is_scalar_op(op)) 2076 return 0x1; 2077 2078 switch (op) { 2079 case TGSI_OPCODE_DST: 2080 return (1 << c) & (s ? 
0xa : 0x6); 2081 case TGSI_OPCODE_XPD: 2082 switch (c) { 2083 case 0: return 0x6; 2084 case 1: return 0x5; 2085 case 2: return 0x3; 2086 case 3: return 0x0; 2087 default: 2088 assert(0); 2089 return 0x0; 2090 } 2091 case TGSI_OPCODE_EXP: 2092 case TGSI_OPCODE_LOG: 2093 case TGSI_OPCODE_LIT: 2094 case TGSI_OPCODE_SCS: 2095 case TGSI_OPCODE_TEX: 2096 case TGSI_OPCODE_TXB: 2097 case TGSI_OPCODE_TXL: 2098 case TGSI_OPCODE_TXP: 2099 /* these take care of dangerous swizzles themselves */ 2100 return 0x0; 2101 case TGSI_OPCODE_IF: 2102 case TGSI_OPCODE_KIL: 2103 /* don't call this function for these ops */ 2104 assert(0); 2105 return 0; 2106 default: 2107 /* linear vector instruction */ 2108 return (1 << c); 2109 } 2110} 2111 2112static INLINE boolean 2113has_pred(struct nv50_program_exec *e, unsigned cc) 2114{ 2115 if (!is_long(e) || is_immd(e)) 2116 return FALSE; 2117 return ((e->inst[1] & 0x780) == (cc << 7)); 2118} 2119 2120/* on ENDIF see if we can do "@p0.neu single_op" instead of: 2121 * join_at ENDIF 2122 * @p0.eq bra ENDIF 2123 * single_op 2124 * ENDIF: nop.join 2125 */ 2126static boolean 2127nv50_kill_branch(struct nv50_pc *pc) 2128{ 2129 int lvl = pc->if_lvl; 2130 2131 if (pc->if_insn[lvl]->next != pc->p->exec_tail) 2132 return FALSE; 2133 if (is_immd(pc->p->exec_tail)) 2134 return FALSE; 2135 2136 /* if ccode == 'true', the BRA is from an ELSE and the predicate 2137 * reg may no longer be valid, since we currently always use $p0 2138 */ 2139 if (has_pred(pc->if_insn[lvl], 0xf)) 2140 return FALSE; 2141 assert(pc->if_insn[lvl] && pc->if_join[lvl]); 2142 2143 /* We'll use the exec allocated for JOIN_AT (we can't easily 2144 * access nv50_program_exec's prev). 
2145 */ 2146 pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ 2147 2148 *pc->if_join[lvl] = *pc->p->exec_tail; 2149 2150 FREE(pc->if_insn[lvl]); 2151 FREE(pc->p->exec_tail); 2152 2153 pc->p->exec_tail = pc->if_join[lvl]; 2154 pc->p->exec_tail->next = NULL; 2155 set_pred(pc, 0xd, 0, pc->p->exec_tail); 2156 2157 return TRUE; 2158} 2159 2160static void 2161nv50_fp_move_results(struct nv50_pc *pc) 2162{ 2163 struct nv50_reg reg; 2164 unsigned i; 2165 2166 ctor_reg(®, P_TEMP, -1, -1); 2167 2168 for (i = 0; i < pc->result_nr * 4; ++i) { 2169 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) 2170 continue; 2171 if (pc->result[i].rhw != pc->result[i].hw) { 2172 reg.hw = pc->result[i].rhw; 2173 emit_mov(pc, ®, &pc->result[i]); 2174 } 2175 } 2176} 2177 2178static boolean 2179nv50_program_tx_insn(struct nv50_pc *pc, 2180 const struct tgsi_full_instruction *inst) 2181{ 2182 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 2183 unsigned mask, sat, unit; 2184 int i, c; 2185 2186 mask = inst->Dst[0].Register.WriteMask; 2187 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 2188 2189 memset(src, 0, sizeof(src)); 2190 2191 for (c = 0; c < 4; c++) { 2192 if ((mask & (1 << c)) && !pc->r_dst[c]) 2193 dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); 2194 else 2195 dst[c] = pc->r_dst[c]; 2196 rdst[c] = dst[c]; 2197 } 2198 2199 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 2200 const struct tgsi_full_src_register *fs = &inst->Src[i]; 2201 unsigned src_mask; 2202 int mod_supp; 2203 2204 src_mask = nv50_tgsi_src_mask(inst, i); 2205 mod_supp = get_supported_mods(inst, i); 2206 2207 if (fs->Register.File == TGSI_FILE_SAMPLER) 2208 unit = fs->Register.Index; 2209 2210 for (c = 0; c < 4; c++) 2211 if (src_mask & (1 << c)) 2212 src[i][c] = tgsi_src(pc, c, fs, mod_supp); 2213 } 2214 2215 brdc = temp = pc->r_brdc; 2216 if (brdc && brdc->type != P_TEMP) { 2217 temp = temp_temp(pc); 2218 if (sat) 2219 brdc = temp; 2220 } else 2221 if (sat) { 2222 for (c = 0; c < 4; c++) { 2223 
if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 2224 continue; 2225 /* rdst[c] = dst[c]; */ /* done above */ 2226 dst[c] = temp_temp(pc); 2227 } 2228 } 2229 2230 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 2231 2232 switch (inst->Instruction.Opcode) { 2233 case TGSI_OPCODE_ABS: 2234 for (c = 0; c < 4; c++) { 2235 if (!(mask & (1 << c))) 2236 continue; 2237 emit_cvt(pc, dst[c], src[0][c], -1, 2238 CVT_ABS | CVT_F32_F32); 2239 } 2240 break; 2241 case TGSI_OPCODE_ADD: 2242 for (c = 0; c < 4; c++) { 2243 if (!(mask & (1 << c))) 2244 continue; 2245 emit_add(pc, dst[c], src[0][c], src[1][c]); 2246 } 2247 break; 2248 case TGSI_OPCODE_AND: 2249 case TGSI_OPCODE_XOR: 2250 case TGSI_OPCODE_OR: 2251 for (c = 0; c < 4; c++) { 2252 if (!(mask & (1 << c))) 2253 continue; 2254 emit_bitop2(pc, dst[c], src[0][c], src[1][c], 2255 inst->Instruction.Opcode); 2256 } 2257 break; 2258 case TGSI_OPCODE_ARL: 2259 assert(src[0][0]); 2260 temp = temp_temp(pc); 2261 emit_cvt(pc, temp, src[0][0], -1, CVT_FLOOR | CVT_S32_F32); 2262 emit_arl(pc, dst[0], temp, 4); 2263 break; 2264 case TGSI_OPCODE_BGNLOOP: 2265 pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); 2266 pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; 2267 terminate_mbb(pc); 2268 break; 2269 case TGSI_OPCODE_BGNSUB: 2270 assert(!pc->in_subroutine); 2271 pc->in_subroutine = TRUE; 2272 /* probably not necessary, but align to 8 byte boundary */ 2273 if (!is_long(pc->p->exec_tail)) 2274 convert_to_long(pc, pc->p->exec_tail); 2275 break; 2276 case TGSI_OPCODE_BRK: 2277 assert(pc->loop_lvl > 0); 2278 emit_break(pc, -1, 0); 2279 break; 2280 case TGSI_OPCODE_CAL: 2281 assert(inst->Label.Label < pc->insn_nr); 2282 emit_call(pc, -1, 0)->param.index = inst->Label.Label; 2283 /* replaced by actual offset in nv50_program_fixup_insns */ 2284 break; 2285 case TGSI_OPCODE_CEIL: 2286 for (c = 0; c < 4; c++) { 2287 if (!(mask & (1 << c))) 2288 continue; 2289 emit_cvt(pc, dst[c], src[0][c], -1, 2290 CVT_CEIL | CVT_F32_F32 | CVT_RI); 2291 
} 2292 break; 2293 case TGSI_OPCODE_CMP: 2294 pc->allow32 = FALSE; 2295 for (c = 0; c < 4; c++) { 2296 if (!(mask & (1 << c))) 2297 continue; 2298 emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); 2299 emit_mov(pc, dst[c], src[1][c]); 2300 set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ 2301 emit_mov(pc, dst[c], src[2][c]); 2302 set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ 2303 } 2304 break; 2305 case TGSI_OPCODE_CONT: 2306 assert(pc->loop_lvl > 0); 2307 emit_branch(pc, -1, 0)->param.index = 2308 pc->loop_pos[pc->loop_lvl - 1]; 2309 break; 2310 case TGSI_OPCODE_COS: 2311 if (mask & 8) { 2312 emit_precossin(pc, temp, src[0][3]); 2313 emit_flop(pc, NV50_FLOP_COS, dst[3], temp); 2314 if (!(mask &= 7)) 2315 break; 2316 if (temp == dst[3]) 2317 temp = brdc = temp_temp(pc); 2318 } 2319 emit_precossin(pc, temp, src[0][0]); 2320 emit_flop(pc, NV50_FLOP_COS, brdc, temp); 2321 break; 2322 case TGSI_OPCODE_DDX: 2323 for (c = 0; c < 4; c++) { 2324 if (!(mask & (1 << c))) 2325 continue; 2326 emit_ddx(pc, dst[c], src[0][c]); 2327 } 2328 break; 2329 case TGSI_OPCODE_DDY: 2330 for (c = 0; c < 4; c++) { 2331 if (!(mask & (1 << c))) 2332 continue; 2333 emit_ddy(pc, dst[c], src[0][c]); 2334 } 2335 break; 2336 case TGSI_OPCODE_DP3: 2337 emit_mul(pc, temp, src[0][0], src[1][0]); 2338 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2339 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 2340 break; 2341 case TGSI_OPCODE_DP4: 2342 emit_mul(pc, temp, src[0][0], src[1][0]); 2343 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2344 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2345 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 2346 break; 2347 case TGSI_OPCODE_DPH: 2348 emit_mul(pc, temp, src[0][0], src[1][0]); 2349 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2350 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2351 emit_add(pc, brdc, src[1][3], temp); 2352 break; 2353 case TGSI_OPCODE_DST: 2354 if (mask & (1 << 1)) 2355 emit_mul(pc, dst[1], src[0][1], src[1][1]); 2356 if (mask & 
(1 << 2)) 2357 emit_mov(pc, dst[2], src[0][2]); 2358 if (mask & (1 << 3)) 2359 emit_mov(pc, dst[3], src[1][3]); 2360 if (mask & (1 << 0)) 2361 emit_mov_immdval(pc, dst[0], 1.0f); 2362 break; 2363 case TGSI_OPCODE_ELSE: 2364 emit_branch(pc, -1, 0); 2365 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2366 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; 2367 terminate_mbb(pc); 2368 break; 2369 case TGSI_OPCODE_ENDIF: 2370 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2371 2372 /* try to replace branch over 1 insn with a predicated insn */ 2373 if (nv50_kill_branch(pc) == TRUE) 2374 break; 2375 2376 if (pc->if_join[pc->if_lvl]) { 2377 pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; 2378 pc->if_join[pc->if_lvl] = NULL; 2379 } 2380 terminate_mbb(pc); 2381 /* emit a NOP as join point, we could set it on the next 2382 * one, but would have to make sure it is long and !immd 2383 */ 2384 JOIN_ON(emit_nop(pc)); 2385 break; 2386 case TGSI_OPCODE_ENDLOOP: 2387 emit_branch(pc, -1, 0)->param.index = 2388 pc->loop_pos[--pc->loop_lvl]; 2389 pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; 2390 terminate_mbb(pc); 2391 break; 2392 case TGSI_OPCODE_ENDSUB: 2393 assert(pc->in_subroutine); 2394 pc->in_subroutine = FALSE; 2395 break; 2396 case TGSI_OPCODE_EX2: 2397 emit_preex2(pc, temp, src[0][0]); 2398 emit_flop(pc, NV50_FLOP_EX2, brdc, temp); 2399 break; 2400 case TGSI_OPCODE_EXP: 2401 { 2402 struct nv50_reg *t[2]; 2403 2404 assert(!temp); 2405 t[0] = temp_temp(pc); 2406 t[1] = temp_temp(pc); 2407 2408 if (mask & 0x6) 2409 emit_mov(pc, t[0], src[0][0]); 2410 if (mask & 0x3) 2411 emit_flr(pc, t[1], src[0][0]); 2412 2413 if (mask & (1 << 1)) 2414 emit_sub(pc, dst[1], t[0], t[1]); 2415 if (mask & (1 << 0)) { 2416 emit_preex2(pc, t[1], t[1]); 2417 emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); 2418 } 2419 if (mask & (1 << 2)) { 2420 emit_preex2(pc, t[0], t[0]); 2421 emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); 2422 } 2423 if (mask & (1 << 3)) 2424 
emit_mov_immdval(pc, dst[3], 1.0f); 2425 } 2426 break; 2427 case TGSI_OPCODE_F2I: 2428 for (c = 0; c < 4; c++) { 2429 if (!(mask & (1 << c))) 2430 continue; 2431 emit_cvt(pc, dst[c], src[0][c], -1, 2432 CVT_TRUNC | CVT_S32_F32); 2433 } 2434 break; 2435 case TGSI_OPCODE_F2U: 2436 for (c = 0; c < 4; c++) { 2437 if (!(mask & (1 << c))) 2438 continue; 2439 emit_cvt(pc, dst[c], src[0][c], -1, 2440 CVT_TRUNC | CVT_U32_F32); 2441 } 2442 break; 2443 case TGSI_OPCODE_FLR: 2444 for (c = 0; c < 4; c++) { 2445 if (!(mask & (1 << c))) 2446 continue; 2447 emit_flr(pc, dst[c], src[0][c]); 2448 } 2449 break; 2450 case TGSI_OPCODE_FRC: 2451 temp = temp_temp(pc); 2452 for (c = 0; c < 4; c++) { 2453 if (!(mask & (1 << c))) 2454 continue; 2455 emit_flr(pc, temp, src[0][c]); 2456 emit_sub(pc, dst[c], src[0][c], temp); 2457 } 2458 break; 2459 case TGSI_OPCODE_I2F: 2460 for (c = 0; c < 4; c++) { 2461 if (!(mask & (1 << c))) 2462 continue; 2463 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); 2464 } 2465 break; 2466 case TGSI_OPCODE_IF: 2467 assert(pc->if_lvl < NV50_MAX_COND_NESTING); 2468 emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); 2469 pc->if_join[pc->if_lvl] = emit_joinat(pc); 2470 pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; 2471 terminate_mbb(pc); 2472 break; 2473 case TGSI_OPCODE_INEG: 2474 for (c = 0; c < 4; c++) { 2475 if (!(mask & (1 << c))) 2476 continue; 2477 emit_cvt(pc, dst[c], src[0][c], -1, 2478 CVT_S32_S32 | CVT_NEG); 2479 } 2480 break; 2481 case TGSI_OPCODE_KIL: 2482 assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); 2483 emit_kil(pc, src[0][0]); 2484 emit_kil(pc, src[0][1]); 2485 emit_kil(pc, src[0][2]); 2486 emit_kil(pc, src[0][3]); 2487 break; 2488 case TGSI_OPCODE_KILP: 2489 emit_kil(pc, NULL); 2490 break; 2491 case TGSI_OPCODE_LIT: 2492 emit_lit(pc, &dst[0], mask, &src[0][0]); 2493 break; 2494 case TGSI_OPCODE_LG2: 2495 emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); 2496 break; 2497 case TGSI_OPCODE_LOG: 2498 { 2499 struct nv50_reg *t[2]; 
2500 2501 t[0] = temp_temp(pc); 2502 if (mask & (1 << 1)) 2503 t[1] = temp_temp(pc); 2504 else 2505 t[1] = t[0]; 2506 2507 emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); 2508 emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); 2509 if (mask & (1 << 2)) 2510 emit_mov(pc, dst[2], t[1]); 2511 emit_flr(pc, t[1], t[1]); 2512 if (mask & (1 << 0)) 2513 emit_mov(pc, dst[0], t[1]); 2514 if (mask & (1 << 1)) { 2515 t[1]->mod = NV50_MOD_NEG; 2516 emit_preex2(pc, t[1], t[1]); 2517 t[1]->mod = 0; 2518 emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); 2519 emit_mul(pc, dst[1], t[0], t[1]); 2520 } 2521 if (mask & (1 << 3)) 2522 emit_mov_immdval(pc, dst[3], 1.0f); 2523 } 2524 break; 2525 case TGSI_OPCODE_LRP: 2526 temp = temp_temp(pc); 2527 for (c = 0; c < 4; c++) { 2528 if (!(mask & (1 << c))) 2529 continue; 2530 emit_sub(pc, temp, src[1][c], src[2][c]); 2531 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 2532 } 2533 break; 2534 case TGSI_OPCODE_MAD: 2535 for (c = 0; c < 4; c++) { 2536 if (!(mask & (1 << c))) 2537 continue; 2538 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 2539 } 2540 break; 2541 case TGSI_OPCODE_MAX: 2542 for (c = 0; c < 4; c++) { 2543 if (!(mask & (1 << c))) 2544 continue; 2545 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 2546 } 2547 break; 2548 case TGSI_OPCODE_MIN: 2549 for (c = 0; c < 4; c++) { 2550 if (!(mask & (1 << c))) 2551 continue; 2552 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 2553 } 2554 break; 2555 case TGSI_OPCODE_MOV: 2556 for (c = 0; c < 4; c++) { 2557 if (!(mask & (1 << c))) 2558 continue; 2559 emit_mov(pc, dst[c], src[0][c]); 2560 } 2561 break; 2562 case TGSI_OPCODE_MUL: 2563 for (c = 0; c < 4; c++) { 2564 if (!(mask & (1 << c))) 2565 continue; 2566 emit_mul(pc, dst[c], src[0][c], src[1][c]); 2567 } 2568 break; 2569 case TGSI_OPCODE_POW: 2570 emit_pow(pc, brdc, src[0][0], src[1][0]); 2571 break; 2572 case TGSI_OPCODE_RCP: 2573 emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); 2574 break; 2575 case TGSI_OPCODE_RET: 2576 if 
(pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) 2577 nv50_fp_move_results(pc); 2578 emit_ret(pc, -1, 0); 2579 break; 2580 case TGSI_OPCODE_RSQ: 2581 src[0][0]->mod |= NV50_MOD_ABS; 2582 emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); 2583 break; 2584 case TGSI_OPCODE_SCS: 2585 temp = temp_temp(pc); 2586 if (mask & 3) 2587 emit_precossin(pc, temp, src[0][0]); 2588 if (mask & (1 << 0)) 2589 emit_flop(pc, NV50_FLOP_COS, dst[0], temp); 2590 if (mask & (1 << 1)) 2591 emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); 2592 if (mask & (1 << 2)) 2593 emit_mov_immdval(pc, dst[2], 0.0); 2594 if (mask & (1 << 3)) 2595 emit_mov_immdval(pc, dst[3], 1.0); 2596 break; 2597 case TGSI_OPCODE_SIN: 2598 if (mask & 8) { 2599 emit_precossin(pc, temp, src[0][3]); 2600 emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); 2601 if (!(mask &= 7)) 2602 break; 2603 if (temp == dst[3]) 2604 temp = brdc = temp_temp(pc); 2605 } 2606 emit_precossin(pc, temp, src[0][0]); 2607 emit_flop(pc, NV50_FLOP_SIN, brdc, temp); 2608 break; 2609 case TGSI_OPCODE_SLT: 2610 case TGSI_OPCODE_SGE: 2611 case TGSI_OPCODE_SEQ: 2612 case TGSI_OPCODE_SGT: 2613 case TGSI_OPCODE_SLE: 2614 case TGSI_OPCODE_SNE: 2615 i = map_tgsi_setop_cc(inst->Instruction.Opcode); 2616 for (c = 0; c < 4; c++) { 2617 if (!(mask & (1 << c))) 2618 continue; 2619 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]); 2620 } 2621 break; 2622 case TGSI_OPCODE_SUB: 2623 for (c = 0; c < 4; c++) { 2624 if (!(mask & (1 << c))) 2625 continue; 2626 emit_sub(pc, dst[c], src[0][c], src[1][c]); 2627 } 2628 break; 2629 case TGSI_OPCODE_TEX: 2630 emit_tex(pc, dst, mask, src[0], unit, 2631 inst->Texture.Texture, FALSE, 0); 2632 break; 2633 case TGSI_OPCODE_TXB: 2634 emit_tex(pc, dst, mask, src[0], unit, 2635 inst->Texture.Texture, FALSE, -1); 2636 break; 2637 case TGSI_OPCODE_TXL: 2638 emit_tex(pc, dst, mask, src[0], unit, 2639 inst->Texture.Texture, FALSE, 1); 2640 break; 2641 case TGSI_OPCODE_TXP: 2642 emit_tex(pc, dst, mask, src[0], unit, 2643 
inst->Texture.Texture, TRUE, 0); 2644 break; 2645 case TGSI_OPCODE_TRUNC: 2646 for (c = 0; c < 4; c++) { 2647 if (!(mask & (1 << c))) 2648 continue; 2649 emit_cvt(pc, dst[c], src[0][c], -1, 2650 CVT_TRUNC | CVT_F32_F32 | CVT_RI); 2651 } 2652 break; 2653 case TGSI_OPCODE_U2F: 2654 for (c = 0; c < 4; c++) { 2655 if (!(mask & (1 << c))) 2656 continue; 2657 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); 2658 } 2659 break; 2660 case TGSI_OPCODE_XPD: 2661 temp = temp_temp(pc); 2662 if (mask & (1 << 0)) { 2663 emit_mul(pc, temp, src[0][2], src[1][1]); 2664 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 2665 } 2666 if (mask & (1 << 1)) { 2667 emit_mul(pc, temp, src[0][0], src[1][2]); 2668 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 2669 } 2670 if (mask & (1 << 2)) { 2671 emit_mul(pc, temp, src[0][1], src[1][0]); 2672 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 2673 } 2674 if (mask & (1 << 3)) 2675 emit_mov_immdval(pc, dst[3], 1.0); 2676 break; 2677 case TGSI_OPCODE_END: 2678 if (pc->p->type == PIPE_SHADER_FRAGMENT) 2679 nv50_fp_move_results(pc); 2680 2681 /* last insn must be long so it can have the exit bit set */ 2682 if (!is_long(pc->p->exec_tail)) 2683 convert_to_long(pc, pc->p->exec_tail); 2684 else 2685 if (is_immd(pc->p->exec_tail) || is_join(pc->p->exec_tail)) 2686 emit_nop(pc); 2687 2688 pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ 2689 break; 2690 default: 2691 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 2692 return FALSE; 2693 } 2694 2695 if (brdc) { 2696 if (sat) 2697 emit_sat(pc, brdc, brdc); 2698 for (c = 0; c < 4; c++) 2699 if ((mask & (1 << c)) && dst[c] != brdc) 2700 emit_mov(pc, dst[c], brdc); 2701 } else 2702 if (sat) { 2703 for (c = 0; c < 4; c++) { 2704 if (!(mask & (1 << c))) 2705 continue; 2706 /* In this case we saturate later, and dst[c] won't 2707 * be another temp_temp (and thus lost), since rdst 2708 * already is TEMP (see above). 
*/ 2709 if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) 2710 continue; 2711 emit_sat(pc, rdst[c], dst[c]); 2712 } 2713 } 2714 2715 kill_temp_temp(pc); 2716 pc->reg_instance_nr = 0; 2717 2718 return TRUE; 2719} 2720 2721static void 2722prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) 2723{ 2724 struct nv50_reg *reg = NULL; 2725 const struct tgsi_full_src_register *src; 2726 const struct tgsi_dst_register *dst; 2727 unsigned i, c, k, mask; 2728 2729 dst = &insn->Dst[0].Register; 2730 mask = dst->WriteMask; 2731 2732 if (dst->File == TGSI_FILE_TEMPORARY) 2733 reg = pc->temp; 2734 else 2735 if (dst->File == TGSI_FILE_OUTPUT) { 2736 reg = pc->result; 2737 2738 if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && 2739 dst->Index == pc->edgeflag_out && 2740 insn->Src[0].Register.File == TGSI_FILE_INPUT) 2741 pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; 2742 } 2743 2744 if (reg) { 2745 for (c = 0; c < 4; c++) { 2746 if (!(mask & (1 << c))) 2747 continue; 2748 reg[dst->Index * 4 + c].acc = pc->insn_nr; 2749 } 2750 } 2751 2752 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 2753 src = &insn->Src[i]; 2754 2755 if (src->Register.File == TGSI_FILE_TEMPORARY) 2756 reg = pc->temp; 2757 else 2758 if (src->Register.File == TGSI_FILE_INPUT) 2759 reg = pc->attr; 2760 else 2761 continue; 2762 2763 mask = nv50_tgsi_src_mask(insn, i); 2764 2765 for (c = 0; c < 4; c++) { 2766 if (!(mask & (1 << c))) 2767 continue; 2768 k = tgsi_util_get_full_src_register_swizzle(src, c); 2769 2770 reg[src->Register.Index * 4 + k].acc = pc->insn_nr; 2771 } 2772 } 2773} 2774 2775/* Returns a bitmask indicating which dst components need to be 2776 * written to temporaries first to avoid 'corrupting' sources. 
/*
 * nv50_revdep_reorder - choose a write order for the four dst components
 * that clobbers as few still-needed sources as possible.
 *
 * m[i]    (out) component to write in the i-th position
 * rdep[c] (in)  bitmask of the dst components that require dst[c] as a source
 *
 * Returns a bitmask of the positions whose component has to be written to a
 * temporary first, because a component written in a later position still
 * reads it.  Bit i refers to the i-th position in m[], not to component i.
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	/* unsafe is only ever OR-ed into below, so it must start out as 0;
	 * without the initializer the return value is indeterminate.
	 */
	unsigned i, c, x, unsafe = 0;

	/* start with the natural order x, y, z, w */
	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1u << m[c]))
			continue; /* if next one is depended on by us */

		for (i = c + 1; i < 4; i++) {
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1u << m[i]))
				break;
		}
		if (i == 4)
			continue; /* nothing written later needs m[c] */

		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart; the loop increment still runs after this, so
		 * the scan resumes at position 1
		 */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i) {
		for (c = i + 1; c < 4; ++c) {
			if (rdep[m[i]] & (1u << m[c]))
				unsafe |= (1u << i);
		}
	}

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}

/* Select a suitable dst register for broadcasting scalar results,
 * or return NULL if we have to allocate an extra TEMP.
 *
 * If e.g. only 1 component is written, we may also emit the final
 * result to a write-only register.
 */
2825 */ 2826static struct nv50_reg * 2827tgsi_broadcast_dst(struct nv50_pc *pc, 2828 const struct tgsi_full_dst_register *fd, unsigned mask) 2829{ 2830 if (fd->Register.File == TGSI_FILE_TEMPORARY) { 2831 int c = ffs(~mask & fd->Register.WriteMask); 2832 if (c) 2833 return tgsi_dst(pc, c - 1, fd); 2834 } else { 2835 int c = ffs(fd->Register.WriteMask) - 1; 2836 if ((1 << c) == fd->Register.WriteMask) 2837 return tgsi_dst(pc, c, fd); 2838 } 2839 2840 return NULL; 2841} 2842 2843/* Scan source swizzles and return a bitmask indicating dst regs that 2844 * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 2845 */ 2846static unsigned 2847nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, 2848 unsigned rdep[4]) 2849{ 2850 const struct tgsi_full_dst_register *fd = &insn->Dst[0]; 2851 const struct tgsi_full_src_register *fs; 2852 unsigned i, deqs = 0; 2853 2854 for (i = 0; i < 4; ++i) 2855 rdep[i] = 0; 2856 2857 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 2858 unsigned chn, mask = nv50_tgsi_src_mask(insn, i); 2859 int ms = get_supported_mods(insn, i); 2860 2861 fs = &insn->Src[i]; 2862 if (fs->Register.File != fd->Register.File || 2863 fs->Register.Index != fd->Register.Index) 2864 continue; 2865 2866 for (chn = 0; chn < 4; ++chn) { 2867 unsigned s, c; 2868 2869 if (!(mask & (1 << chn))) /* src is not read */ 2870 continue; 2871 c = tgsi_util_get_full_src_register_swizzle(fs, chn); 2872 s = tgsi_util_get_full_src_register_sign_mode(fs, chn); 2873 2874 if (!(fd->Register.WriteMask & (1 << c))) 2875 continue; 2876 2877 if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG)) 2878 continue; 2879 if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS)) 2880 continue; 2881 if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3)) 2882 continue; 2883 2884 rdep[c] |= nv50_tgsi_dst_revdep( 2885 insn->Instruction.Opcode, i, chn); 2886 deqs |= (1 << c); 2887 } 2888 } 2889 2890 return deqs; 2891} 2892 2893static boolean 2894nv50_tgsi_insn(struct nv50_pc 
*pc, const union tgsi_full_token *tok) 2895{ 2896 struct tgsi_full_instruction insn = tok->FullInstruction; 2897 const struct tgsi_full_dst_register *fd; 2898 unsigned i, deqs, rdep[4], m[4]; 2899 2900 fd = &tok->FullInstruction.Dst[0]; 2901 deqs = nv50_tgsi_scan_swizzle(&insn, rdep); 2902 2903 if (is_scalar_op(insn.Instruction.Opcode)) { 2904 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); 2905 if (!pc->r_brdc) 2906 pc->r_brdc = temp_temp(pc); 2907 return nv50_program_tx_insn(pc, &insn); 2908 } 2909 pc->r_brdc = NULL; 2910 2911 if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3])) 2912 return nv50_program_tx_insn(pc, &insn); 2913 2914 deqs = nv50_revdep_reorder(m, rdep); 2915 2916 for (i = 0; i < 4; ++i) { 2917 assert(pc->r_dst[m[i]] == NULL); 2918 2919 insn.Dst[0].Register.WriteMask = 2920 fd->Register.WriteMask & (1 << m[i]); 2921 2922 if (!insn.Dst[0].Register.WriteMask) 2923 continue; 2924 2925 if (deqs & (1 << i)) 2926 pc->r_dst[m[i]] = alloc_temp(pc, NULL); 2927 2928 if (!nv50_program_tx_insn(pc, &insn)) 2929 return FALSE; 2930 } 2931 2932 for (i = 0; i < 4; i++) { 2933 struct nv50_reg *reg = pc->r_dst[i]; 2934 if (!reg) 2935 continue; 2936 pc->r_dst[i] = NULL; 2937 2938 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) 2939 emit_sat(pc, tgsi_dst(pc, i, fd), reg); 2940 else 2941 emit_mov(pc, tgsi_dst(pc, i, fd), reg); 2942 free_temp(pc, reg); 2943 } 2944 2945 return TRUE; 2946} 2947 2948static void 2949load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) 2950{ 2951 struct nv50_reg *iv, **ppiv; 2952 unsigned mode = pc->interp_mode[reg->index]; 2953 2954 ppiv = (mode & INTERP_CENTROID) ? 
&pc->iv_c : &pc->iv_p; 2955 iv = *ppiv; 2956 2957 if ((mode & INTERP_PERSPECTIVE) && !iv) { 2958 iv = *ppiv = alloc_temp(pc, NULL); 2959 iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; 2960 2961 emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); 2962 emit_flop(pc, NV50_FLOP_RCP, iv, iv); 2963 2964 /* XXX: when loading interpolants dynamically, move these 2965 * to the program head, or make sure it can't be skipped. 2966 */ 2967 } 2968 2969 emit_interp(pc, reg, iv, mode); 2970} 2971 2972/* The face input is always at v[255] (varying space), with a 2973 * value of 0 for back-facing, and 0xffffffff for front-facing. 2974 */ 2975static void 2976load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a) 2977{ 2978 struct nv50_reg *one = alloc_immd(pc, 1.0f); 2979 2980 assert(a->rhw == -1); 2981 alloc_reg(pc, a); /* do this before rhw is set */ 2982 a->rhw = 255; 2983 load_interpolant(pc, a); 2984 emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND); 2985 2986 FREE(one); 2987} 2988 2989static boolean 2990nv50_program_tx_prep(struct nv50_pc *pc) 2991{ 2992 struct tgsi_parse_context tp; 2993 struct nv50_program *p = pc->p; 2994 boolean ret = FALSE; 2995 unsigned i, c, flat_nr = 0; 2996 2997 tgsi_parse_init(&tp, pc->p->pipe.tokens); 2998 while (!tgsi_parse_end_of_tokens(&tp)) { 2999 const union tgsi_full_token *tok = &tp.FullToken; 3000 3001 tgsi_parse_token(&tp); 3002 switch (tok->Token.Type) { 3003 case TGSI_TOKEN_TYPE_IMMEDIATE: 3004 { 3005 const struct tgsi_full_immediate *imm = 3006 &tp.FullToken.FullImmediate; 3007 3008 ctor_immd_4f32(pc, imm->u[0].Float, 3009 imm->u[1].Float, 3010 imm->u[2].Float, 3011 imm->u[3].Float); 3012 } 3013 break; 3014 case TGSI_TOKEN_TYPE_DECLARATION: 3015 { 3016 const struct tgsi_full_declaration *d; 3017 unsigned si, last, first, mode; 3018 3019 d = &tp.FullToken.FullDeclaration; 3020 first = d->Range.First; 3021 last = d->Range.Last; 3022 3023 switch (d->Declaration.File) { 3024 case TGSI_FILE_TEMPORARY: 3025 break; 3026 case TGSI_FILE_OUTPUT: 3027 
if (!d->Declaration.Semantic || 3028 p->type == PIPE_SHADER_FRAGMENT) 3029 break; 3030 3031 si = d->Semantic.Index; 3032 switch (d->Semantic.Name) { 3033 case TGSI_SEMANTIC_BCOLOR: 3034 p->cfg.two_side[si].hw = first; 3035 if (p->cfg.io_nr > first) 3036 p->cfg.io_nr = first; 3037 break; 3038 case TGSI_SEMANTIC_PSIZE: 3039 p->cfg.psiz = first; 3040 if (p->cfg.io_nr > first) 3041 p->cfg.io_nr = first; 3042 break; 3043 case TGSI_SEMANTIC_EDGEFLAG: 3044 pc->edgeflag_out = first; 3045 break; 3046 /* 3047 case TGSI_SEMANTIC_CLIP_DISTANCE: 3048 p->cfg.clpd = MIN2(p->cfg.clpd, first); 3049 break; 3050 */ 3051 default: 3052 break; 3053 } 3054 break; 3055 case TGSI_FILE_INPUT: 3056 { 3057 if (p->type != PIPE_SHADER_FRAGMENT) 3058 break; 3059 3060 switch (d->Declaration.Interpolate) { 3061 case TGSI_INTERPOLATE_CONSTANT: 3062 mode = INTERP_FLAT; 3063 flat_nr++; 3064 break; 3065 case TGSI_INTERPOLATE_PERSPECTIVE: 3066 mode = INTERP_PERSPECTIVE; 3067 p->cfg.regs[1] |= 0x08 << 24; 3068 break; 3069 default: 3070 mode = INTERP_LINEAR; 3071 break; 3072 } 3073 if (d->Declaration.Centroid) 3074 mode |= INTERP_CENTROID; 3075 3076 assert(last < 32); 3077 for (i = first; i <= last; i++) 3078 pc->interp_mode[i] = mode; 3079 } 3080 break; 3081 case TGSI_FILE_ADDRESS: 3082 case TGSI_FILE_CONSTANT: 3083 case TGSI_FILE_SAMPLER: 3084 break; 3085 default: 3086 NOUVEAU_ERR("bad decl file %d\n", 3087 d->Declaration.File); 3088 goto out_err; 3089 } 3090 } 3091 break; 3092 case TGSI_TOKEN_TYPE_INSTRUCTION: 3093 pc->insn_nr++; 3094 prep_inspect_insn(pc, &tok->FullInstruction); 3095 break; 3096 default: 3097 break; 3098 } 3099 } 3100 3101 if (p->type == PIPE_SHADER_VERTEX) { 3102 int rid = 0; 3103 3104 for (i = 0; i < pc->attr_nr * 4; ++i) { 3105 if (pc->attr[i].acc) { 3106 pc->attr[i].hw = rid++; 3107 p->cfg.attr[i / 32] |= 1 << (i % 32); 3108 } 3109 } 3110 3111 for (i = 0, rid = 0; i < pc->result_nr; ++i) { 3112 p->cfg.io[i].hw = rid; 3113 p->cfg.io[i].id = i; 3114 3115 for (c = 0; c < 4; ++c) { 
3116 int n = i * 4 + c; 3117 if (!pc->result[n].acc) 3118 continue; 3119 pc->result[n].hw = rid++; 3120 p->cfg.io[i].mask |= 1 << c; 3121 } 3122 } 3123 3124 for (c = 0; c < 2; ++c) 3125 if (p->cfg.two_side[c].hw < 0x40) 3126 p->cfg.two_side[c] = p->cfg.io[ 3127 p->cfg.two_side[c].hw]; 3128 3129 if (p->cfg.psiz < 0x40) 3130 p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw; 3131 } else 3132 if (p->type == PIPE_SHADER_FRAGMENT) { 3133 int rid, aid; 3134 unsigned n = 0, m = pc->attr_nr - flat_nr; 3135 3136 pc->allow32 = TRUE; 3137 3138 int base = (TGSI_SEMANTIC_POSITION == 3139 p->info.input_semantic_name[0]) ? 0 : 1; 3140 3141 /* non-flat interpolants have to be mapped to 3142 * the lower hardware IDs, so sort them: 3143 */ 3144 for (i = 0; i < pc->attr_nr; i++) { 3145 if (pc->interp_mode[i] == INTERP_FLAT) 3146 p->cfg.io[m++].id = i; 3147 else { 3148 if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) 3149 p->cfg.io[n].linear = TRUE; 3150 p->cfg.io[n++].id = i; 3151 } 3152 } 3153 3154 if (!base) /* set w-coordinate mask from perspective interp */ 3155 p->cfg.io[0].mask |= p->cfg.regs[1] >> 24; 3156 3157 aid = popcnt4( /* if fcrd isn't contained in cfg.io */ 3158 base ? 
(p->cfg.regs[1] >> 24) : p->cfg.io[0].mask); 3159 3160 for (n = 0; n < pc->attr_nr; ++n) { 3161 p->cfg.io[n].hw = rid = aid; 3162 i = p->cfg.io[n].id; 3163 3164 if (p->info.input_semantic_name[n] == 3165 TGSI_SEMANTIC_FACE) { 3166 load_frontfacing(pc, &pc->attr[i * 4]); 3167 continue; 3168 } 3169 3170 for (c = 0; c < 4; ++c) { 3171 if (!pc->attr[i * 4 + c].acc) 3172 continue; 3173 pc->attr[i * 4 + c].rhw = rid++; 3174 p->cfg.io[n].mask |= 1 << c; 3175 3176 load_interpolant(pc, &pc->attr[i * 4 + c]); 3177 } 3178 aid += popcnt4(p->cfg.io[n].mask); 3179 } 3180 3181 if (!base) 3182 p->cfg.regs[1] |= p->cfg.io[0].mask << 24; 3183 3184 m = popcnt4(p->cfg.regs[1] >> 24); 3185 3186 /* set count of non-position inputs and of non-flat 3187 * non-position inputs for FP_INTERPOLANT_CTRL 3188 */ 3189 p->cfg.regs[1] |= aid - m; 3190 3191 if (flat_nr) { 3192 i = p->cfg.io[pc->attr_nr - flat_nr].hw; 3193 p->cfg.regs[1] |= (i - m) << 16; 3194 } else 3195 p->cfg.regs[1] |= p->cfg.regs[1] << 16; 3196 3197 /* mark color semantic for light-twoside */ 3198 n = 0x40; 3199 for (i = 0; i < pc->attr_nr; i++) { 3200 ubyte si, sn; 3201 3202 sn = p->info.input_semantic_name[p->cfg.io[i].id]; 3203 si = p->info.input_semantic_index[p->cfg.io[i].id]; 3204 3205 if (sn == TGSI_SEMANTIC_COLOR) { 3206 p->cfg.two_side[si] = p->cfg.io[i]; 3207 3208 /* increase colour count */ 3209 p->cfg.regs[0] += popcnt4( 3210 p->cfg.two_side[si].mask) << 16; 3211 3212 n = MIN2(n, p->cfg.io[i].hw - m); 3213 } 3214 } 3215 if (n < 0x40) 3216 p->cfg.regs[0] += n; 3217 3218 /* Initialize FP results: 3219 * FragDepth is always first TGSI and last hw output 3220 */ 3221 i = p->info.writes_z ? 4 : 0; 3222 for (rid = 0; i < pc->result_nr * 4; i++) 3223 pc->result[i].rhw = rid++; 3224 if (p->info.writes_z) 3225 pc->result[2].rhw = rid; 3226 3227 p->cfg.high_result = rid; 3228 3229 /* separate/different colour results for MRTs ? */ 3230 if (pc->result_nr - (p->info.writes_z ? 
1 : 0) > 1) 3231 p->cfg.regs[2] |= 1; 3232 } 3233 3234 if (pc->immd_nr) { 3235 int rid = 0; 3236 3237 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); 3238 if (!pc->immd) 3239 goto out_err; 3240 3241 for (i = 0; i < pc->immd_nr; i++) { 3242 for (c = 0; c < 4; c++, rid++) 3243 ctor_reg(&pc->immd[rid], P_IMMD, i, rid); 3244 } 3245 } 3246 3247 ret = TRUE; 3248out_err: 3249 if (pc->iv_p) 3250 free_temp(pc, pc->iv_p); 3251 if (pc->iv_c) 3252 free_temp(pc, pc->iv_c); 3253 3254 tgsi_parse_free(&tp); 3255 return ret; 3256} 3257 3258static void 3259free_nv50_pc(struct nv50_pc *pc) 3260{ 3261 if (pc->immd) 3262 FREE(pc->immd); 3263 if (pc->param) 3264 FREE(pc->param); 3265 if (pc->result) 3266 FREE(pc->result); 3267 if (pc->attr) 3268 FREE(pc->attr); 3269 if (pc->temp) 3270 FREE(pc->temp); 3271 3272 FREE(pc); 3273} 3274 3275static boolean 3276ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) 3277{ 3278 int i, c; 3279 unsigned rtype[2] = { P_ATTR, P_RESULT }; 3280 3281 pc->p = p; 3282 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; 3283 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; 3284 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; 3285 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; 3286 pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; 3287 assert(pc->addr_nr <= 2); 3288 3289 p->cfg.high_temp = 4; 3290 3291 p->cfg.two_side[0].hw = 0x40; 3292 p->cfg.two_side[1].hw = 0x40; 3293 3294 p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; 3295 3296 switch (p->type) { 3297 case PIPE_SHADER_VERTEX: 3298 p->cfg.psiz = 0x40; 3299 p->cfg.clpd = 0x40; 3300 p->cfg.io_nr = pc->result_nr; 3301 break; 3302 case PIPE_SHADER_FRAGMENT: 3303 rtype[0] = rtype[1] = P_TEMP; 3304 3305 p->cfg.regs[0] = 0x01000004; 3306 p->cfg.io_nr = pc->attr_nr; 3307 3308 if (p->info.writes_z) { 3309 p->cfg.regs[2] |= 0x00000100; 3310 p->cfg.regs[3] |= 0x00000011; 3311 } 3312 if (p->info.uses_kill) 3313 p->cfg.regs[2] |= 0x00100000; 3314 break; 3315 } 3316 3317 
if (pc->temp_nr) { 3318 pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); 3319 if (!pc->temp) 3320 return FALSE; 3321 3322 for (i = 0; i < pc->temp_nr * 4; ++i) 3323 ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); 3324 } 3325 3326 if (pc->attr_nr) { 3327 pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); 3328 if (!pc->attr) 3329 return FALSE; 3330 3331 for (i = 0; i < pc->attr_nr * 4; ++i) 3332 ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); 3333 } 3334 3335 if (pc->result_nr) { 3336 unsigned nr = pc->result_nr * 4; 3337 3338 pc->result = MALLOC(nr * sizeof(struct nv50_reg)); 3339 if (!pc->result) 3340 return FALSE; 3341 3342 for (i = 0; i < nr; ++i) 3343 ctor_reg(&pc->result[i], rtype[1], i / 4, -1); 3344 } 3345 3346 if (pc->param_nr) { 3347 int rid = 0; 3348 3349 pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); 3350 if (!pc->param) 3351 return FALSE; 3352 3353 for (i = 0; i < pc->param_nr; ++i) 3354 for (c = 0; c < 4; ++c, ++rid) 3355 ctor_reg(&pc->param[rid], P_CONST, i, rid); 3356 } 3357 3358 if (pc->addr_nr) { 3359 pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); 3360 if (!pc->addr) 3361 return FALSE; 3362 } 3363 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 3364 ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1); 3365 3366 return TRUE; 3367} 3368 3369static void 3370nv50_program_fixup_insns(struct nv50_pc *pc) 3371{ 3372 struct nv50_program_exec *e, **bra_list; 3373 unsigned i, n, pos; 3374 3375 bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); 3376 3377 /* Collect branch instructions, we need to adjust their offsets 3378 * when converting 32 bit instructions to 64 bit ones 3379 */ 3380 for (n = 0, e = pc->p->exec_head; e; e = e->next) 3381 if (e->param.index >= 0 && !e->param.mask) 3382 bra_list[n++] = e; 3383 3384 /* Make sure we don't have any single 32 bit instructions. */ 3385 for (e = pc->p->exec_head, pos = 0; e; e = e->next) { 3386 pos += is_long(e) ? 
2 : 1; 3387 3388 if ((pos & 1) && (!e->next || is_long(e->next))) { 3389 for (i = 0; i < n; ++i) 3390 if (bra_list[i]->param.index >= pos) 3391 bra_list[i]->param.index += 1; 3392 for (i = 0; i < pc->insn_nr; ++i) 3393 if (pc->insn_pos[i] >= pos) 3394 pc->insn_pos[i] += 1; 3395 convert_to_long(pc, e); 3396 ++pos; 3397 } 3398 } 3399 3400 FREE(bra_list); 3401 3402 if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) 3403 return; 3404 3405 /* fill in CALL offsets */ 3406 for (e = pc->p->exec_head; e; e = e->next) { 3407 if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) 3408 e->param.index = pc->insn_pos[e->param.index]; 3409 } 3410} 3411 3412static boolean 3413nv50_program_tx(struct nv50_program *p) 3414{ 3415 struct tgsi_parse_context parse; 3416 struct nv50_pc *pc; 3417 boolean ret; 3418 3419 pc = CALLOC_STRUCT(nv50_pc); 3420 if (!pc) 3421 return FALSE; 3422 3423 ret = ctor_nv50_pc(pc, p); 3424 if (ret == FALSE) 3425 goto out_cleanup; 3426 3427 ret = nv50_program_tx_prep(pc); 3428 if (ret == FALSE) 3429 goto out_cleanup; 3430 3431 pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); 3432 3433 tgsi_parse_init(&parse, pc->p->pipe.tokens); 3434 while (!tgsi_parse_end_of_tokens(&parse)) { 3435 const union tgsi_full_token *tok = &parse.FullToken; 3436 3437 /* previously allow32 was FALSE for first & last instruction */ 3438 pc->allow32 = TRUE; 3439 3440 tgsi_parse_token(&parse); 3441 3442 switch (tok->Token.Type) { 3443 case TGSI_TOKEN_TYPE_INSTRUCTION: 3444 pc->insn_pos[pc->insn_cur] = pc->p->exec_size; 3445 ++pc->insn_cur; 3446 ret = nv50_tgsi_insn(pc, tok); 3447 if (ret == FALSE) 3448 goto out_err; 3449 break; 3450 default: 3451 break; 3452 } 3453 } 3454 3455 nv50_program_fixup_insns(pc); 3456 3457 p->param_nr = pc->param_nr * 4; 3458 p->immd_nr = pc->immd_nr * 4; 3459 p->immd = pc->immd_buf; 3460 3461out_err: 3462 tgsi_parse_free(&parse); 3463 3464out_cleanup: 3465 free_nv50_pc(pc); 3466 return ret; 3467} 3468 3469static void 3470nv50_program_validate(struct 
nv50_context *nv50, struct nv50_program *p) 3471{ 3472 if (nv50_program_tx(p) == FALSE) 3473 assert(0); 3474 p->translated = TRUE; 3475} 3476 3477static void 3478nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map, 3479 unsigned start, unsigned count, unsigned cbuf) 3480{ 3481 struct nouveau_channel *chan = nv50->screen->base.channel; 3482 struct nouveau_grobj *tesla = nv50->screen->tesla; 3483 3484 while (count) { 3485 unsigned nr = count > 2047 ? 2047 : count; 3486 3487 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 3488 OUT_RING (chan, (cbuf << 0) | (start << 8)); 3489 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 3490 OUT_RINGp (chan, map, nr); 3491 3492 map += nr; 3493 start += nr; 3494 count -= nr; 3495 } 3496} 3497 3498static void 3499nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 3500{ 3501 struct pipe_screen *pscreen = nv50->pipe.screen; 3502 3503 if (!p->data[0] && p->immd_nr) { 3504 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 3505 3506 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 3507 while (heap->next && heap->size < p->immd_nr) { 3508 struct nv50_program *evict = heap->next->priv; 3509 nouveau_resource_free(&evict->data[0]); 3510 } 3511 3512 if (nouveau_resource_alloc(heap, p->immd_nr, p, 3513 &p->data[0])) 3514 assert(0); 3515 } 3516 3517 /* immediates only need to be uploaded again when freed */ 3518 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 3519 p->immd_nr, NV50_CB_PMISC); 3520 } 3521 3522 assert(p->param_nr <= 512); 3523 3524 if (p->param_nr) { 3525 unsigned cb; 3526 uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 3527 PIPE_BUFFER_USAGE_CPU_READ); 3528 3529 if (p->type == PIPE_SHADER_VERTEX) 3530 cb = NV50_CB_PVP; 3531 else 3532 cb = NV50_CB_PFP; 3533 3534 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); 3535 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 3536 } 3537} 3538 3539static void 
3540nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 3541{ 3542 struct nouveau_channel *chan = nv50->screen->base.channel; 3543 struct nv50_program_exec *e; 3544 uint32_t *up, i; 3545 boolean upload = FALSE; 3546 3547 if (!p->bo) { 3548 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 3549 p->exec_size * 4, &p->bo); 3550 upload = TRUE; 3551 } 3552 3553 if (p->data[0] && p->data[0]->start != p->data_start[0]) 3554 upload = TRUE; 3555 3556 if (!upload) 3557 return; 3558 3559 up = MALLOC(p->exec_size * 4); 3560 3561 for (i = 0, e = p->exec_head; e; e = e->next) { 3562 unsigned ei, ci, bs; 3563 3564 if (e->param.index >= 0 && e->param.mask) { 3565 bs = (e->inst[1] >> 22) & 0x07; 3566 assert(bs < 2); 3567 ei = e->param.shift >> 5; 3568 ci = e->param.index; 3569 if (bs == 0) 3570 ci += p->data[bs]->start; 3571 3572 e->inst[ei] &= ~e->param.mask; 3573 e->inst[ei] |= (ci << e->param.shift); 3574 } else 3575 if (e->param.index >= 0) { 3576 /* zero mask means param is a jump/branch offset */ 3577 assert(!(e->param.index & 1)); 3578 /* seem to be 8 byte steps */ 3579 ei = (e->param.index >> 1) + 0 /* START_ID */; 3580 3581 e->inst[0] &= 0xf0000fff; 3582 e->inst[0] |= ei << 12; 3583 } 3584 3585 up[i++] = e->inst[0]; 3586 if (is_long(e)) 3587 up[i++] = e->inst[1]; 3588 } 3589 assert(i == p->exec_size); 3590 3591 if (p->data[0]) 3592 p->data_start[0] = p->data[0]->start; 3593 3594#ifdef NV50_PROGRAM_DUMP 3595 NOUVEAU_ERR("-------\n"); 3596 for (e = p->exec_head; e; e = e->next) { 3597 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 3598 if (is_long(e)) 3599 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 3600 } 3601#endif 3602 nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM, 3603 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, 3604 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0, 3605 0, 0, p->exec_size * 4, 1, 1); 3606 3607 FREE(up); 3608} 3609 3610void 3611nv50_vertprog_validate(struct nv50_context *nv50) 3612{ 3613 struct nouveau_grobj *tesla = nv50->screen->tesla; 3614 struct 
nv50_program *p = nv50->vertprog; 3615 struct nouveau_stateobj *so; 3616 3617 if (!p->translated) { 3618 nv50_program_validate(nv50, p); 3619 if (!p->translated) 3620 assert(0); 3621 } 3622 3623 nv50_program_validate_data(nv50, p); 3624 nv50_program_validate_code(nv50, p); 3625 3626 so = so_new(5, 8, 2); 3627 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 3628 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3629 NOUVEAU_BO_HIGH, 0, 0); 3630 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3631 NOUVEAU_BO_LOW, 0, 0); 3632 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 3633 so_data (so, p->cfg.attr[0]); 3634 so_data (so, p->cfg.attr[1]); 3635 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 3636 so_data (so, p->cfg.high_result); 3637 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 3638 so_data (so, p->cfg.high_result); //8); 3639 so_data (so, p->cfg.high_temp); 3640 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 3641 so_data (so, 0); /* program start offset */ 3642 so_ref(so, &nv50->state.vertprog); 3643 so_ref(NULL, &so); 3644} 3645 3646void 3647nv50_fragprog_validate(struct nv50_context *nv50) 3648{ 3649 struct nouveau_grobj *tesla = nv50->screen->tesla; 3650 struct nv50_program *p = nv50->fragprog; 3651 struct nouveau_stateobj *so; 3652 3653 if (!p->translated) { 3654 nv50_program_validate(nv50, p); 3655 if (!p->translated) 3656 assert(0); 3657 } 3658 3659 nv50_program_validate_data(nv50, p); 3660 nv50_program_validate_code(nv50, p); 3661 3662 so = so_new(6, 7, 2); 3663 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 3664 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3665 NOUVEAU_BO_HIGH, 0, 0); 3666 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3667 NOUVEAU_BO_LOW, 0, 0); 3668 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); 3669 so_data (so, p->cfg.high_temp); 3670 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 3671 so_data (so, p->cfg.high_result); 3672 so_method(so, tesla, NV50TCL_FP_CONTROL, 1); 
3673 so_data (so, p->cfg.regs[2]); 3674 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 3675 so_data (so, p->cfg.regs[3]); 3676 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 3677 so_data (so, 0); /* program start offset */ 3678 so_ref(so, &nv50->state.fragprog); 3679 so_ref(NULL, &so); 3680} 3681 3682static void 3683nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) 3684{ 3685 struct nv50_program *fp = nv50->fragprog; 3686 struct nv50_program *vp = nv50->vertprog; 3687 unsigned i, c, m = base; 3688 3689 /* XXX: this might not work correctly in all cases yet - we'll 3690 * just assume that an FP generic input that is not written in 3691 * the VP is PointCoord. 3692 */ 3693 memset(pntc, 0, 8 * sizeof(uint32_t)); 3694 3695 for (i = 0; i < fp->cfg.io_nr; i++) { 3696 uint8_t sn, si; 3697 uint8_t j, k = fp->cfg.io[i].id; 3698 unsigned n = popcnt4(fp->cfg.io[i].mask); 3699 3700 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { 3701 m += n; 3702 continue; 3703 } 3704 3705 for (j = 0; j < vp->info.num_outputs; ++j) { 3706 sn = vp->info.output_semantic_name[j]; 3707 si = vp->info.output_semantic_index[j]; 3708 3709 if (sn == fp->info.input_semantic_name[k] && 3710 si == fp->info.input_semantic_index[k]) 3711 break; 3712 } 3713 3714 if (j < vp->info.num_outputs) { 3715 ubyte mode = 3716 nv50->rasterizer->pipe.sprite_coord_mode[si]; 3717 3718 if (mode == PIPE_SPRITE_COORD_NONE) { 3719 m += n; 3720 continue; 3721 } 3722 } 3723 3724 /* this is either PointCoord or replaced by sprite coords */ 3725 for (c = 0; c < 4; c++) { 3726 if (!(fp->cfg.io[i].mask & (1 << c))) 3727 continue; 3728 pntc[m / 8] |= (c + 1) << ((m % 8) * 4); 3729 ++m; 3730 } 3731 } 3732} 3733 3734static int 3735nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4], 3736 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) 3737{ 3738 int c; 3739 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; 3740 uint8_t *map = (uint8_t *)p_map; 3741 3742 for (c = 0; c < 4; ++c) { 
3743 if (mf & 1) { 3744 if (fpi->linear == TRUE) 3745 lin[mid / 32] |= 1 << (mid % 32); 3746 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40); 3747 } 3748 3749 oid += mv & 1; 3750 mf >>= 1; 3751 mv >>= 1; 3752 } 3753 3754 return mid; 3755} 3756 3757void 3758nv50_linkage_validate(struct nv50_context *nv50) 3759{ 3760 struct nouveau_grobj *tesla = nv50->screen->tesla; 3761 struct nv50_program *vp = nv50->vertprog; 3762 struct nv50_program *fp = nv50->fragprog; 3763 struct nouveau_stateobj *so; 3764 struct nv50_sreg4 dummy, *vpo; 3765 int i, n, c, m = 0; 3766 uint32_t map[16], lin[4], reg[5], pcrd[8]; 3767 3768 memset(map, 0, sizeof(map)); 3769 memset(lin, 0, sizeof(lin)); 3770 3771 reg[1] = 0x00000004; /* low and high clip distance map ids */ 3772 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ 3773 reg[3] = 0x00000000; /* point size map id & enable */ 3774 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ 3775 reg[4] = fp->cfg.regs[1]; /* interpolant info */ 3776 3777 dummy.linear = FALSE; 3778 dummy.mask = 0xf; /* map all components of HPOS */ 3779 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); 3780 3781 dummy.mask = 0x0; 3782 3783 if (vp->cfg.clpd < 0x40) { 3784 for (c = 0; c < vp->cfg.clpd_nr; ++c) 3785 map[m++] = vp->cfg.clpd + c; 3786 reg[1] = (m << 8); 3787 } 3788 3789 reg[0] |= m << 8; /* adjust BFC0 id */ 3790 3791 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ 3792 if (nv50->rasterizer->pipe.light_twoside) { 3793 vpo = &vp->cfg.two_side[0]; 3794 3795 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); 3796 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); 3797 } 3798 3799 reg[0] += m - 4; /* adjust FFC0 id */ 3800 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ 3801 3802 for (i = 0; i < fp->cfg.io_nr; i++) { 3803 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id]; 3804 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id]; 3805 3806 /* position 
must be mapped first */ 3807 assert(i == 0 || sn != TGSI_SEMANTIC_POSITION); 3808 3809 /* maybe even remove these from cfg.io */ 3810 if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE) 3811 continue; 3812 3813 /* VP outputs and vp->cfg.io are in the same order */ 3814 for (n = 0; n < vp->info.num_outputs; ++n) { 3815 if (vp->info.output_semantic_name[n] == sn && 3816 vp->info.output_semantic_index[n] == si) 3817 break; 3818 } 3819 vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy; 3820 3821 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); 3822 } 3823 3824 if (nv50->rasterizer->pipe.point_size_per_vertex) { 3825 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); 3826 reg[3] = (m++ << 4) | 1; 3827 } 3828 3829 /* now fill the stateobj */ 3830 so = so_new(6, 58, 0); 3831 3832 n = (m + 3) / 4; 3833 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); 3834 so_data (so, m); 3835 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); 3836 so_datap (so, map, n); 3837 3838 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 3839 so_datap (so, reg, 4); 3840 3841 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); 3842 so_data (so, reg[4]); 3843 3844 so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); 3845 so_datap (so, lin, 4); 3846 3847 if (nv50->rasterizer->pipe.point_sprite) { 3848 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff); 3849 3850 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); 3851 so_datap (so, pcrd, 8); 3852 } 3853 3854 so_ref(so, &nv50->state.programs); 3855 so_ref(NULL, &so); 3856} 3857 3858void 3859nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 3860{ 3861 while (p->exec_head) { 3862 struct nv50_program_exec *e = p->exec_head; 3863 3864 p->exec_head = e->next; 3865 FREE(e); 3866 } 3867 p->exec_tail = NULL; 3868 p->exec_size = 0; 3869 3870 nouveau_bo_ref(NULL, &p->bo); 3871 3872 nouveau_resource_free(&p->data[0]); 3873 3874 p->translated = 0; 3875} 3876