nv50_program.c revision a5f771d7583f9cd2d47bc795fe9231d647659432
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 127 35#define NV50_SU_MAX_ADDR 4 36//#define NV50_PROGRAM_DUMP 37 38/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ 39 40/* ARL - gallium craps itself on progs/vp/arl.txt 41 * 42 * MSB - Like MAD, but MUL+SUB 43 * - Fuck it off, introduce a way to negate args for ops that 44 * support it. 45 * 46 * Look into inlining IMMD for ops other than MOV (make it general?) 47 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 48 * but can emit to P_TEMP first - then MOV later. 
NVIDIA does this 49 * 50 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 51 * case, if the emit_src() causes the inst to suddenly become long. 52 * 53 * Verify half-insns work where expected - and force disable them where they 54 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 55 * 56 * FUCK! watch dst==src vectors, can overwrite components that are needed. 57 * ie. SUB R0, R0.yzxw, R0 58 * 59 * Things to check with renouveau: 60 * FP attr/result assignment - how? 61 * attrib 62 * - 0x16bc maps vp output onto fp hpos 63 * - 0x16c0 maps vp output onto fp col0 64 * result 65 * - colr always 0-3 66 * - depr always 4 67 * 0x16bc->0x16e8 --> some binding between vp/fp regs 68 * 0x16b8 --> VP output count 69 * 70 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 71 * "MOV rcol.x, fcol.y" = 0x00000004 72 * 0x19a8 --> as above but 0x00000100 and 0x00000000 73 * - 0x00100000 used when KIL used 74 * 0x196c --> as above but 0x00000011 and 0x00000000 75 * 76 * 0x1988 --> 0xXXNNNNNN 77 * - XX == FP high something 78 */ 79struct nv50_reg { 80 enum { 81 P_TEMP, 82 P_ATTR, 83 P_RESULT, 84 P_CONST, 85 P_IMMD, 86 P_ADDR 87 } type; 88 int index; 89 90 int hw; 91 int mod; 92 93 int rhw; /* result hw for FP outputs, or interpolant index */ 94 int acc; /* instruction where this reg is last read (first insn == 1) */ 95}; 96 97#define NV50_MOD_NEG 1 98#define NV50_MOD_ABS 2 99#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) 100#define NV50_MOD_SAT 4 101#define NV50_MOD_I32 8 102 103/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ 104 105/* STACK: Conditionals and loops have to use the (per warp) stack. 106 * Stack entries consist of an entry type (divergent path, join at), 107 * a mask indicating the active threads of the warp, and an address. 108 * MPs can store 12 stack entries internally, if we need more (and 109 * we probably do), we have to create a stack buffer in VRAM. 
110 */ 111/* impose low limits for now */ 112#define NV50_MAX_COND_NESTING 4 113#define NV50_MAX_LOOP_NESTING 3 114 115#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 116 117struct nv50_pc { 118 struct nv50_program *p; 119 120 /* hw resources */ 121 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 122 struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; 123 124 /* tgsi resources */ 125 struct nv50_reg *temp; 126 int temp_nr; 127 struct nv50_reg *attr; 128 int attr_nr; 129 struct nv50_reg *result; 130 int result_nr; 131 struct nv50_reg *param; 132 int param_nr; 133 struct nv50_reg *immd; 134 uint32_t *immd_buf; 135 int immd_nr; 136 struct nv50_reg **addr; 137 int addr_nr; 138 uint8_t addr_alloc; /* set bit indicates used for TGSI_FILE_ADDRESS */ 139 140 struct nv50_reg *temp_temp[16]; 141 unsigned temp_temp_nr; 142 143 /* broadcast and destination replacement regs */ 144 struct nv50_reg *r_brdc; 145 struct nv50_reg *r_dst[4]; 146 147 struct nv50_reg reg_instances[16]; 148 unsigned reg_instance_nr; 149 150 unsigned interp_mode[32]; 151 /* perspective interpolation registers */ 152 struct nv50_reg *iv_p; 153 struct nv50_reg *iv_c; 154 155 struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; 156 struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; 157 struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; 158 int if_lvl, loop_lvl; 159 unsigned loop_pos[NV50_MAX_LOOP_NESTING]; 160 161 unsigned *insn_pos; /* actual program offset of each TGSI insn */ 162 boolean in_subroutine; 163 164 /* current instruction and total number of insns */ 165 unsigned insn_cur; 166 unsigned insn_nr; 167 168 boolean allow32; 169 170 uint8_t edgeflag_out; 171}; 172 173static INLINE void 174ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) 175{ 176 reg->type = type; 177 reg->index = index; 178 reg->hw = hw; 179 reg->mod = 0; 180 reg->rhw = -1; 181 reg->acc = 0; 182} 183 184static INLINE unsigned 185popcnt4(uint32_t val) 186{ 187 static const unsigned cnt[16] 188 = { 0, 1, 1, 
		    2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
	return cnt[val & 0xf];
}

/* End of a basic block: forget which values the temporary address
 * registers hold, since they are only tracked along straight-line code.
 */
static void
terminate_mbb(struct nv50_pc *pc)
{
	int i;

	/* remove records of temporary address register values */
	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
		pc->r_addr[i].rhw = -1;
}

/* Bind reg to a hardware slot and keep cfg.high_result/high_temp (the
 * high-water marks of used result/temp registers) up to date.
 * P_TEMP regs with hw < 0 get a free $r index assigned here; a reg with
 * rhw set is first tried at that index (FP result placement).
 */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 * not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	/* first-fit scan for a free hw temp */
	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	/* out of hw temps - should not happen with NV50_SU_MAX_TEMP = 127 */
	assert(0);
}

/* Return a per-instruction copy of reg (so src modifiers can be applied
 * to the copy without disturbing the TGSI-level register), allocating
 * its hw slot first.  reg->mod is consumed (reset to 0) by the copy.
 */
static INLINE struct nv50_reg *
reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *ri;

	assert(pc->reg_instance_nr < 16);
	ri = &pc->reg_instances[pc->reg_instance_nr++];
	if (reg) {
		alloc_reg(pc, reg);
		*ri = *reg;
		reg->mod = 0;
	}
	return ri;
}

/* XXX: For shaders that aren't executed linearly (e.g. shaders that
 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
 * lest we risk temp_temps overwriting regs alloc'd "later".
 */
/* Allocate a scratch P_TEMP register.  If dst is already an unassigned
 * TGSI temp it is reused directly (result can be written in place).
 * Regs returned with index == -1 are anonymous and freed by free_temp().
 */
static struct nv50_reg *
alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
{
	struct nv50_reg *r;
	int i;

	if (dst && dst->type == P_TEMP && dst->hw == -1)
		return dst;

	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
		if (!pc->r_temp[i]) {
			r = MALLOC_STRUCT(nv50_reg);
			ctor_reg(r, P_TEMP, -1, i);
			pc->r_temp[i] = r;
			return r;
		}
	}

	assert(0);
	return NULL;
}

/* release the hardware resource held by r */
static void
release_hw(struct nv50_pc *pc, struct nv50_reg *r)
{
	assert(r->type == P_TEMP);
	if (r->hw == -1)
		return;

	assert(pc->r_temp[r->hw] == r);
	pc->r_temp[r->hw] = NULL;

	r->acc = 0;
	/* anonymous temps (index == -1) are owned by the allocator */
	if (r->index == -1)
		FREE(r);
}

/* Free an anonymous temp allocated by alloc_temp()/alloc_temp4();
 * TGSI-backed temps (index != -1) are deliberately left alone.
 */
static void
free_temp(struct nv50_pc *pc, struct nv50_reg *r)
{
	if (r->index == -1) {
		unsigned hw = r->hw;

		FREE(pc->r_temp[hw]);
		pc->r_temp[hw] = NULL;
	}
}

/* Allocate 4 consecutive, 4-aligned hw temps (needed e.g. for TEX
 * source/result quads).  Recurses to the next aligned slot on conflict.
 * Returns 0 on success, 1 if no aligned quad is available.
 */
static int
alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
{
	int i;

	if ((idx + 4) >= NV50_SU_MAX_TEMP)
		return 1;

	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
		return alloc_temp4(pc, dst, idx + 4);

	for (i = 0; i < 4; i++) {
		dst[i] = MALLOC_STRUCT(nv50_reg);
		ctor_reg(dst[i], P_TEMP, -1, idx + i);
		pc->r_temp[idx + i] = dst[i];
	}

	return 0;
}

static void
free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
{
	int i;

	for (i = 0; i < 4; i++)
		free_temp(pc, reg[i]);
}

/* Grab a short-lived scratch temp; all temp_temps are released in bulk
 * by kill_temp_temp() after the current TGSI instruction.
 * NOTE(review): with NDEBUG the assert is compiled out and the array
 * write below would overflow temp_temp[16] - confirm 16 is never hit.
 */
static struct nv50_reg *
temp_temp(struct nv50_pc *pc)
{
	if (pc->temp_temp_nr >= 16)
		assert(0);

	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
	return pc->temp_temp[pc->temp_temp_nr++];
}

/* Release all scratch temps taken via temp_temp() for this insn.
 * NOTE(review): i is int vs unsigned temp_temp_nr - harmless here since
 * temp_temp_nr <= 16, but it will trip -Wsign-compare.
 */
static void
kill_temp_temp(struct nv50_pc *pc)
{
	int i;

	for (i = 0; i < pc->temp_temp_nr; i++)
		free_temp(pc, pc->temp_temp[i]);
pc->temp_temp_nr = 0; 367} 368 369static int 370ctor_immd_4u32(struct nv50_pc *pc, 371 uint32_t x, uint32_t y, uint32_t z, uint32_t w) 372{ 373 unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); 374 375 pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); 376 377 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 378 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 379 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 380 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 381 382 return pc->immd_nr++; 383} 384 385static INLINE int 386ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) 387{ 388 return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); 389} 390 391static struct nv50_reg * 392alloc_immd(struct nv50_pc *pc, float f) 393{ 394 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 395 unsigned hw; 396 397 for (hw = 0; hw < pc->immd_nr * 4; hw++) 398 if (pc->immd_buf[hw] == fui(f)) 399 break; 400 401 if (hw == pc->immd_nr * 4) 402 hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; 403 404 ctor_reg(r, P_IMMD, -1, hw); 405 return r; 406} 407 408static struct nv50_program_exec * 409exec(struct nv50_pc *pc) 410{ 411 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 412 413 e->param.index = -1; 414 return e; 415} 416 417static void 418emit(struct nv50_pc *pc, struct nv50_program_exec *e) 419{ 420 struct nv50_program *p = pc->p; 421 422 if (p->exec_tail) 423 p->exec_tail->next = e; 424 if (!p->exec_head) 425 p->exec_head = e; 426 p->exec_tail = e; 427 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 428} 429 430static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 431 432static boolean 433is_long(struct nv50_program_exec *e) 434{ 435 if (e->inst[0] & 1) 436 return TRUE; 437 return FALSE; 438} 439 440static boolean 441is_immd(struct nv50_program_exec *e) 442{ 443 if (is_long(e) && (e->inst[1] & 3) == 3) 444 return TRUE; 445 return FALSE; 446} 447 448static boolean 449is_join(struct nv50_program_exec *e) 450{ 451 if (is_long(e) && (e->inst[1] & 3) == 2) 452 return TRUE; 453 return FALSE; 454} 455 456static INLINE void 457set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 458 struct nv50_program_exec *e) 459{ 460 assert(!is_immd(e)); 461 set_long(pc, e); 462 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 463 e->inst[1] |= (pred << 7) | (idx << 12); 464} 465 466static INLINE void 467set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 468 struct nv50_program_exec *e) 469{ 470 set_long(pc, e); 471 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 472 e->inst[1] |= (idx << 4) | (on << 6); 473} 474 475static INLINE void 476set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 477{ 478 if (is_long(e)) 479 return; 480 481 e->inst[0] |= 1; 482 set_pred(pc, 0xf, 0, e); 483 set_pred_wr(pc, 0, 0, e); 484} 485 486static INLINE void 487set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 488{ 489 if (dst->type == P_RESULT) { 490 set_long(pc, e); 491 e->inst[1] |= 0x00000008; 492 } 493 494 alloc_reg(pc, dst); 495 if (dst->hw > 63) 496 set_long(pc, e); 497 e->inst[0] |= (dst->hw << 2); 498} 499 500static INLINE void 501set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 502{ 503 set_long(pc, e); 504 /* XXX: can't be predicated - bits overlap; cases where both 505 * are required should be avoided by using pc->allow32 */ 506 set_pred(pc, 0, 0, e); 507 set_pred_wr(pc, 0, 0, e); 508 509 e->inst[1] |= 0x00000002 | 0x00000001; 510 e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; 511 e->inst[1] 
|= (pc->immd_buf[imm->hw] >> 6) << 2; 512} 513 514static INLINE void 515set_addr(struct nv50_program_exec *e, struct nv50_reg *a) 516{ 517 assert(!(e->inst[0] & 0x0c000000)); 518 assert(!(e->inst[1] & 0x00000004)); 519 520 e->inst[0] |= (a->hw & 3) << 26; 521 e->inst[1] |= (a->hw >> 2) << 2; 522} 523 524static void 525emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, 526 struct nv50_reg *src0, uint16_t src1_val) 527{ 528 struct nv50_program_exec *e = exec(pc); 529 530 e->inst[0] = 0xd0000000 | (src1_val << 9); 531 e->inst[1] = 0x20000000; 532 set_long(pc, e); 533 e->inst[0] |= dst->hw << 2; 534 if (src0) /* otherwise will add to $a0, which is always 0 */ 535 set_addr(e, src0); 536 537 emit(pc, e); 538} 539 540static struct nv50_reg * 541alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref) 542{ 543 struct nv50_reg *a_tgsi = NULL, *a = NULL; 544 int i; 545 uint8_t avail = ~pc->addr_alloc; 546 547 if (!ref) { 548 /* allocate for TGSI_FILE_ADDRESS */ 549 while (avail) { 550 i = ffs(avail) - 1; 551 552 if (pc->r_addr[i].rhw < 0 || 553 pc->r_addr[i].acc != pc->insn_cur) { 554 pc->addr_alloc |= (1 << i); 555 556 pc->r_addr[i].rhw = -1; 557 pc->r_addr[i].index = i; 558 return &pc->r_addr[i]; 559 } 560 avail &= ~(1 << i); 561 } 562 assert(0); 563 return NULL; 564 } 565 566 /* Allocate and set an address reg so we can access 'ref'. 567 * 568 * If and r_addr->index will be -1 or the hw index the value 569 * value in rhw is relative to. If rhw < 0, the reg has not 570 * been initialized or is in use for TGSI_FILE_ADDRESS. 
571 */ 572 while (avail) { /* only consider regs that are not TGSI */ 573 i = ffs(avail) - 1; 574 avail &= ~(1 << i); 575 576 if ((!a || a->rhw >= 0) && pc->r_addr[i].rhw < 0) { 577 /* prefer an usused reg with low hw index */ 578 a = &pc->r_addr[i]; 579 continue; 580 } 581 if (!a && pc->r_addr[i].acc != pc->insn_cur) 582 a = &pc->r_addr[i]; 583 584 if (ref->hw - pc->r_addr[i].rhw >= 128) 585 continue; 586 587 if ((ref->acc >= 0 && pc->r_addr[i].index < 0) || 588 (ref->acc < 0 && pc->r_addr[i].index == ref->index)) { 589 pc->r_addr[i].acc = pc->insn_cur; 590 return &pc->r_addr[i]; 591 } 592 } 593 assert(a); 594 595 if (ref->acc < 0) 596 a_tgsi = pc->addr[ref->index]; 597 598 emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4); 599 600 a->rhw = ref->hw & ~0x7f; 601 a->acc = pc->insn_cur; 602 a->index = a_tgsi ? ref->index : -1; 603 return a; 604} 605 606#define INTERP_LINEAR 0 607#define INTERP_FLAT 1 608#define INTERP_PERSPECTIVE 2 609#define INTERP_CENTROID 4 610 611/* interpolant index has been stored in dst->rhw */ 612static void 613emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 614 unsigned mode) 615{ 616 assert(dst->rhw != -1); 617 struct nv50_program_exec *e = exec(pc); 618 619 e->inst[0] |= 0x80000000; 620 set_dst(pc, dst, e); 621 e->inst[0] |= (dst->rhw << 16); 622 623 if (mode & INTERP_FLAT) { 624 e->inst[0] |= (1 << 8); 625 } else { 626 if (mode & INTERP_PERSPECTIVE) { 627 e->inst[0] |= (1 << 25); 628 alloc_reg(pc, iv); 629 e->inst[0] |= (iv->hw << 9); 630 } 631 632 if (mode & INTERP_CENTROID) 633 e->inst[0] |= (1 << 24); 634 } 635 636 emit(pc, e); 637} 638 639static void 640set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 641 struct nv50_program_exec *e) 642{ 643 set_long(pc, e); 644 645 e->param.index = src->hw & 127; 646 e->param.shift = s; 647 e->param.mask = m << (s % 32); 648 649 if (src->hw > 127) 650 set_addr(e, alloc_addr(pc, src)); 651 else 652 if (src->acc < 0) { 653 assert(src->type == 
P_CONST); 654 set_addr(e, pc->addr[src->index]); 655 } 656 657 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 658} 659 660/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ 661static void 662emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 663{ 664 struct nv50_program_exec *e = exec(pc); 665 666 e->inst[0] = 0x10000000; 667 if (!pc->allow32) 668 set_long(pc, e); 669 670 set_dst(pc, dst, e); 671 672 if (!is_long(e) && src->type == P_IMMD) { 673 set_immd(pc, src, e); 674 /*XXX: 32-bit, but steals part of "half" reg space - need to 675 * catch and handle this case if/when we do half-regs 676 */ 677 } else 678 if (src->type == P_IMMD || src->type == P_CONST) { 679 set_long(pc, e); 680 set_data(pc, src, 0x7f, 9, e); 681 e->inst[1] |= 0x20000000; /* mov from c[] */ 682 } else { 683 if (src->type == P_ATTR) { 684 set_long(pc, e); 685 e->inst[1] |= 0x00200000; 686 } 687 688 alloc_reg(pc, src); 689 if (src->hw > 63) 690 set_long(pc, e); 691 e->inst[0] |= (src->hw << 9); 692 } 693 694 if (is_long(e) && !is_immd(e)) { 695 e->inst[1] |= 0x04000000; /* 32-bit */ 696 e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ 697 if (!(e->inst[1] & 0x20000000)) 698 e->inst[1] |= 0x00030000; /* lane mask 2:3 */ 699 } else 700 e->inst[0] |= 0x00008000; 701 702 emit(pc, e); 703} 704 705static INLINE void 706emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 707{ 708 struct nv50_reg *imm = alloc_immd(pc, f); 709 emit_mov(pc, dst, imm); 710 FREE(imm); 711} 712 713/* Assign the hw of the discarded temporary register src 714 * to the tgsi register dst and free src. 
715 */ 716static void 717assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 718{ 719 assert(src->index == -1 && src->hw != -1); 720 721 if (pc->if_lvl || pc->loop_lvl || 722 (dst->type != P_TEMP) || 723 (src->hw < pc->result_nr * 4 && 724 pc->p->type == PIPE_SHADER_FRAGMENT) || 725 pc->p->info.opcode_count[TGSI_OPCODE_CAL] || 726 pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { 727 728 emit_mov(pc, dst, src); 729 free_temp(pc, src); 730 return; 731 } 732 733 if (dst->hw != -1) 734 pc->r_temp[dst->hw] = NULL; 735 pc->r_temp[src->hw] = dst; 736 dst->hw = src->hw; 737 738 FREE(src); 739} 740 741static void 742emit_nop(struct nv50_pc *pc) 743{ 744 struct nv50_program_exec *e = exec(pc); 745 746 e->inst[0] = 0xf0000000; 747 set_long(pc, e); 748 e->inst[1] = 0xe0000000; 749 emit(pc, e); 750} 751 752static boolean 753check_swap_src_0_1(struct nv50_pc *pc, 754 struct nv50_reg **s0, struct nv50_reg **s1) 755{ 756 struct nv50_reg *src0 = *s0, *src1 = *s1; 757 758 if (src0->type == P_CONST) { 759 if (src1->type != P_CONST) { 760 *s0 = src1; 761 *s1 = src0; 762 return TRUE; 763 } 764 } else 765 if (src1->type == P_ATTR) { 766 if (src0->type != P_ATTR) { 767 *s0 = src1; 768 *s1 = src0; 769 return TRUE; 770 } 771 } 772 773 return FALSE; 774} 775 776static void 777set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, 778 struct nv50_program_exec *e) 779{ 780 struct nv50_reg *temp; 781 782 if (src->type != P_TEMP) { 783 temp = temp_temp(pc); 784 emit_mov(pc, temp, src); 785 src = temp; 786 } 787 788 alloc_reg(pc, src); 789 if (src->hw > 63) 790 set_long(pc, e); 791 e->inst[0] |= (src->hw << 9); 792} 793 794static void 795set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 796{ 797 if (src->type == P_ATTR) { 798 set_long(pc, e); 799 e->inst[1] |= 0x00200000; 800 } else 801 if (src->type == P_CONST || src->type == P_IMMD) { 802 struct nv50_reg *temp = temp_temp(pc); 803 804 emit_mov(pc, temp, src); 805 src = temp; 806 } 
807 808 alloc_reg(pc, src); 809 if (src->hw > 63) 810 set_long(pc, e); 811 e->inst[0] |= (src->hw << 9); 812} 813 814static void 815set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 816{ 817 if (src->type == P_ATTR) { 818 struct nv50_reg *temp = temp_temp(pc); 819 820 emit_mov(pc, temp, src); 821 src = temp; 822 } else 823 if (src->type == P_CONST || src->type == P_IMMD) { 824 assert(!(e->inst[0] & 0x00800000)); 825 if (e->inst[0] & 0x01000000) { 826 struct nv50_reg *temp = temp_temp(pc); 827 828 emit_mov(pc, temp, src); 829 src = temp; 830 } else { 831 set_data(pc, src, 0x7f, 16, e); 832 e->inst[0] |= 0x00800000; 833 } 834 } 835 836 alloc_reg(pc, src); 837 if (src->hw > 63) 838 set_long(pc, e); 839 e->inst[0] |= ((src->hw & 127) << 16); 840} 841 842static void 843set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 844{ 845 set_long(pc, e); 846 847 if (src->type == P_ATTR) { 848 struct nv50_reg *temp = temp_temp(pc); 849 850 emit_mov(pc, temp, src); 851 src = temp; 852 } else 853 if (src->type == P_CONST || src->type == P_IMMD) { 854 assert(!(e->inst[0] & 0x01000000)); 855 if (e->inst[0] & 0x00800000) { 856 struct nv50_reg *temp = temp_temp(pc); 857 858 emit_mov(pc, temp, src); 859 src = temp; 860 } else { 861 set_data(pc, src, 0x7f, 32+14, e); 862 e->inst[0] |= 0x01000000; 863 } 864 } 865 866 alloc_reg(pc, src); 867 e->inst[1] |= ((src->hw & 127) << 14); 868} 869 870static void 871emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) 872{ 873 struct nv50_program_exec *e = exec(pc); 874 875 assert(dst->type == P_TEMP); 876 e->inst[1] = 0x20000000 | (pred << 12); 877 set_long(pc, e); 878 set_dst(pc, dst, e); 879 880 emit(pc, e); 881} 882 883static void 884emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) 885{ 886 struct nv50_program_exec *e = exec(pc); 887 888 e->inst[0] = 0x000001fc; 889 e->inst[1] = 0xa0000008; 890 set_long(pc, e); 891 set_pred_wr(pc, 1, pred, e); 892 
	set_src_0_restricted(pc, src, e);

	emit(pc, e);
}

/* Emit an FMUL dst = src0 * src1.  A differing NV50_MOD_NEG on exactly
 * one source is folded into the opcode's negate bit; other modifiers
 * must have been resolved by the caller.
 */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		/* short immediate form: negate bit lives in inst[0] */
		if (src0->mod ^ src1->mod)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}

/* Emit an FADD dst = src0 + src1.  Forced long if either source carries
 * modifiers or src1 needs a high hw index (per-source negate bits only
 * exist in the long encoding).
 */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xb0000000;

	alloc_reg(pc, src1);
	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
		set_long(pc, e);
		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
			      ((src1->mod & NV50_MOD_NEG) << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}

/* Emit ARL: load address register dst from src shifted left by s. */
static void
emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 uint8_t s)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[1] |= 0xc0000000;

	e->inst[0] |= dst->hw << 2;
	e->inst[0] |= s << 16; /* shift left */
	set_src_0_restricted(pc, src, e);

	emit(pc, e);
}

/* min/max sub-opcodes for emit_minmax(); bit 0x800 selects float mode
 * (it is copied into inst[0] there) - presumably per hw encoding,
 * see emit_minmax().
 */
#define NV50_MAX_F32 0x880
#define NV50_MAX_S32 0x08c
#define NV50_MAX_U32 0x084
#define NV50_MIN_F32 0x8a0
978#define NV50_MIN_S32 0x0ac 979#define NV50_MIN_U32 0x0a4 980 981static void 982emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 983 struct nv50_reg *src0, struct nv50_reg *src1) 984{ 985 struct nv50_program_exec *e = exec(pc); 986 987 set_long(pc, e); 988 e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20); 989 e->inst[1] |= (sub << 24); 990 991 check_swap_src_0_1(pc, &src0, &src1); 992 set_dst(pc, dst, e); 993 set_src_0(pc, src0, e); 994 set_src_1(pc, src1, e); 995 996 if (src0->mod & NV50_MOD_ABS) 997 e->inst[1] |= 0x00100000; 998 if (src1->mod & NV50_MOD_ABS) 999 e->inst[1] |= 0x00080000; 1000 1001 emit(pc, e); 1002} 1003 1004static INLINE void 1005emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1006 struct nv50_reg *src1) 1007{ 1008 src1->mod ^= NV50_MOD_NEG; 1009 emit_add(pc, dst, src0, src1); 1010 src1->mod ^= NV50_MOD_NEG; 1011} 1012 1013static void 1014emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1015 struct nv50_reg *src1, unsigned op) 1016{ 1017 struct nv50_program_exec *e = exec(pc); 1018 1019 e->inst[0] = 0xd0000000; 1020 set_long(pc, e); 1021 1022 check_swap_src_0_1(pc, &src0, &src1); 1023 set_dst(pc, dst, e); 1024 set_src_0(pc, src0, e); 1025 1026 if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && 1027 op != TGSI_OPCODE_XOR) 1028 assert(!"invalid bit op"); 1029 1030 assert(!(src0->mod | src1->mod)); 1031 1032 if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { 1033 set_immd(pc, src1, e); 1034 if (op == TGSI_OPCODE_OR) 1035 e->inst[0] |= 0x0100; 1036 else 1037 if (op == TGSI_OPCODE_XOR) 1038 e->inst[0] |= 0x8000; 1039 } else { 1040 set_src_1(pc, src1, e); 1041 e->inst[1] |= 0x04000000; /* 32 bit */ 1042 if (op == TGSI_OPCODE_OR) 1043 e->inst[1] |= 0x4000; 1044 else 1045 if (op == TGSI_OPCODE_XOR) 1046 e->inst[1] |= 0x8000; 1047 } 1048 1049 emit(pc, e); 1050} 1051 1052static void 1053emit_shift(struct nv50_pc *pc, struct nv50_reg *dst, 1054 struct nv50_reg 
*src0, struct nv50_reg *src1, unsigned dir) 1055{ 1056 struct nv50_program_exec *e = exec(pc); 1057 1058 e->inst[0] = 0x30000000; 1059 e->inst[1] = 0xc4000000; 1060 1061 set_long(pc, e); 1062 set_dst(pc, dst, e); 1063 set_src_0(pc, src0, e); 1064 1065 if (src1->type == P_IMMD) { 1066 e->inst[1] |= (1 << 20); 1067 e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; 1068 } else 1069 set_src_1(pc, src1, e); 1070 1071 if (dir != TGSI_OPCODE_SHL) 1072 e->inst[1] |= (1 << 29); 1073 1074 if (dir == TGSI_OPCODE_ISHR) 1075 e->inst[1] |= (1 << 27); 1076 1077 emit(pc, e); 1078} 1079 1080static void 1081emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1082 struct nv50_reg *src1, struct nv50_reg *src2) 1083{ 1084 struct nv50_program_exec *e = exec(pc); 1085 1086 e->inst[0] |= 0xe0000000; 1087 1088 check_swap_src_0_1(pc, &src0, &src1); 1089 set_dst(pc, dst, e); 1090 set_src_0(pc, src0, e); 1091 set_src_1(pc, src1, e); 1092 set_src_2(pc, src2, e); 1093 1094 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) 1095 e->inst[1] |= 0x04000000; 1096 if (src2->mod & NV50_MOD_NEG) 1097 e->inst[1] |= 0x08000000; 1098 1099 emit(pc, e); 1100} 1101 1102static INLINE void 1103emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 1104 struct nv50_reg *src1, struct nv50_reg *src2) 1105{ 1106 src2->mod ^= NV50_MOD_NEG; 1107 emit_mad(pc, dst, src0, src1, src2); 1108 src2->mod ^= NV50_MOD_NEG; 1109} 1110 1111#define NV50_FLOP_RCP 0 1112#define NV50_FLOP_RSQ 2 1113#define NV50_FLOP_LG2 3 1114#define NV50_FLOP_SIN 4 1115#define NV50_FLOP_COS 5 1116#define NV50_FLOP_EX2 6 1117 1118/* rcp, rsqrt, lg2 support neg and abs */ 1119static void 1120emit_flop(struct nv50_pc *pc, unsigned sub, 1121 struct nv50_reg *dst, struct nv50_reg *src) 1122{ 1123 struct nv50_program_exec *e = exec(pc); 1124 1125 e->inst[0] |= 0x90000000; 1126 if (sub || src->mod) { 1127 set_long(pc, e); 1128 e->inst[1] |= (sub << 29); 1129 } 1130 1131 set_dst(pc, dst, e); 1132 
set_src_0_restricted(pc, src, e); 1133 1134 assert(!src->mod || sub < 4); 1135 1136 if (src->mod & NV50_MOD_NEG) 1137 e->inst[1] |= 0x04000000; 1138 if (src->mod & NV50_MOD_ABS) 1139 e->inst[1] |= 0x00100000; 1140 1141 emit(pc, e); 1142} 1143 1144static void 1145emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1146{ 1147 struct nv50_program_exec *e = exec(pc); 1148 1149 e->inst[0] |= 0xb0000000; 1150 1151 set_dst(pc, dst, e); 1152 set_src_0(pc, src, e); 1153 set_long(pc, e); 1154 e->inst[1] |= (6 << 29) | 0x00004000; 1155 1156 if (src->mod & NV50_MOD_NEG) 1157 e->inst[1] |= 0x04000000; 1158 if (src->mod & NV50_MOD_ABS) 1159 e->inst[1] |= 0x00100000; 1160 1161 emit(pc, e); 1162} 1163 1164static void 1165emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1166{ 1167 struct nv50_program_exec *e = exec(pc); 1168 1169 e->inst[0] |= 0xb0000000; 1170 1171 set_dst(pc, dst, e); 1172 set_src_0(pc, src, e); 1173 set_long(pc, e); 1174 e->inst[1] |= (6 << 29); 1175 1176 if (src->mod & NV50_MOD_NEG) 1177 e->inst[1] |= 0x04000000; 1178 if (src->mod & NV50_MOD_ABS) 1179 e->inst[1] |= 0x00100000; 1180 1181 emit(pc, e); 1182} 1183 1184#define CVT_RN (0x00 << 16) 1185#define CVT_FLOOR (0x02 << 16) 1186#define CVT_CEIL (0x04 << 16) 1187#define CVT_TRUNC (0x06 << 16) 1188#define CVT_SAT (0x08 << 16) 1189#define CVT_ABS (0x10 << 16) 1190 1191#define CVT_X32_X32 0x04004000 1192#define CVT_X32_S32 0x04014000 1193#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) 1194#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) 1195#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) 1196#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) 1197#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) 1198#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) 1199#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) 1200#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) 1201 1202#define CVT_NEG 0x20000000 1203#define CVT_RI 0x08000000 1204 1205static void 1206emit_cvt(struct 
nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 1207 int wp, uint32_t cvn) 1208{ 1209 struct nv50_program_exec *e; 1210 1211 e = exec(pc); 1212 1213 if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; 1214 if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS; 1215 1216 e->inst[0] = 0xa0000000; 1217 e->inst[1] = cvn; 1218 set_long(pc, e); 1219 set_src_0(pc, src, e); 1220 1221 if (wp >= 0) 1222 set_pred_wr(pc, 1, wp, e); 1223 1224 if (dst) 1225 set_dst(pc, dst, e); 1226 else { 1227 e->inst[0] |= 0x000001fc; 1228 e->inst[1] |= 0x00000008; 1229 } 1230 1231 emit(pc, e); 1232} 1233 1234/* nv50 Condition codes: 1235 * 0x1 = LT 1236 * 0x2 = EQ 1237 * 0x3 = LE 1238 * 0x4 = GT 1239 * 0x5 = NE 1240 * 0x6 = GE 1241 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge) 1242 * 0x8 = unordered bit (allows NaN) 1243 * 1244 * mode = 0x04 (u32), 0x0c (s32), 0x80 (f32) 1245 */ 1246static void 1247emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, 1248 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode) 1249{ 1250 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 1251 1252 struct nv50_program_exec *e = exec(pc); 1253 struct nv50_reg *rdst; 1254 1255 assert(ccode < 16); 1256 if (check_swap_src_0_1(pc, &src0, &src1)) 1257 ccode = cc_swapped[ccode & 7] | (ccode & 8); 1258 1259 rdst = dst; 1260 if (dst && dst->type != P_TEMP) 1261 dst = alloc_temp(pc, NULL); 1262 1263 set_long(pc, e); 1264 e->inst[0] |= 0x30000000 | (mode << 24); 1265 e->inst[1] |= 0x60000000 | (ccode << 14); 1266 1267 if (wp >= 0) 1268 set_pred_wr(pc, 1, wp, e); 1269 if (dst) 1270 set_dst(pc, dst, e); 1271 else { 1272 e->inst[0] |= 0x000001fc; 1273 e->inst[1] |= 0x00000008; 1274 } 1275 1276 set_src_0(pc, src0, e); 1277 set_src_1(pc, src1, e); 1278 1279 emit(pc, e); 1280 1281 if (rdst && mode == 0x80) /* convert to float ? 
*/ 1282 emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); 1283 if (rdst && rdst != dst) 1284 free_temp(pc, dst); 1285} 1286 1287static INLINE void 1288map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty) 1289{ 1290 switch (op) { 1291 case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break; 1292 case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break; 1293 case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break; 1294 case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break; 1295 case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break; 1296 case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break; 1297 1298 case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break; 1299 case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break; 1300 case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break; 1301 case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break; 1302 case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break; 1303 case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break; 1304 default: 1305 assert(0); 1306 return; 1307 } 1308} 1309 1310static void 1311emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst, 1312 struct nv50_reg *src0, struct nv50_reg *rsrc1) 1313{ 1314 struct nv50_program_exec *e = exec(pc); 1315 struct nv50_reg *src1; 1316 1317 e->inst[0] = 0x20000000; 1318 1319 alloc_reg(pc, rsrc1); 1320 check_swap_src_0_1(pc, &src0, &rsrc1); 1321 1322 src1 = rsrc1; 1323 if (src0->mod & rsrc1->mod & NV50_MOD_NEG) { 1324 src1 = alloc_temp(pc, NULL); 1325 emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32); 1326 } 1327 1328 if (!pc->allow32 || src1->hw > 63 || 1329 (src1->type != P_TEMP && src1->type != P_IMMD)) 1330 set_long(pc, e); 1331 1332 set_dst(pc, dst, e); 1333 set_src_0(pc, src0, e); 1334 1335 if (is_long(e)) { 1336 e->inst[1] |= 1 << 26; 1337 set_src_2(pc, src1, e); 1338 } else { 1339 e->inst[0] |= 0x8000; 1340 if (src1->type == P_IMMD) 1341 set_immd(pc, src1, e); 1342 else 1343 set_src_1(pc, src1, e); 1344 } 1345 1346 if (src0->mod & NV50_MOD_NEG) 1347 e->inst[0] |= 1 << 28; 1348 else 1349 if (src1->mod & NV50_MOD_NEG) 
1350 e->inst[0] |= 1 << 22; 1351 1352 emit(pc, e); 1353 1354 if (src1 != rsrc1) 1355 free_temp(pc, src1); 1356} 1357 1358static void 1359emit_sad(struct nv50_pc *pc, struct nv50_reg *dst, 1360 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) 1361{ 1362 struct nv50_program_exec *e = exec(pc); 1363 1364 e->inst[0] = 0x50000000; 1365 set_dst(pc, dst, e); 1366 set_src_0(pc, src0, e); 1367 set_src_1(pc, src1, e); 1368 alloc_reg(pc, src2); 1369 if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw)) 1370 set_src_2(pc, src2, e); 1371 1372 if (is_long(e)) 1373 e->inst[1] |= 0x0c << 24; 1374 else 1375 e->inst[0] |= 0x81 << 8; 1376} 1377 1378static INLINE void 1379emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1380{ 1381 emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); 1382} 1383 1384static void 1385emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 1386 struct nv50_reg *v, struct nv50_reg *e) 1387{ 1388 struct nv50_reg *temp = alloc_temp(pc, NULL); 1389 1390 emit_flop(pc, NV50_FLOP_LG2, temp, v); 1391 emit_mul(pc, temp, temp, e); 1392 emit_preex2(pc, temp, temp); 1393 emit_flop(pc, NV50_FLOP_EX2, dst, temp); 1394 1395 free_temp(pc, temp); 1396} 1397 1398static INLINE void 1399emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1400{ 1401 emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); 1402} 1403 1404static void 1405emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1406 struct nv50_reg **src) 1407{ 1408 struct nv50_reg *one = alloc_immd(pc, 1.0); 1409 struct nv50_reg *zero = alloc_immd(pc, 0.0); 1410 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 1411 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 1412 struct nv50_reg *tmp[4]; 1413 boolean allow32 = pc->allow32; 1414 1415 pc->allow32 = FALSE; 1416 1417 if (mask & (3 << 1)) { 1418 tmp[0] = alloc_temp(pc, NULL); 1419 emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero); 1420 } 1421 1422 if (mask & (1 
<< 2)) {
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);

		/* clamp the exponent into the range hw POW can handle */
		tmp[3] = temp_temp(pc);
		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}

/* Emit a (possibly conditional) fragment discard.  If src is non-NULL,
 * discard only where src < 0 (predicate written by the CVT).
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	e = exec(pc);
	e->inst[0] = 0x00000002; /* discard */
	set_long(pc, e); /* sets cond code to ALWAYS */

	if (src) {
		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
		/* write to predicate reg */
		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
	}

	emit(pc, e);
}

/* Emit a control flow instruction (opcode op in the top nibble),
 * optionally predicated on (cc, pred).  Returns the exec so the caller
 * can patch its branch target (param.index) later.
 */
static struct nv50_program_exec *
emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = (op << 28) | 2;
	set_long(pc, e);
	if (pred >= 0)
		set_pred(pc, cc, pred, e);

	emit(pc, e);
	return e;
}

static INLINE struct nv50_program_exec *
emit_breakaddr(struct nv50_pc *pc)
{
	return emit_control_flow(pc, 0x4, -1, 0);
}

static INLINE void
emit_break(struct nv50_pc *pc, int pred, unsigned cc)
{
	emit_control_flow(pc, 0x5, pred, cc);
}

static INLINE struct nv50_program_exec *
emit_joinat(struct nv50_pc *pc)
{
	return emit_control_flow(pc, 0xa, -1, 0);
}

static INLINE struct nv50_program_exec *
emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
{
	return emit_control_flow(pc, 0x1, pred, cc);
}

static INLINE struct nv50_program_exec *
emit_call(struct nv50_pc *pc, int pred, unsigned cc)
{
	return emit_control_flow(pc, 0x2, pred, cc);
}

static INLINE void
emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
{
	emit_control_flow(pc, 0x3, pred, cc);
}

#define QOP_ADD 0
#define QOP_SUBR 1
#define QOP_SUB 2
#define QOP_MOV_SRC1 3

/* For a quad of threads / top left, top right, bottom left, bottom right
 * pixels, do a different operation, and take src0 from a specific thread.
 */
static void
emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xc0000000;
	e->inst[1] = 0x80000000;
	set_long(pc, e);
	e->inst[0] |= lane_src0 << 16;
	set_src_0(pc, src0, e);
	set_src_2(pc, src1, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no destination: discard the result */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	/* per-lane sub-operation: 2 bits in inst[0], 2 bits in inst[1] */
	e->inst[0] |= (qop & 3) << 20;
	e->inst[1] |= (qop >> 2) << 22;

	emit(pc, e);
}

/* Prepare cube map coordinates in t[]: divide each component by the
 * major axis (max of the absolute values), preserving src modifiers.
 */
static void
load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, unsigned arg, boolean proj)
{
	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };

	src[0]->mod |= NV50_MOD_ABS;
	src[1]->mod |= NV50_MOD_ABS;
	src[2]->mod |= NV50_MOD_ABS;

	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);

	src[0]->mod = mod[0];
	src[1]->mod = mod[1];
	src[2]->mod = mod[2];

	if (proj && 0 /* looks more correct without this */)
		emit_mul(pc, t[2], t[2], src[3]);
	else
	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
		emit_mov(pc, t[3], src[3]);

	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);

	emit_mul(pc, t[0], src[0], t[2]);
	emit_mul(pc, t[1], src[1], t[2]);
	emit_mul(pc, t[2], src[2], t[2]);
}

/* Load projective texture coordinates into t[]: either re-interpolate
 * the inputs with perspective division, or multiply by 1/q via RCP.
 */
static void
load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, unsigned dim, unsigned arg)
{
	unsigned c, mode;

	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
		/* sources are interpolants: redo the interpolation with
		 * perspective correction using 1/q from src[3]
		 */
		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;

		t[3]->rhw = src[3]->rhw;
		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);

		for (c = 0; c < dim; ++c) {
			t[c]->rhw = src[c]->rhw;
			emit_interp(pc, t[c], t[3], mode);
		}
		if (arg != dim) { /* depth reference value */
			t[dim]->rhw = src[2]->rhw;
			emit_interp(pc, t[dim], t[3], mode);
		}
	} else {
		/* XXX: for some reason the blob sometimes uses MAD
		 * (mad f32 $rX $rY $rZ neg $r63)
		 */
		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
		for (c = 0; c < dim; ++c)
			emit_mul(pc, t[c], src[c], t[3]);
		if (arg != dim) /* depth reference value */
			emit_mul(pc, t[dim], src[2], t[3]);
	}
}

/* For a TGSI texture target, return the coordinate dimensionality (dim)
 * and the total TEX argument count (arg; includes a shadow reference).
 */
static INLINE void
get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
{
	switch (type) {
	case TGSI_TEXTURE_1D:
		*arg = *dim = 1;
		break;
	case TGSI_TEXTURE_SHADOW1D:
		*dim = 1;
		*arg = 2;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_RECT:
		*arg = *dim = 2;
		break;
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT:
		*dim = 2;
		*arg = 3;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
		*dim = *arg = 3;
		break;
	default:
		assert(0);
		break;
	}
}

/* We shouldn't execute TEXLOD if any of the pixels in a quad have
 * different LOD values, so branch off groups of equal LOD.
 */
static void
emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
		     struct nv50_reg *src, struct nv50_program_exec *tex)
{
	struct nv50_program_exec *join_at;
	unsigned i, target = pc->p->exec_size + 9 * 2;

	if (pc->p->type != PIPE_SHADER_FRAGMENT) {
		emit(pc, tex);
		return;
	}
	pc->allow32 = FALSE;

	/* Subtract lod of each pixel from lod of top left pixel, jump
	 * texlod insn if result is 0, then repeat for 2 other pixels.
	 */
	join_at = emit_joinat(pc);
	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
	emit_branch(pc, 0, 2)->param.index = target;

	for (i = 1; i < 4; ++i) {
		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
		emit_branch(pc, 0, 2)->param.index = target;
	}

	emit_mov(pc, tlod, src); /* target */
	emit(pc, tex); /* texlod */

	join_at->param.index = target + 2 * 2;
	JOIN_ON(emit_nop(pc)); /* join _after_ tex */
}

/* Execute a TEX with per-quad bias separately for each group of lanes
 * that share the same bias value, so implicit derivatives stay correct.
 */
static void
emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
		      struct nv50_program_exec *tex)
{
	struct nv50_program_exec *e;
	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
	int r_pred = 0;
	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };

	pc->allow32 = FALSE;
	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);

	/* Subtract bias value of thread i from bias values of each thread,
	 * store result in r_pred, and set bit i in r_bits if result was 0.
	 */
	assert(arg < 4);
	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
		emit_mov(pc, r_bits, &imm_1248);
		set_pred(pc, 2, r_pred, pc->p->exec_tail);
	}
	emit_mov_to_pred(pc, r_pred, r_bits);

	/* The lanes of a quad are now grouped by the bit in r_pred they have
	 * set. Put the input values for TEX into a new register set for each
	 * group and execute TEX only for a specific group.
	 * We cannot use the same register set for each group because we need
	 * the derivatives, which are implicitly calculated, to be correct.
	 */
	for (i = 1; i < 4; ++i) {
		alloc_temp4(pc, t123[i], 0);

		for (c = 0; c <= arg; ++c)
			emit_mov(pc, t123[i][c], t[c]);

		/* clone the TEX, retarget its dst, predicate on group i */
		*(e = exec(pc)) = *(tex);
		e->inst[0] &= ~0x01fc;
		set_dst(pc, t123[i][0], e);
		set_pred(pc, cc[i], r_pred, e);
		emit(pc, e);
	}
	/* finally TEX on the original regs (where we kept the input) */
	set_pred(pc, cc[0], r_pred, tex);
	emit(pc, tex);

	/* put the 3 * n other results into regs for lane 0 */
	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
	for (i = 1; i < 4; ++i) {
		for (c = 0; c < n; ++c) {
			emit_mov(pc, t[c], t123[i][c]);
			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
		}
		free_temp4(pc, t123[i]);
	}

	emit_nop(pc);
	free_temp(pc, r_bits);
}

/* Emit a TEX/TXB/TXL/TXP: stage the coordinates in a 4-temp super-reg,
 * emit the texture fetch, and move the selected results into dst[].
 * bias_lod: 0 = plain TEX, < 0 = bias (TXB), > 0 = explicit lod (TXL).
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type,
	 boolean proj, int bias_lod)
{
	struct nv50_reg *t[4];
	struct nv50_program_exec *e;
	unsigned c, dim, arg;

	/* t[i] must be within a single 128 bit super-reg */
	alloc_temp4(pc, t, 0);

	e = exec(pc);
	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	set_dst(pc, t[0], e);

	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
	e->inst[0] |= (unit << 9) /* | (unit << 17) */;

	/* live flag (don't set if TEX results affect input to another TEX): */
	/* e->inst[0] |= 0x00000004; */

	get_tex_dim(type, &dim, &arg);

	if (type == TGSI_TEXTURE_CUBE) {
		e->inst[0] |= 0x08000000;
		load_cube_tex_coords(pc, t, src, arg, proj);
	} else
	if (proj)
		load_proj_tex_coords(pc, t, src, dim, arg);
	else {
		for (c = 0; c < dim; c++)
			emit_mov(pc, t[c], src[c]);
		if (arg != dim) /* depth reference value (always src.z here) */
			emit_mov(pc, t[dim], src[2]);
	}

	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	if (!bias_lod) {
		e->inst[0] |= (arg - 1) << 22;
		emit(pc, e);
	} else
	if (bias_lod < 0) {
		assert(pc->p->type == PIPE_SHADER_FRAGMENT);
		e->inst[0] |= arg << 22;
		e->inst[1] |= 0x20000000; /* texbias */
		emit_mov(pc, t[arg], src[3]);
		emit_texbias_sequence(pc, t, arg, e);
	} else {
		e->inst[0] |= arg << 22;
		e->inst[1] |= 0x40000000; /* texlod */
		emit_mov(pc, t[arg], src[3]);
		emit_texlod_sequence(pc, t[arg], src[3], e);
	}

#if 1
	c = 0;
	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
	if (mask & 8) emit_mov(pc, dst[3], t[c]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}

/* Emit horizontal screen-space derivative (DDX) via a quad op. */
static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = (src->mod & NV50_MOD_NEG) ?
		0xc0240000 : 0xc0140000;
	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}

/* Emit vertical screen-space derivative (DDY) via a quad op. */
static void
emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000;
	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}

/* Rewrite a short (32 bit) instruction into its long (64 bit) form,
 * relocating the opcode-specific bits (mask m cleared in inst[0],
 * bits q set in inst[1]).
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x2:
	case 0x3:
		/* ADD, SUB, SUBR b32 */
		m = ~(0x8000 | (127 << 16));
		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
		break;
	case 0x5:
		/* SAD */
		m = ~(0x81 << 8);
		q = 0x0c << 24;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++;

	e->inst[0] &= m;
	e->inst[1] |= q;
}

/* Some operations support an optional negation flag. */
static int
get_supported_mods(const struct tgsi_full_instruction *insn, int i)
{
	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_ADD:
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_DDX:
	case TGSI_OPCODE_DDY:
	case TGSI_OPCODE_DP3:
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_KIL:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_MAD:
	case TGSI_OPCODE_MUL:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
	case TGSI_OPCODE_SCS:
	case TGSI_OPCODE_SIN:
	case TGSI_OPCODE_SUB:
		return NV50_MOD_NEG;
	case TGSI_OPCODE_MAX:
	case TGSI_OPCODE_MIN:
	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
		return NV50_MOD_ABS;
	case TGSI_OPCODE_CEIL:
	case TGSI_OPCODE_FLR:
	case TGSI_OPCODE_TRUNC:
		return NV50_MOD_NEG | NV50_MOD_ABS;
	case TGSI_OPCODE_F2I:
	case TGSI_OPCODE_F2U:
	case TGSI_OPCODE_I2F:
	case TGSI_OPCODE_U2F:
		return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
	case TGSI_OPCODE_UADD:
		return NV50_MOD_NEG | NV50_MOD_I32;
	case TGSI_OPCODE_SAD:
	case TGSI_OPCODE_SHL:
	case TGSI_OPCODE_IMAX:
	case TGSI_OPCODE_IMIN:
	case TGSI_OPCODE_ISHR:
	case TGSI_OPCODE_UMAX:
	case TGSI_OPCODE_UMIN:
	case TGSI_OPCODE_USHR:
		return NV50_MOD_I32;
	default:
		return 0;
	}
}

/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->Dst[0].Register.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		return (mask & 0x8) | ((mask & 0x7) ?
			0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_EXP:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_LOG:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		return 0x1;
	case TGSI_OPCODE_IF:
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_texture *tex;

		assert(insn->Instruction.Texture);
		tex = &insn->Texture;

		mask = 0x7;
		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
			mask |= 0x8; /* bias, lod or proj */

		/* drop coordinate components the target doesn't use */
		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_SHADOW1D:
			mask &= 0x5;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}

/* Resolve a TGSI destination register (component c) to the nv50_reg it
 * maps to; address regs are allocated on first use.  Returns NULL for
 * TGSI_FILE_NULL or unhandled files.
 */
static struct nv50_reg *
tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
{
	switch (dst->Register.File) {
	case TGSI_FILE_TEMPORARY:
		return &pc->temp[dst->Register.Index * 4 + c];
	case TGSI_FILE_OUTPUT:
		return &pc->result[dst->Register.Index * 4 + c];
	case TGSI_FILE_ADDRESS:
	{
		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
		if (!r) {
			r = alloc_addr(pc, NULL);
			pc->addr[dst->Register.Index * 4 + c] = r;
		}
		assert(r);
		return r;
	}
	case TGSI_FILE_NULL:
		return NULL;
	default:
		break;
	}

	return NULL;
}

/* Resolve a TGSI source register (channel chan) to an nv50_reg,
 * applying the swizzle and absorbing sign modifiers the consuming op
 * cannot encode (mod = modifiers the op supports) via an extra CVT.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 int mod)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp = NULL;
	unsigned sgn, c, swz, cvn;

	if (src->Register.File != TGSI_FILE_CONSTANT)
		assert(!src->Register.Indirect);

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_swizzle(src, chan);
	switch (c) {
	case TGSI_SWIZZLE_X:
	case TGSI_SWIZZLE_Y:
	case TGSI_SWIZZLE_Z:
	case TGSI_SWIZZLE_W:
		switch (src->Register.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			if (!src->Register.Indirect) {
				r = &pc->param[src->Register.Index * 4 + c];
				break;
			}
			/* Indicate indirection by setting r->acc < 0 and
			 * use the index field to select the address reg.
			 */
			r = reg_instance(pc, NULL);
			swz = tgsi_util_get_src_register_swizzle(
				&src->Indirect, 0);
			ctor_reg(r, P_CONST,
				 src->Indirect.Index * 4 + swz,
				 src->Register.Index * 4 + c);
			r->acc = -1;
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			return NULL;
		case TGSI_FILE_ADDRESS:
			r = pc->addr[src->Register.Index * 4 + c];
			assert(r);
			break;
		default:
			assert(0);
			break;
		}
		break;
	default:
		assert(0);
		break;
	}

	cvn = (mod & NV50_MOD_I32) ?
		CVT_S32_S32 : CVT_F32_F32;

	switch (sgn) {
	case TGSI_UTIL_SIGN_CLEAR:
		r->mod = NV50_MOD_ABS;
		break;
	case TGSI_UTIL_SIGN_SET:
		r->mod = NV50_MOD_NEG_ABS;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		r->mod = NV50_MOD_NEG;
		break;
	default:
		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
		break;
	}

	/* op can't encode all requested modifiers: apply them via CVT */
	if ((r->mod & mod) != r->mod) {
		temp = temp_temp(pc);
		emit_cvt(pc, temp, r, -1, cvn);
		r->mod = 0;
		r = temp;
	} else
		r->mod |= mod & NV50_MOD_I32;

	assert(r);
	if (r->acc >= 0 && r != temp)
		return reg_instance(pc, r); /* will clear r->mod */
	return r;
}

/* return TRUE for ops that produce only a single result */
static boolean
is_scalar_op(unsigned op)
{
	switch (op) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_DP2:
	case TGSI_OPCODE_DP3:
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SIN:
	/*
	case TGSI_OPCODE_KIL:
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	*/
		return TRUE;
	default:
		return FALSE;
	}
}

/* Returns a bitmask indicating which dst components depend
 * on source s, component c (reverse of nv50_tgsi_src_mask).
 */
static unsigned
nv50_tgsi_dst_revdep(unsigned op, int s, int c)
{
	if (is_scalar_op(op))
		return 0x1;

	switch (op) {
	case TGSI_OPCODE_DST:
		return (1 << c) & (s ? 0xa : 0x6);
	case TGSI_OPCODE_XPD:
		switch (c) {
		case 0: return 0x6;
		case 1: return 0x5;
		case 2: return 0x3;
		case 3: return 0x0;
		default:
			assert(0);
			return 0x0;
		}
	case TGSI_OPCODE_EXP:
	case TGSI_OPCODE_LOG:
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXP:
		/* these take care of dangerous swizzles themselves */
		return 0x0;
	case TGSI_OPCODE_IF:
	case TGSI_OPCODE_KIL:
		/* don't call this function for these ops */
		assert(0);
		return 0;
	default:
		/* linear vector instruction */
		return (1 << c);
	}
}

/* TRUE if long, non-immediate exec e carries predicate condition cc. */
static INLINE boolean
has_pred(struct nv50_program_exec *e, unsigned cc)
{
	if (!is_long(e) || is_immd(e))
		return FALSE;
	return ((e->inst[1] & 0x780) == (cc << 7));
}

/* on ENDIF see if we can do "@p0.neu single_op" instead of:
 * join_at ENDIF
 * @p0.eq bra ENDIF
 * single_op
 * ENDIF: nop.join
 */
static boolean
nv50_kill_branch(struct nv50_pc *pc)
{
	int lvl = pc->if_lvl;

	/* NOTE(review): pc->if_insn[lvl] is dereferenced before the
	 * assert below checks it — confirm it can never be NULL here.
	 */
	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
		return FALSE;
	if (is_immd(pc->p->exec_tail))
		return FALSE;

	/* if ccode == 'true', the BRA is from an ELSE and the predicate
	 * reg may no longer be valid, since we currently always use $p0
	 */
	if (has_pred(pc->if_insn[lvl], 0xf))
		return FALSE;
	assert(pc->if_insn[lvl] && pc->if_join[lvl]);

	/* We'll use the exec allocated for JOIN_AT (we can't easily
	 * access nv50_program_exec's prev).
	 */
	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */

	*pc->if_join[lvl] = *pc->p->exec_tail;

	FREE(pc->if_insn[lvl]);
	FREE(pc->p->exec_tail);

	pc->p->exec_tail = pc->if_join[lvl];
	pc->p->exec_tail->next = NULL;
	set_pred(pc, 0xd, 0, pc->p->exec_tail);

	return TRUE;
}

/* Copy FP results whose allocated hw reg differs from the hw reg the
 * hardware expects the output in (rhw) into the right place.
 */
static void
nv50_fp_move_results(struct nv50_pc *pc)
{
	struct nv50_reg reg;
	unsigned i;

	ctor_reg(&reg, P_TEMP, -1, -1);

	for (i = 0; i < pc->result_nr * 4; ++i) {
		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
			continue;
		if (pc->result[i].rhw != pc->result[i].hw) {
			reg.hw = pc->result[i].rhw;
			emit_mov(pc, &reg, &pc->result[i]);
		}
	}
}

/* Translate one TGSI instruction into nv50 code. */
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
		     const struct tgsi_full_instruction *inst)
{
	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
	unsigned mask, sat, unit;
	int i, c;

	mask = inst->Dst[0].Register.WriteMask;
	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;

	memset(src, 0, sizeof(src));

	for (c = 0; c < 4; c++) {
		if ((mask & (1 << c)) && !pc->r_dst[c])
			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
		else
			dst[c] = pc->r_dst[c];
		rdst[c] = dst[c];
	}

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fs = &inst->Src[i];
		unsigned src_mask;
		int mod_supp;

		src_mask = nv50_tgsi_src_mask(inst, i);
		mod_supp = get_supported_mods(inst, i);

		/* NOTE(review): 'unit' stays uninitialized when no
		 * SAMPLER operand is present — only TEX paths read it.
		 */
		if (fs->Register.File == TGSI_FILE_SAMPLER)
			unit = fs->Register.Index;

		for (c = 0; c < 4; c++)
			if (src_mask & (1 << c))
				src[i][c] = tgsi_src(pc, c, fs, mod_supp);
	}

	/* brdc = broadcast destination for scalar ops; stage saturated
	 * results through temps
	 */
	brdc = temp = pc->r_brdc;
	if (brdc && brdc->type != P_TEMP) {
		temp = temp_temp(pc);
		if (sat)
			brdc = temp;
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 2351 continue; 2352 /* rdst[c] = dst[c]; */ /* done above */ 2353 dst[c] = temp_temp(pc); 2354 } 2355 } 2356 2357 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 2358 2359 switch (inst->Instruction.Opcode) { 2360 case TGSI_OPCODE_ABS: 2361 for (c = 0; c < 4; c++) { 2362 if (!(mask & (1 << c))) 2363 continue; 2364 emit_cvt(pc, dst[c], src[0][c], -1, 2365 CVT_ABS | CVT_F32_F32); 2366 } 2367 break; 2368 case TGSI_OPCODE_ADD: 2369 for (c = 0; c < 4; c++) { 2370 if (!(mask & (1 << c))) 2371 continue; 2372 emit_add(pc, dst[c], src[0][c], src[1][c]); 2373 } 2374 break; 2375 case TGSI_OPCODE_AND: 2376 case TGSI_OPCODE_XOR: 2377 case TGSI_OPCODE_OR: 2378 for (c = 0; c < 4; c++) { 2379 if (!(mask & (1 << c))) 2380 continue; 2381 emit_bitop2(pc, dst[c], src[0][c], src[1][c], 2382 inst->Instruction.Opcode); 2383 } 2384 break; 2385 case TGSI_OPCODE_ARL: 2386 assert(src[0][0]); 2387 temp = temp_temp(pc); 2388 emit_cvt(pc, temp, src[0][0], -1, CVT_FLOOR | CVT_S32_F32); 2389 emit_arl(pc, dst[0], temp, 4); 2390 break; 2391 case TGSI_OPCODE_BGNLOOP: 2392 pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); 2393 pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; 2394 terminate_mbb(pc); 2395 break; 2396 case TGSI_OPCODE_BGNSUB: 2397 assert(!pc->in_subroutine); 2398 pc->in_subroutine = TRUE; 2399 /* probably not necessary, but align to 8 byte boundary */ 2400 if (!is_long(pc->p->exec_tail)) 2401 convert_to_long(pc, pc->p->exec_tail); 2402 break; 2403 case TGSI_OPCODE_BRK: 2404 assert(pc->loop_lvl > 0); 2405 emit_break(pc, -1, 0); 2406 break; 2407 case TGSI_OPCODE_CAL: 2408 assert(inst->Label.Label < pc->insn_nr); 2409 emit_call(pc, -1, 0)->param.index = inst->Label.Label; 2410 /* replaced by actual offset in nv50_program_fixup_insns */ 2411 break; 2412 case TGSI_OPCODE_CEIL: 2413 for (c = 0; c < 4; c++) { 2414 if (!(mask & (1 << c))) 2415 continue; 2416 emit_cvt(pc, dst[c], src[0][c], -1, 2417 CVT_CEIL | CVT_F32_F32 | CVT_RI); 2418 
} 2419 break; 2420 case TGSI_OPCODE_CMP: 2421 pc->allow32 = FALSE; 2422 for (c = 0; c < 4; c++) { 2423 if (!(mask & (1 << c))) 2424 continue; 2425 emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); 2426 emit_mov(pc, dst[c], src[1][c]); 2427 set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ 2428 emit_mov(pc, dst[c], src[2][c]); 2429 set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ 2430 } 2431 break; 2432 case TGSI_OPCODE_CONT: 2433 assert(pc->loop_lvl > 0); 2434 emit_branch(pc, -1, 0)->param.index = 2435 pc->loop_pos[pc->loop_lvl - 1]; 2436 break; 2437 case TGSI_OPCODE_COS: 2438 if (mask & 8) { 2439 emit_precossin(pc, temp, src[0][3]); 2440 emit_flop(pc, NV50_FLOP_COS, dst[3], temp); 2441 if (!(mask &= 7)) 2442 break; 2443 if (temp == dst[3]) 2444 temp = brdc = temp_temp(pc); 2445 } 2446 emit_precossin(pc, temp, src[0][0]); 2447 emit_flop(pc, NV50_FLOP_COS, brdc, temp); 2448 break; 2449 case TGSI_OPCODE_DDX: 2450 for (c = 0; c < 4; c++) { 2451 if (!(mask & (1 << c))) 2452 continue; 2453 emit_ddx(pc, dst[c], src[0][c]); 2454 } 2455 break; 2456 case TGSI_OPCODE_DDY: 2457 for (c = 0; c < 4; c++) { 2458 if (!(mask & (1 << c))) 2459 continue; 2460 emit_ddy(pc, dst[c], src[0][c]); 2461 } 2462 break; 2463 case TGSI_OPCODE_DP3: 2464 emit_mul(pc, temp, src[0][0], src[1][0]); 2465 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2466 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 2467 break; 2468 case TGSI_OPCODE_DP4: 2469 emit_mul(pc, temp, src[0][0], src[1][0]); 2470 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2471 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2472 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 2473 break; 2474 case TGSI_OPCODE_DPH: 2475 emit_mul(pc, temp, src[0][0], src[1][0]); 2476 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2477 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2478 emit_add(pc, brdc, src[1][3], temp); 2479 break; 2480 case TGSI_OPCODE_DST: 2481 if (mask & (1 << 1)) 2482 emit_mul(pc, dst[1], src[0][1], src[1][1]); 2483 if (mask & 
(1 << 2)) 2484 emit_mov(pc, dst[2], src[0][2]); 2485 if (mask & (1 << 3)) 2486 emit_mov(pc, dst[3], src[1][3]); 2487 if (mask & (1 << 0)) 2488 emit_mov_immdval(pc, dst[0], 1.0f); 2489 break; 2490 case TGSI_OPCODE_ELSE: 2491 emit_branch(pc, -1, 0); 2492 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2493 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; 2494 terminate_mbb(pc); 2495 break; 2496 case TGSI_OPCODE_ENDIF: 2497 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2498 2499 /* try to replace branch over 1 insn with a predicated insn */ 2500 if (nv50_kill_branch(pc) == TRUE) 2501 break; 2502 2503 if (pc->if_join[pc->if_lvl]) { 2504 pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; 2505 pc->if_join[pc->if_lvl] = NULL; 2506 } 2507 terminate_mbb(pc); 2508 /* emit a NOP as join point, we could set it on the next 2509 * one, but would have to make sure it is long and !immd 2510 */ 2511 JOIN_ON(emit_nop(pc)); 2512 break; 2513 case TGSI_OPCODE_ENDLOOP: 2514 emit_branch(pc, -1, 0)->param.index = 2515 pc->loop_pos[--pc->loop_lvl]; 2516 pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; 2517 terminate_mbb(pc); 2518 break; 2519 case TGSI_OPCODE_ENDSUB: 2520 assert(pc->in_subroutine); 2521 pc->in_subroutine = FALSE; 2522 break; 2523 case TGSI_OPCODE_EX2: 2524 emit_preex2(pc, temp, src[0][0]); 2525 emit_flop(pc, NV50_FLOP_EX2, brdc, temp); 2526 break; 2527 case TGSI_OPCODE_EXP: 2528 { 2529 struct nv50_reg *t[2]; 2530 2531 assert(!temp); 2532 t[0] = temp_temp(pc); 2533 t[1] = temp_temp(pc); 2534 2535 if (mask & 0x6) 2536 emit_mov(pc, t[0], src[0][0]); 2537 if (mask & 0x3) 2538 emit_flr(pc, t[1], src[0][0]); 2539 2540 if (mask & (1 << 1)) 2541 emit_sub(pc, dst[1], t[0], t[1]); 2542 if (mask & (1 << 0)) { 2543 emit_preex2(pc, t[1], t[1]); 2544 emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); 2545 } 2546 if (mask & (1 << 2)) { 2547 emit_preex2(pc, t[0], t[0]); 2548 emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); 2549 } 2550 if (mask & (1 << 3)) 2551 
emit_mov_immdval(pc, dst[3], 1.0f); 2552 } 2553 break; 2554 case TGSI_OPCODE_F2I: 2555 for (c = 0; c < 4; c++) { 2556 if (!(mask & (1 << c))) 2557 continue; 2558 emit_cvt(pc, dst[c], src[0][c], -1, 2559 CVT_TRUNC | CVT_S32_F32); 2560 } 2561 break; 2562 case TGSI_OPCODE_F2U: 2563 for (c = 0; c < 4; c++) { 2564 if (!(mask & (1 << c))) 2565 continue; 2566 emit_cvt(pc, dst[c], src[0][c], -1, 2567 CVT_TRUNC | CVT_U32_F32); 2568 } 2569 break; 2570 case TGSI_OPCODE_FLR: 2571 for (c = 0; c < 4; c++) { 2572 if (!(mask & (1 << c))) 2573 continue; 2574 emit_flr(pc, dst[c], src[0][c]); 2575 } 2576 break; 2577 case TGSI_OPCODE_FRC: 2578 temp = temp_temp(pc); 2579 for (c = 0; c < 4; c++) { 2580 if (!(mask & (1 << c))) 2581 continue; 2582 emit_flr(pc, temp, src[0][c]); 2583 emit_sub(pc, dst[c], src[0][c], temp); 2584 } 2585 break; 2586 case TGSI_OPCODE_I2F: 2587 for (c = 0; c < 4; c++) { 2588 if (!(mask & (1 << c))) 2589 continue; 2590 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); 2591 } 2592 break; 2593 case TGSI_OPCODE_IF: 2594 assert(pc->if_lvl < NV50_MAX_COND_NESTING); 2595 emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); 2596 pc->if_join[pc->if_lvl] = emit_joinat(pc); 2597 pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; 2598 terminate_mbb(pc); 2599 break; 2600 case TGSI_OPCODE_IMAX: 2601 for (c = 0; c < 4; c++) { 2602 if (!(mask & (1 << c))) 2603 continue; 2604 emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]); 2605 } 2606 break; 2607 case TGSI_OPCODE_IMIN: 2608 for (c = 0; c < 4; c++) { 2609 if (!(mask & (1 << c))) 2610 continue; 2611 emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]); 2612 } 2613 break; 2614 case TGSI_OPCODE_INEG: 2615 for (c = 0; c < 4; c++) { 2616 if (!(mask & (1 << c))) 2617 continue; 2618 emit_cvt(pc, dst[c], src[0][c], -1, 2619 CVT_S32_S32 | CVT_NEG); 2620 } 2621 break; 2622 case TGSI_OPCODE_KIL: 2623 assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); 2624 emit_kil(pc, src[0][0]); 2625 emit_kil(pc, src[0][1]); 2626 
emit_kil(pc, src[0][2]); 2627 emit_kil(pc, src[0][3]); 2628 break; 2629 case TGSI_OPCODE_KILP: 2630 emit_kil(pc, NULL); 2631 break; 2632 case TGSI_OPCODE_LIT: 2633 emit_lit(pc, &dst[0], mask, &src[0][0]); 2634 break; 2635 case TGSI_OPCODE_LG2: 2636 emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); 2637 break; 2638 case TGSI_OPCODE_LOG: 2639 { 2640 struct nv50_reg *t[2]; 2641 2642 t[0] = temp_temp(pc); 2643 if (mask & (1 << 1)) 2644 t[1] = temp_temp(pc); 2645 else 2646 t[1] = t[0]; 2647 2648 emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); 2649 emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); 2650 if (mask & (1 << 2)) 2651 emit_mov(pc, dst[2], t[1]); 2652 emit_flr(pc, t[1], t[1]); 2653 if (mask & (1 << 0)) 2654 emit_mov(pc, dst[0], t[1]); 2655 if (mask & (1 << 1)) { 2656 t[1]->mod = NV50_MOD_NEG; 2657 emit_preex2(pc, t[1], t[1]); 2658 t[1]->mod = 0; 2659 emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); 2660 emit_mul(pc, dst[1], t[0], t[1]); 2661 } 2662 if (mask & (1 << 3)) 2663 emit_mov_immdval(pc, dst[3], 1.0f); 2664 } 2665 break; 2666 case TGSI_OPCODE_LRP: 2667 temp = temp_temp(pc); 2668 for (c = 0; c < 4; c++) { 2669 if (!(mask & (1 << c))) 2670 continue; 2671 emit_sub(pc, temp, src[1][c], src[2][c]); 2672 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 2673 } 2674 break; 2675 case TGSI_OPCODE_MAD: 2676 for (c = 0; c < 4; c++) { 2677 if (!(mask & (1 << c))) 2678 continue; 2679 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 2680 } 2681 break; 2682 case TGSI_OPCODE_MAX: 2683 for (c = 0; c < 4; c++) { 2684 if (!(mask & (1 << c))) 2685 continue; 2686 emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]); 2687 } 2688 break; 2689 case TGSI_OPCODE_MIN: 2690 for (c = 0; c < 4; c++) { 2691 if (!(mask & (1 << c))) 2692 continue; 2693 emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]); 2694 } 2695 break; 2696 case TGSI_OPCODE_MOV: 2697 for (c = 0; c < 4; c++) { 2698 if (!(mask & (1 << c))) 2699 continue; 2700 emit_mov(pc, dst[c], src[0][c]); 2701 } 2702 break; 2703 case 
TGSI_OPCODE_MUL: 2704 for (c = 0; c < 4; c++) { 2705 if (!(mask & (1 << c))) 2706 continue; 2707 emit_mul(pc, dst[c], src[0][c], src[1][c]); 2708 } 2709 break; 2710 case TGSI_OPCODE_POW: 2711 emit_pow(pc, brdc, src[0][0], src[1][0]); 2712 break; 2713 case TGSI_OPCODE_RCP: 2714 emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); 2715 break; 2716 case TGSI_OPCODE_RET: 2717 if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) 2718 nv50_fp_move_results(pc); 2719 emit_ret(pc, -1, 0); 2720 break; 2721 case TGSI_OPCODE_RSQ: 2722 src[0][0]->mod |= NV50_MOD_ABS; 2723 emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); 2724 break; 2725 case TGSI_OPCODE_SAD: 2726 for (c = 0; c < 4; c++) { 2727 if (!(mask & (1 << c))) 2728 continue; 2729 emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 2730 } 2731 break; 2732 case TGSI_OPCODE_SCS: 2733 temp = temp_temp(pc); 2734 if (mask & 3) 2735 emit_precossin(pc, temp, src[0][0]); 2736 if (mask & (1 << 0)) 2737 emit_flop(pc, NV50_FLOP_COS, dst[0], temp); 2738 if (mask & (1 << 1)) 2739 emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); 2740 if (mask & (1 << 2)) 2741 emit_mov_immdval(pc, dst[2], 0.0); 2742 if (mask & (1 << 3)) 2743 emit_mov_immdval(pc, dst[3], 1.0); 2744 break; 2745 case TGSI_OPCODE_SHL: 2746 case TGSI_OPCODE_ISHR: 2747 case TGSI_OPCODE_USHR: 2748 for (c = 0; c < 4; c++) { 2749 if (!(mask & (1 << c))) 2750 continue; 2751 emit_shift(pc, dst[c], src[0][c], src[1][c], 2752 inst->Instruction.Opcode); 2753 } 2754 break; 2755 case TGSI_OPCODE_SIN: 2756 if (mask & 8) { 2757 emit_precossin(pc, temp, src[0][3]); 2758 emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); 2759 if (!(mask &= 7)) 2760 break; 2761 if (temp == dst[3]) 2762 temp = brdc = temp_temp(pc); 2763 } 2764 emit_precossin(pc, temp, src[0][0]); 2765 emit_flop(pc, NV50_FLOP_SIN, brdc, temp); 2766 break; 2767 case TGSI_OPCODE_SLT: 2768 case TGSI_OPCODE_SGE: 2769 case TGSI_OPCODE_SEQ: 2770 case TGSI_OPCODE_SGT: 2771 case TGSI_OPCODE_SLE: 2772 case TGSI_OPCODE_SNE: 2773 case 
TGSI_OPCODE_ISLT: 2774 case TGSI_OPCODE_ISGE: 2775 case TGSI_OPCODE_USEQ: 2776 case TGSI_OPCODE_USGE: 2777 case TGSI_OPCODE_USLT: 2778 case TGSI_OPCODE_USNE: 2779 { 2780 uint8_t cc, ty; 2781 2782 map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty); 2783 2784 for (c = 0; c < 4; c++) { 2785 if (!(mask & (1 << c))) 2786 continue; 2787 emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty); 2788 } 2789 } 2790 break; 2791 case TGSI_OPCODE_SUB: 2792 for (c = 0; c < 4; c++) { 2793 if (!(mask & (1 << c))) 2794 continue; 2795 emit_sub(pc, dst[c], src[0][c], src[1][c]); 2796 } 2797 break; 2798 case TGSI_OPCODE_TEX: 2799 emit_tex(pc, dst, mask, src[0], unit, 2800 inst->Texture.Texture, FALSE, 0); 2801 break; 2802 case TGSI_OPCODE_TXB: 2803 emit_tex(pc, dst, mask, src[0], unit, 2804 inst->Texture.Texture, FALSE, -1); 2805 break; 2806 case TGSI_OPCODE_TXL: 2807 emit_tex(pc, dst, mask, src[0], unit, 2808 inst->Texture.Texture, FALSE, 1); 2809 break; 2810 case TGSI_OPCODE_TXP: 2811 emit_tex(pc, dst, mask, src[0], unit, 2812 inst->Texture.Texture, TRUE, 0); 2813 break; 2814 case TGSI_OPCODE_TRUNC: 2815 for (c = 0; c < 4; c++) { 2816 if (!(mask & (1 << c))) 2817 continue; 2818 emit_cvt(pc, dst[c], src[0][c], -1, 2819 CVT_TRUNC | CVT_F32_F32 | CVT_RI); 2820 } 2821 break; 2822 case TGSI_OPCODE_U2F: 2823 for (c = 0; c < 4; c++) { 2824 if (!(mask & (1 << c))) 2825 continue; 2826 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); 2827 } 2828 break; 2829 case TGSI_OPCODE_UADD: 2830 for (c = 0; c < 4; c++) { 2831 if (!(mask & (1 << c))) 2832 continue; 2833 emit_add_b32(pc, dst[c], src[0][c], src[1][c]); 2834 } 2835 break; 2836 case TGSI_OPCODE_UMAX: 2837 for (c = 0; c < 4; c++) { 2838 if (!(mask & (1 << c))) 2839 continue; 2840 emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]); 2841 } 2842 break; 2843 case TGSI_OPCODE_UMIN: 2844 for (c = 0; c < 4; c++) { 2845 if (!(mask & (1 << c))) 2846 continue; 2847 emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]); 2848 } 2849 break; 2850 case 
TGSI_OPCODE_XPD: 2851 temp = temp_temp(pc); 2852 if (mask & (1 << 0)) { 2853 emit_mul(pc, temp, src[0][2], src[1][1]); 2854 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 2855 } 2856 if (mask & (1 << 1)) { 2857 emit_mul(pc, temp, src[0][0], src[1][2]); 2858 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 2859 } 2860 if (mask & (1 << 2)) { 2861 emit_mul(pc, temp, src[0][1], src[1][0]); 2862 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 2863 } 2864 if (mask & (1 << 3)) 2865 emit_mov_immdval(pc, dst[3], 1.0); 2866 break; 2867 case TGSI_OPCODE_END: 2868 if (pc->p->type == PIPE_SHADER_FRAGMENT) 2869 nv50_fp_move_results(pc); 2870 2871 /* last insn must be long so it can have the exit bit set */ 2872 if (!is_long(pc->p->exec_tail)) 2873 convert_to_long(pc, pc->p->exec_tail); 2874 else 2875 if (is_immd(pc->p->exec_tail) || is_join(pc->p->exec_tail)) 2876 emit_nop(pc); 2877 2878 pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ 2879 break; 2880 default: 2881 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 2882 return FALSE; 2883 } 2884 2885 if (brdc) { 2886 if (sat) 2887 emit_sat(pc, brdc, brdc); 2888 for (c = 0; c < 4; c++) 2889 if ((mask & (1 << c)) && dst[c] != brdc) 2890 emit_mov(pc, dst[c], brdc); 2891 } else 2892 if (sat) { 2893 for (c = 0; c < 4; c++) { 2894 if (!(mask & (1 << c))) 2895 continue; 2896 /* In this case we saturate later, and dst[c] won't 2897 * be another temp_temp (and thus lost), since rdst 2898 * already is TEMP (see above). 
			 */
			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
				continue;
			emit_sat(pc, rdst[c], dst[c]);
		}
	}

	/* release per-instruction scratch regs and reg instances */
	kill_temp_temp(pc);
	pc->reg_instance_nr = 0;

	return TRUE;
}

/* Pre-pass over one TGSI instruction: record, in each referenced
 * temp/attr/result nv50_reg, the number of the last instruction that
 * accesses it (acc = pc->insn_nr, first insn == 1).  These access
 * counters drive register allocation/lifetime decisions later on.
 * Also detects the edgeflag pass-through MOV for vertex programs.
 */
static void
prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
{
	struct nv50_reg *r, *reg = NULL;
	const struct tgsi_full_src_register *src;
	const struct tgsi_dst_register *dst;
	unsigned i, c, k, mask;

	dst = &insn->Dst[0].Register;
	mask = dst->WriteMask;

	if (dst->File == TGSI_FILE_TEMPORARY)
		reg = pc->temp;
	else
	if (dst->File == TGSI_FILE_OUTPUT) {
		reg = pc->result;

		/* remember which input feeds the edgeflag output */
		if (insn->Instruction.Opcode == TGSI_OPCODE_MOV &&
		    dst->Index == pc->edgeflag_out &&
		    insn->Src[0].Register.File == TGSI_FILE_INPUT)
			pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index;
	}

	if (reg) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			reg[dst->Index * 4 + c].acc = pc->insn_nr;
		}
	}

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		src = &insn->Src[i];

		if (src->Register.File == TGSI_FILE_TEMPORARY)
			reg = pc->temp;
		else
		if (src->Register.File == TGSI_FILE_INPUT)
			reg = pc->attr;
		else
			continue;

		mask = nv50_tgsi_src_mask(insn, i);

		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			k = tgsi_util_get_full_src_register_swizzle(src, c);

			r = &reg[src->Register.Index * 4 + k];

			/* If used before written, pre-allocate the reg,
			 * lest we overwrite results from a subroutine.
			 */
			if (!r->acc && r->type == P_TEMP)
				alloc_reg(pc, r);

			r->acc = pc->insn_nr;
		}
	}
}

/* Returns a bitmask indicating which dst components need to be
 * written to temporaries first to avoid 'corrupting' sources.
2975 * 2976 * m[i] (out) indicate component to write in the i-th position 2977 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source 2978 */ 2979static unsigned 2980nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) 2981{ 2982 unsigned i, c, x, unsafe; 2983 2984 for (c = 0; c < 4; c++) 2985 m[c] = c; 2986 2987 /* Swap as long as a dst component written earlier is depended on 2988 * by one written later, but the next one isn't depended on by it. 2989 */ 2990 for (c = 0; c < 3; c++) { 2991 if (rdep[m[c + 1]] & (1 << m[c])) 2992 continue; /* if next one is depended on by us */ 2993 for (i = c + 1; i < 4; i++) 2994 /* if we are depended on by a later one */ 2995 if (rdep[m[c]] & (1 << m[i])) 2996 break; 2997 if (i == 4) 2998 continue; 2999 /* now, swap */ 3000 x = m[c]; 3001 m[c] = m[c + 1]; 3002 m[c + 1] = x; 3003 3004 /* restart */ 3005 c = 0; 3006 } 3007 3008 /* mark dependencies that could not be resolved by reordering */ 3009 for (i = 0; i < 3; ++i) 3010 for (c = i + 1; c < 4; ++c) 3011 if (rdep[m[i]] & (1 << m[c])) 3012 unsafe |= (1 << i); 3013 3014 /* NOTE: $unsafe is with respect to order, not component */ 3015 return unsafe; 3016} 3017 3018/* Select a suitable dst register for broadcasting scalar results, 3019 * or return NULL if we have to allocate an extra TEMP. 3020 * 3021 * If e.g. only 1 component is written, we may also emit the final 3022 * result to a write-only register. 
 */
static struct nv50_reg *
tgsi_broadcast_dst(struct nv50_pc *pc,
		   const struct tgsi_full_dst_register *fd, unsigned mask)
{
	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
		/* pick a written component that is NOT in the given mask */
		int c = ffs(~mask & fd->Register.WriteMask);
		if (c)
			return tgsi_dst(pc, c - 1, fd);
	} else {
		/* non-TEMP dst: only usable if exactly one component is
		 * written (write-only result is fine as broadcast target)
		 */
		int c = ffs(fd->Register.WriteMask) - 1;
		if ((1 << c) == fd->Register.WriteMask)
			return tgsi_dst(pc, c, fd);
	}

	return NULL;
}

/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->Dst[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		int ms = get_supported_mods(insn, i);

		fs = &insn->Src[i];
		/* only a src aliasing the dst register can conflict */
		if (fs->Register.File != fd->Register.File ||
		    fs->Register.Index != fd->Register.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			if (!(fd->Register.WriteMask & (1 << c)))
				continue;

			/* sign modes the op can apply inline don't force a
			 * temporary copy of the source component
			 */
			if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG))
				continue;
			if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS))
				continue;
			if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}

/* Translate one TGSI instruction, emitting per-component writes in an
 * order (or via temporaries) that avoids clobbering aliased sources.
 */
static boolean
nv50_tgsi_insn(struct nv50_pc
	       *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.Dst[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		/* scalar result is broadcast to all written components */
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no dst/src aliasing -> emit all components in one go */
	if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3]))
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit one component at a time, in the reordered sequence;
	 * components still unsafe after reordering go through a temp
	 */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.Dst[0].Register.WriteMask =
			fd->Register.WriteMask & (1 << m[i]);

		if (!insn.Dst[0].Register.WriteMask)
			continue;

		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* copy the redirected components back to their real dst */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}

/* Emit the interpolation instruction for one FP input component,
 * loading the 1/w interpolant first if perspective correction needs it.
 */
static void
load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *iv, **ppiv;
	unsigned mode = pc->interp_mode[reg->index];

	/* centroid and non-centroid interpolation use separate 1/w regs */
	ppiv = (mode & INTERP_CENTROID) ?
		&pc->iv_c : &pc->iv_p;
	iv = *ppiv;

	if ((mode & INTERP_PERSPECTIVE) && !iv) {
		/* first perspective input: set up the reciprocal of w */
		iv = *ppiv = alloc_temp(pc, NULL);
		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;

		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
		emit_flop(pc, NV50_FLOP_RCP, iv, iv);

		/* XXX: when loading interpolants dynamically, move these
		 * to the program head, or make sure it can't be skipped.
		 */
	}

	emit_interp(pc, reg, iv, mode);
}

/* The face input is always at v[255] (varying space), with a
 * value of 0 for back-facing, and 0xffffffff for front-facing.
 */
static void
load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0f);

	assert(a->rhw == -1);
	alloc_reg(pc, a); /* do this before rhw is set */
	a->rhw = 255;
	load_interpolant(pc, a);
	/* mask to 1.0/0.0: 0xffffffff AND 1.0f == 1.0f */
	emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);

	FREE(one);
}

/* First translation pass: walk the token stream to collect immediates,
 * declarations and register usage, then assign hardware input/output
 * slots (and emit FP interpolation code) before code generation proper.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd_4f32(pc, imm->u[0].Float,
				       imm->u[1].Float,
				       imm->u[2].Float,
				       imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->Range.First;
			last = d->Range.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				/* only VP outputs with semantics matter */
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.Index;
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_BCOLOR:
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					pc->edgeflag_out = first;
					break;
				/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
				*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				/* interpolation modes only apply to FP */
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_ADDRESS:
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* pack used attributes into consecutive hw slots */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		/* assign hw slots to used result components */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve two-side color / psiz indices (0x40 == unused) */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		pc->allow32 = TRUE;

		/* base == 0 when input 0 is the fragment position */
		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT)
				p->cfg.io[m++].id = i;
			else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n++].id = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ?
			(p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		/* assign interpolant indices and emit the loads */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id;

			if (p->info.input_semantic_name[n] ==
			    TGSI_SEMANTIC_FACE) {
				load_frontfacing(pc, &pc->attr[i * 4]);
				continue;
			}

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id];
			si = p->info.input_semantic_index[p->cfg.io[i].id];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;

		p->cfg.high_result = rid;

		/* separate/different colour results for MRTs ? */
		if (pc->result_nr - (p->info.writes_z ?
1 : 0) > 1) 3429 p->cfg.regs[2] |= 1; 3430 } 3431 3432 if (pc->immd_nr) { 3433 int rid = 0; 3434 3435 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); 3436 if (!pc->immd) 3437 goto out_err; 3438 3439 for (i = 0; i < pc->immd_nr; i++) { 3440 for (c = 0; c < 4; c++, rid++) 3441 ctor_reg(&pc->immd[rid], P_IMMD, i, rid); 3442 } 3443 } 3444 3445 ret = TRUE; 3446out_err: 3447 if (pc->iv_p) 3448 free_temp(pc, pc->iv_p); 3449 if (pc->iv_c) 3450 free_temp(pc, pc->iv_c); 3451 3452 tgsi_parse_free(&tp); 3453 return ret; 3454} 3455 3456static void 3457free_nv50_pc(struct nv50_pc *pc) 3458{ 3459 if (pc->immd) 3460 FREE(pc->immd); 3461 if (pc->param) 3462 FREE(pc->param); 3463 if (pc->result) 3464 FREE(pc->result); 3465 if (pc->attr) 3466 FREE(pc->attr); 3467 if (pc->temp) 3468 FREE(pc->temp); 3469 3470 FREE(pc); 3471} 3472 3473static boolean 3474ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) 3475{ 3476 int i, c; 3477 unsigned rtype[2] = { P_ATTR, P_RESULT }; 3478 3479 pc->p = p; 3480 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; 3481 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; 3482 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; 3483 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; 3484 pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; 3485 assert(pc->addr_nr <= 2); 3486 3487 p->cfg.high_temp = 4; 3488 3489 p->cfg.two_side[0].hw = 0x40; 3490 p->cfg.two_side[1].hw = 0x40; 3491 3492 p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; 3493 3494 switch (p->type) { 3495 case PIPE_SHADER_VERTEX: 3496 p->cfg.psiz = 0x40; 3497 p->cfg.clpd = 0x40; 3498 p->cfg.io_nr = pc->result_nr; 3499 break; 3500 case PIPE_SHADER_FRAGMENT: 3501 rtype[0] = rtype[1] = P_TEMP; 3502 3503 p->cfg.regs[0] = 0x01000004; 3504 p->cfg.io_nr = pc->attr_nr; 3505 3506 if (p->info.writes_z) { 3507 p->cfg.regs[2] |= 0x00000100; 3508 p->cfg.regs[3] |= 0x00000011; 3509 } 3510 if (p->info.uses_kill) 3511 p->cfg.regs[2] |= 0x00100000; 3512 break; 3513 } 3514 3515 
	/* allocate and construct the per-file register arrays */
	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	if (pc->addr_nr) {
		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
		if (!pc->addr)
			return FALSE;
	}
	/* hw address regs start at $a1; $a0 is reserved */
	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);

	return TRUE;
}

/* Post-pass over the emitted code: pad with long instructions so no
 * 32-bit instruction ends up alone in a 64-bit slot, adjusting branch
 * targets and instruction positions accordingly, then patch CALL
 * offsets to final positions.
 */
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
	struct nv50_program_exec *e, **bra_list;
	unsigned i, n, pos;

	/* NOTE(review): CALLOC result is not checked; an allocation
	 * failure would crash in the loop below.
	 */
	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));

	/* Collect branch instructions, we need to adjust their offsets
	 * when converting 32 bit instructions to 64 bit ones
	 */
	for (n = 0, e = pc->p->exec_head; e; e = e->next)
		if (e->param.index >= 0 && !e->param.mask)
			bra_list[n++] = e;

	/* Make sure we don't have any single 32 bit instructions. */
	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
		pos += is_long(e) ?
2 : 1; 3585 3586 if ((pos & 1) && (!e->next || is_long(e->next))) { 3587 for (i = 0; i < n; ++i) 3588 if (bra_list[i]->param.index >= pos) 3589 bra_list[i]->param.index += 1; 3590 for (i = 0; i < pc->insn_nr; ++i) 3591 if (pc->insn_pos[i] >= pos) 3592 pc->insn_pos[i] += 1; 3593 convert_to_long(pc, e); 3594 ++pos; 3595 } 3596 } 3597 3598 FREE(bra_list); 3599 3600 if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) 3601 return; 3602 3603 /* fill in CALL offsets */ 3604 for (e = pc->p->exec_head; e; e = e->next) { 3605 if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) 3606 e->param.index = pc->insn_pos[e->param.index]; 3607 } 3608} 3609 3610static boolean 3611nv50_program_tx(struct nv50_program *p) 3612{ 3613 struct tgsi_parse_context parse; 3614 struct nv50_pc *pc; 3615 boolean ret; 3616 3617 pc = CALLOC_STRUCT(nv50_pc); 3618 if (!pc) 3619 return FALSE; 3620 3621 ret = ctor_nv50_pc(pc, p); 3622 if (ret == FALSE) 3623 goto out_cleanup; 3624 3625 ret = nv50_program_tx_prep(pc); 3626 if (ret == FALSE) 3627 goto out_cleanup; 3628 3629 pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); 3630 3631 tgsi_parse_init(&parse, pc->p->pipe.tokens); 3632 while (!tgsi_parse_end_of_tokens(&parse)) { 3633 const union tgsi_full_token *tok = &parse.FullToken; 3634 3635 /* previously allow32 was FALSE for first & last instruction */ 3636 pc->allow32 = TRUE; 3637 3638 tgsi_parse_token(&parse); 3639 3640 switch (tok->Token.Type) { 3641 case TGSI_TOKEN_TYPE_INSTRUCTION: 3642 pc->insn_pos[pc->insn_cur] = pc->p->exec_size; 3643 ++pc->insn_cur; 3644 ret = nv50_tgsi_insn(pc, tok); 3645 if (ret == FALSE) 3646 goto out_err; 3647 break; 3648 default: 3649 break; 3650 } 3651 } 3652 3653 nv50_program_fixup_insns(pc); 3654 3655 p->param_nr = pc->param_nr * 4; 3656 p->immd_nr = pc->immd_nr * 4; 3657 p->immd = pc->immd_buf; 3658 3659out_err: 3660 tgsi_parse_free(&parse); 3661 3662out_cleanup: 3663 free_nv50_pc(pc); 3664 return ret; 3665} 3666 3667static void 3668nv50_program_validate(struct 
		      nv50_context *nv50, struct nv50_program *p)
{
	/* translation failure is fatal here */
	if (nv50_program_tx(p) == FALSE)
		assert(0);
	p->translated = TRUE;
}

/* Upload 'count' dwords from 'map' into constant buffer 'cbuf' at
 * 'start', in chunks bounded by the method's 2047-dword limit.
 */
static void
nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
			unsigned start, unsigned count, unsigned cbuf)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nouveau_grobj *tesla = nv50->screen->tesla;

	while (count) {
		unsigned nr = count > 2047 ? 2047 : count;

		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
		OUT_RING  (chan, (cbuf << 0) | (start << 8));
		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
		OUT_RINGp (chan, map, nr);

		map += nr;
		start += nr;
		count -= nr;
	}
}

/* Ensure the program's immediates and constants are resident in the
 * appropriate constant buffers, allocating (and if necessary evicting
 * other programs from) the immediate heap.
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_screen *pscreen = nv50->pipe.screen;

	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* evict programs until enough space is free */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	assert(p->param_nr <= 512);

	if (p->param_nr) {
		unsigned cb;
		uint32_t *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
						PIPE_BUFFER_USAGE_CPU_READ);

		if (p->type == PIPE_SHADER_VERTEX)
			cb = NV50_CB_PVP;
		else
			cb = NV50_CB_PFP;

		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
	}
}

static void
3738nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 3739{ 3740 struct nouveau_channel *chan = nv50->screen->base.channel; 3741 struct nv50_program_exec *e; 3742 uint32_t *up, i; 3743 boolean upload = FALSE; 3744 3745 if (!p->bo) { 3746 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 3747 p->exec_size * 4, &p->bo); 3748 upload = TRUE; 3749 } 3750 3751 if (p->data[0] && p->data[0]->start != p->data_start[0]) 3752 upload = TRUE; 3753 3754 if (!upload) 3755 return; 3756 3757 up = MALLOC(p->exec_size * 4); 3758 3759 for (i = 0, e = p->exec_head; e; e = e->next) { 3760 unsigned ei, ci, bs; 3761 3762 if (e->param.index >= 0 && e->param.mask) { 3763 bs = (e->inst[1] >> 22) & 0x07; 3764 assert(bs < 2); 3765 ei = e->param.shift >> 5; 3766 ci = e->param.index; 3767 if (bs == 0) 3768 ci += p->data[bs]->start; 3769 3770 e->inst[ei] &= ~e->param.mask; 3771 e->inst[ei] |= (ci << e->param.shift); 3772 } else 3773 if (e->param.index >= 0) { 3774 /* zero mask means param is a jump/branch offset */ 3775 assert(!(e->param.index & 1)); 3776 /* seem to be 8 byte steps */ 3777 ei = (e->param.index >> 1) + 0 /* START_ID */; 3778 3779 e->inst[0] &= 0xf0000fff; 3780 e->inst[0] |= ei << 12; 3781 } 3782 3783 up[i++] = e->inst[0]; 3784 if (is_long(e)) 3785 up[i++] = e->inst[1]; 3786 } 3787 assert(i == p->exec_size); 3788 3789 if (p->data[0]) 3790 p->data_start[0] = p->data[0]->start; 3791 3792#ifdef NV50_PROGRAM_DUMP 3793 NOUVEAU_ERR("-------\n"); 3794 for (e = p->exec_head; e; e = e->next) { 3795 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 3796 if (is_long(e)) 3797 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 3798 } 3799#endif 3800 nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM, 3801 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, 3802 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0, 3803 0, 0, p->exec_size * 4, 1, 1); 3804 3805 FREE(up); 3806} 3807 3808void 3809nv50_vertprog_validate(struct nv50_context *nv50) 3810{ 3811 struct nouveau_grobj *tesla = nv50->screen->tesla; 3812 struct 
nv50_program *p = nv50->vertprog; 3813 struct nouveau_stateobj *so; 3814 3815 if (!p->translated) { 3816 nv50_program_validate(nv50, p); 3817 if (!p->translated) 3818 assert(0); 3819 } 3820 3821 nv50_program_validate_data(nv50, p); 3822 nv50_program_validate_code(nv50, p); 3823 3824 so = so_new(5, 8, 2); 3825 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 3826 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3827 NOUVEAU_BO_HIGH, 0, 0); 3828 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3829 NOUVEAU_BO_LOW, 0, 0); 3830 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 3831 so_data (so, p->cfg.attr[0]); 3832 so_data (so, p->cfg.attr[1]); 3833 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 3834 so_data (so, p->cfg.high_result); 3835 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 3836 so_data (so, p->cfg.high_result); //8); 3837 so_data (so, p->cfg.high_temp); 3838 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 3839 so_data (so, 0); /* program start offset */ 3840 so_ref(so, &nv50->state.vertprog); 3841 so_ref(NULL, &so); 3842} 3843 3844void 3845nv50_fragprog_validate(struct nv50_context *nv50) 3846{ 3847 struct nouveau_grobj *tesla = nv50->screen->tesla; 3848 struct nv50_program *p = nv50->fragprog; 3849 struct nouveau_stateobj *so; 3850 3851 if (!p->translated) { 3852 nv50_program_validate(nv50, p); 3853 if (!p->translated) 3854 assert(0); 3855 } 3856 3857 nv50_program_validate_data(nv50, p); 3858 nv50_program_validate_code(nv50, p); 3859 3860 so = so_new(6, 7, 2); 3861 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 3862 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3863 NOUVEAU_BO_HIGH, 0, 0); 3864 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 3865 NOUVEAU_BO_LOW, 0, 0); 3866 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); 3867 so_data (so, p->cfg.high_temp); 3868 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 3869 so_data (so, p->cfg.high_result); 3870 so_method(so, tesla, NV50TCL_FP_CONTROL, 1); 
3871 so_data (so, p->cfg.regs[2]); 3872 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 3873 so_data (so, p->cfg.regs[3]); 3874 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 3875 so_data (so, 0); /* program start offset */ 3876 so_ref(so, &nv50->state.fragprog); 3877 so_ref(NULL, &so); 3878} 3879 3880static uint32_t 3881nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) 3882{ 3883 struct nv50_program *fp = nv50->fragprog; 3884 struct nv50_program *vp = nv50->vertprog; 3885 unsigned i, c, m = base; 3886 uint32_t origin = 0x00000010; 3887 3888 /* XXX: this might not work correctly in all cases yet - we'll 3889 * just assume that an FP generic input that is not written in 3890 * the VP is PointCoord. 3891 */ 3892 memset(pntc, 0, 8 * sizeof(uint32_t)); 3893 3894 for (i = 0; i < fp->cfg.io_nr; i++) { 3895 uint8_t sn, si; 3896 uint8_t j, k = fp->cfg.io[i].id; 3897 unsigned n = popcnt4(fp->cfg.io[i].mask); 3898 3899 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { 3900 m += n; 3901 continue; 3902 } 3903 3904 for (j = 0; j < vp->info.num_outputs; ++j) { 3905 sn = vp->info.output_semantic_name[j]; 3906 si = vp->info.output_semantic_index[j]; 3907 3908 if (sn == fp->info.input_semantic_name[k] && 3909 si == fp->info.input_semantic_index[k]) 3910 break; 3911 } 3912 3913 if (j < vp->info.num_outputs) { 3914 ubyte mode = 3915 nv50->rasterizer->pipe.sprite_coord_mode[si]; 3916 3917 if (mode == PIPE_SPRITE_COORD_NONE) { 3918 m += n; 3919 continue; 3920 } else 3921 if (mode == PIPE_SPRITE_COORD_LOWER_LEFT) 3922 origin = 0; 3923 } 3924 3925 /* this is either PointCoord or replaced by sprite coords */ 3926 for (c = 0; c < 4; c++) { 3927 if (!(fp->cfg.io[i].mask & (1 << c))) 3928 continue; 3929 pntc[m / 8] |= (c + 1) << ((m % 8) * 4); 3930 ++m; 3931 } 3932 } 3933 return origin; 3934} 3935 3936static int 3937nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4], 3938 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) 3939{ 3940 int c; 3941 
uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; 3942 uint8_t *map = (uint8_t *)p_map; 3943 3944 for (c = 0; c < 4; ++c) { 3945 if (mf & 1) { 3946 if (fpi->linear == TRUE) 3947 lin[mid / 32] |= 1 << (mid % 32); 3948 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40); 3949 } 3950 3951 oid += mv & 1; 3952 mf >>= 1; 3953 mv >>= 1; 3954 } 3955 3956 return mid; 3957} 3958 3959void 3960nv50_linkage_validate(struct nv50_context *nv50) 3961{ 3962 struct nouveau_grobj *tesla = nv50->screen->tesla; 3963 struct nv50_program *vp = nv50->vertprog; 3964 struct nv50_program *fp = nv50->fragprog; 3965 struct nouveau_stateobj *so; 3966 struct nv50_sreg4 dummy, *vpo; 3967 int i, n, c, m = 0; 3968 uint32_t map[16], lin[4], reg[5], pcrd[8]; 3969 3970 memset(map, 0, sizeof(map)); 3971 memset(lin, 0, sizeof(lin)); 3972 3973 reg[1] = 0x00000004; /* low and high clip distance map ids */ 3974 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ 3975 reg[3] = 0x00000000; /* point size map id & enable */ 3976 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ 3977 reg[4] = fp->cfg.regs[1]; /* interpolant info */ 3978 3979 dummy.linear = FALSE; 3980 dummy.mask = 0xf; /* map all components of HPOS */ 3981 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); 3982 3983 dummy.mask = 0x0; 3984 3985 if (vp->cfg.clpd < 0x40) { 3986 for (c = 0; c < vp->cfg.clpd_nr; ++c) 3987 map[m++] = vp->cfg.clpd + c; 3988 reg[1] = (m << 8); 3989 } 3990 3991 reg[0] |= m << 8; /* adjust BFC0 id */ 3992 3993 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ 3994 if (nv50->rasterizer->pipe.light_twoside) { 3995 vpo = &vp->cfg.two_side[0]; 3996 3997 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); 3998 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); 3999 } 4000 4001 reg[0] += m - 4; /* adjust FFC0 id */ 4002 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ 4003 4004 for (i = 0; i < fp->cfg.io_nr; i++) { 4005 ubyte sn = 
fp->info.input_semantic_name[fp->cfg.io[i].id]; 4006 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id]; 4007 4008 /* position must be mapped first */ 4009 assert(i == 0 || sn != TGSI_SEMANTIC_POSITION); 4010 4011 /* maybe even remove these from cfg.io */ 4012 if (sn == TGSI_SEMANTIC_POSITION || sn == TGSI_SEMANTIC_FACE) 4013 continue; 4014 4015 /* VP outputs and vp->cfg.io are in the same order */ 4016 for (n = 0; n < vp->info.num_outputs; ++n) { 4017 if (vp->info.output_semantic_name[n] == sn && 4018 vp->info.output_semantic_index[n] == si) 4019 break; 4020 } 4021 vpo = (n < vp->info.num_outputs) ? &vp->cfg.io[n] : &dummy; 4022 4023 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); 4024 } 4025 4026 if (nv50->rasterizer->pipe.point_size_per_vertex) { 4027 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); 4028 reg[3] = (m++ << 4) | 1; 4029 } 4030 4031 /* now fill the stateobj */ 4032 so = so_new(7, 57, 0); 4033 4034 n = (m + 3) / 4; 4035 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); 4036 so_data (so, m); 4037 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); 4038 so_datap (so, map, n); 4039 4040 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 4041 so_datap (so, reg, 4); 4042 4043 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); 4044 so_data (so, reg[4]); 4045 4046 so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); 4047 so_datap (so, lin, 4); 4048 4049 if (nv50->rasterizer->pipe.point_sprite) { 4050 so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); 4051 so_data (so, 4052 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff)); 4053 4054 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); 4055 so_datap (so, pcrd, 8); 4056 } 4057 4058 so_ref(so, &nv50->state.programs); 4059 so_ref(NULL, &so); 4060} 4061 4062void 4063nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 4064{ 4065 while (p->exec_head) { 4066 struct nv50_program_exec *e = p->exec_head; 4067 4068 p->exec_head = e->next; 4069 FREE(e); 4070 } 4071 
p->exec_tail = NULL; 4072 p->exec_size = 0; 4073 4074 nouveau_bo_ref(NULL, &p->bo); 4075 4076 nouveau_resource_free(&p->data[0]); 4077 4078 p->translated = 0; 4079} 4080