/* nv50_program.c revision 03e97e7f8d87f500c008cadd4982537adcfa4969 */
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "util/u_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33#include "nv50_transfer.h" 34 35#define NV50_SU_MAX_TEMP 127 36#define NV50_SU_MAX_ADDR 4 37//#define NV50_PROGRAM_DUMP 38 39/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ 40 41/* ARL - gallium craps itself on progs/vp/arl.txt 42 * 43 * MSB - Like MAD, but MUL+SUB 44 * - Fuck it off, introduce a way to negate args for ops that 45 * support it. 46 * 47 * Look into inlining IMMD for ops other than MOV (make it general?) 48 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 49 * but can emit to P_TEMP first - then MOV later. 
NVIDIA does this 50 * 51 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 52 * case, if the emit_src() causes the inst to suddenly become long. 53 * 54 * Verify half-insns work where expected - and force disable them where they 55 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 56 * 57 * FUCK! watch dst==src vectors, can overwrite components that are needed. 58 * ie. SUB R0, R0.yzxw, R0 59 * 60 * Things to check with renouveau: 61 * FP attr/result assignment - how? 62 * attrib 63 * - 0x16bc maps vp output onto fp hpos 64 * - 0x16c0 maps vp output onto fp col0 65 * result 66 * - colr always 0-3 67 * - depr always 4 68 * 0x16bc->0x16e8 --> some binding between vp/fp regs 69 * 0x16b8 --> VP output count 70 * 71 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 72 * "MOV rcol.x, fcol.y" = 0x00000004 73 * 0x19a8 --> as above but 0x00000100 and 0x00000000 74 * - 0x00100000 used when KIL used 75 * 0x196c --> as above but 0x00000011 and 0x00000000 76 * 77 * 0x1988 --> 0xXXNNNNNN 78 * - XX == FP high something 79 */ 80struct nv50_reg { 81 enum { 82 P_TEMP, 83 P_ATTR, 84 P_RESULT, 85 P_CONST, 86 P_IMMD, 87 P_ADDR 88 } type; 89 int index; 90 91 int hw; 92 int mod; 93 94 int rhw; /* result hw for FP outputs, or interpolant index */ 95 int acc; /* instruction where this reg is last read (first insn == 1) */ 96 97 int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */ 98 int indirect[2]; /* index into pc->addr, or -1 */ 99 100 ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */ 101}; 102 103#define NV50_MOD_NEG 1 104#define NV50_MOD_ABS 2 105#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) 106#define NV50_MOD_SAT 4 107#define NV50_MOD_I32 8 108 109/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ 110 111/* STACK: Conditionals and loops have to use the (per warp) stack. 
112 * Stack entries consist of an entry type (divergent path, join at), 113 * a mask indicating the active threads of the warp, and an address. 114 * MPs can store 12 stack entries internally, if we need more (and 115 * we probably do), we have to create a stack buffer in VRAM. 116 */ 117/* impose low limits for now */ 118#define NV50_MAX_COND_NESTING 4 119#define NV50_MAX_LOOP_NESTING 3 120 121#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 122 123struct nv50_pc { 124 struct nv50_program *p; 125 126 /* hw resources */ 127 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 128 struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; 129 130 /* tgsi resources */ 131 struct nv50_reg *temp; 132 int temp_nr; 133 struct nv50_reg *attr; 134 int attr_nr; 135 struct nv50_reg *result; 136 int result_nr; 137 struct nv50_reg *param; 138 int param_nr; 139 struct nv50_reg *immd; 140 uint32_t *immd_buf; 141 int immd_nr; 142 struct nv50_reg **addr; 143 int addr_nr; 144 struct nv50_reg *sysval; 145 int sysval_nr; 146 147 struct nv50_reg *temp_temp[16]; 148 struct nv50_program_exec *temp_temp_exec[16]; 149 unsigned temp_temp_nr; 150 151 /* broadcast and destination replacement regs */ 152 struct nv50_reg *r_brdc; 153 struct nv50_reg *r_dst[4]; 154 155 struct nv50_reg reg_instances[16]; 156 unsigned reg_instance_nr; 157 158 unsigned interp_mode[32]; 159 /* perspective interpolation registers */ 160 struct nv50_reg *iv_p; 161 struct nv50_reg *iv_c; 162 163 struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; 164 struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; 165 struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; 166 int if_lvl, loop_lvl; 167 unsigned loop_pos[NV50_MAX_LOOP_NESTING]; 168 169 unsigned *insn_pos; /* actual program offset of each TGSI insn */ 170 boolean in_subroutine; 171 172 /* current instruction and total number of insns */ 173 unsigned insn_cur; 174 unsigned insn_nr; 175 176 boolean allow32; 177 178 uint8_t edgeflag_out; 179}; 180 181static struct nv50_reg 
*get_address_reg(struct nv50_pc *, struct nv50_reg *); 182 183static INLINE void 184ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) 185{ 186 reg->type = type; 187 reg->index = index; 188 reg->hw = hw; 189 reg->mod = 0; 190 reg->rhw = -1; 191 reg->vtx = -1; 192 reg->acc = 0; 193 reg->indirect[0] = reg->indirect[1] = -1; 194 reg->buf_index = (type == P_CONST) ? 1 : 0; 195} 196 197static INLINE unsigned 198popcnt4(uint32_t val) 199{ 200 static const unsigned cnt[16] 201 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 202 return cnt[val & 0xf]; 203} 204 205static void 206terminate_mbb(struct nv50_pc *pc) 207{ 208 int i; 209 210 /* remove records of temporary address register values */ 211 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) 212 if (pc->r_addr[i].index < 0) 213 pc->r_addr[i].acc = 0; 214} 215 216static void 217alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 218{ 219 int i = 0; 220 221 if (reg->type == P_RESULT) { 222 if (pc->p->cfg.high_result < (reg->hw + 1)) 223 pc->p->cfg.high_result = reg->hw + 1; 224 } 225 226 if (reg->type != P_TEMP) 227 return; 228 229 if (reg->hw >= 0) { 230 /*XXX: do this here too to catch FP temp-as-attr usage.. 
231 * not clean, but works */ 232 if (pc->p->cfg.high_temp < (reg->hw + 1)) 233 pc->p->cfg.high_temp = reg->hw + 1; 234 return; 235 } 236 237 if (reg->rhw != -1) { 238 /* try to allocate temporary with index rhw first */ 239 if (!(pc->r_temp[reg->rhw])) { 240 pc->r_temp[reg->rhw] = reg; 241 reg->hw = reg->rhw; 242 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 243 pc->p->cfg.high_temp = reg->rhw + 1; 244 return; 245 } 246 /* make sure we don't get things like $r0 needs to go 247 * in $r1 and $r1 in $r0 248 */ 249 i = pc->result_nr * 4; 250 } 251 252 for (; i < NV50_SU_MAX_TEMP; i++) { 253 if (!(pc->r_temp[i])) { 254 pc->r_temp[i] = reg; 255 reg->hw = i; 256 if (pc->p->cfg.high_temp < (i + 1)) 257 pc->p->cfg.high_temp = i + 1; 258 return; 259 } 260 } 261 262 NOUVEAU_ERR("out of registers\n"); 263 abort(); 264} 265 266static INLINE struct nv50_reg * 267reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) 268{ 269 struct nv50_reg *ri; 270 271 assert(pc->reg_instance_nr < 16); 272 ri = &pc->reg_instances[pc->reg_instance_nr++]; 273 if (reg) { 274 alloc_reg(pc, reg); 275 *ri = *reg; 276 reg->indirect[0] = reg->indirect[1] = -1; 277 reg->mod = 0; 278 } 279 return ri; 280} 281 282/* XXX: For shaders that aren't executed linearly (e.g. shaders that 283 * contain loops), we need to assign all hw regs to TGSI TEMPs early, 284 * lest we risk temp_temps overwriting regs alloc'd "later". 
285 */ 286static struct nv50_reg * 287alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 288{ 289 struct nv50_reg *r; 290 int i; 291 292 if (dst && dst->type == P_TEMP && dst->hw == -1) 293 return dst; 294 295 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 296 if (!pc->r_temp[i]) { 297 r = MALLOC_STRUCT(nv50_reg); 298 ctor_reg(r, P_TEMP, -1, i); 299 pc->r_temp[i] = r; 300 return r; 301 } 302 } 303 304 NOUVEAU_ERR("out of registers\n"); 305 abort(); 306 return NULL; 307} 308 309/* release the hardware resource held by r */ 310static void 311release_hw(struct nv50_pc *pc, struct nv50_reg *r) 312{ 313 assert(r->type == P_TEMP); 314 if (r->hw == -1) 315 return; 316 317 assert(pc->r_temp[r->hw] == r); 318 pc->r_temp[r->hw] = NULL; 319 320 r->acc = 0; 321 if (r->index == -1) 322 FREE(r); 323} 324 325static void 326free_temp(struct nv50_pc *pc, struct nv50_reg *r) 327{ 328 if (r->index == -1) { 329 unsigned hw = r->hw; 330 331 FREE(pc->r_temp[hw]); 332 pc->r_temp[hw] = NULL; 333 } 334} 335 336static int 337alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 338{ 339 int i; 340 341 if ((idx + 4) >= NV50_SU_MAX_TEMP) 342 return 1; 343 344 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 345 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 346 return alloc_temp4(pc, dst, idx + 4); 347 348 for (i = 0; i < 4; i++) { 349 dst[i] = MALLOC_STRUCT(nv50_reg); 350 ctor_reg(dst[i], P_TEMP, -1, idx + i); 351 pc->r_temp[idx + i] = dst[i]; 352 } 353 354 return 0; 355} 356 357static void 358free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 359{ 360 int i; 361 362 for (i = 0; i < 4; i++) 363 free_temp(pc, reg[i]); 364} 365 366static struct nv50_reg * 367temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) 368{ 369 if (pc->temp_temp_nr >= 16) 370 assert(0); 371 372 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 373 pc->temp_temp_exec[pc->temp_temp_nr] = e; 374 return pc->temp_temp[pc->temp_temp_nr++]; 375} 376 377/* This *must* be called for all nv50_program_exec that 
have been 378 * given as argument to temp_temp, or the temps will be leaked ! 379 */ 380static void 381kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) 382{ 383 int i; 384 385 for (i = 0; i < pc->temp_temp_nr; i++) 386 if (pc->temp_temp_exec[i] == e) 387 free_temp(pc, pc->temp_temp[i]); 388 if (!e) 389 pc->temp_temp_nr = 0; 390} 391 392static int 393ctor_immd_4u32(struct nv50_pc *pc, 394 uint32_t x, uint32_t y, uint32_t z, uint32_t w) 395{ 396 unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); 397 398 pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); 399 400 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 401 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 402 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 403 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 404 405 return pc->immd_nr++; 406} 407 408static INLINE int 409ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) 410{ 411 return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); 412} 413 414static struct nv50_reg * 415alloc_immd(struct nv50_pc *pc, float f) 416{ 417 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 418 unsigned hw; 419 420 for (hw = 0; hw < pc->immd_nr * 4; hw++) 421 if (pc->immd_buf[hw] == fui(f)) 422 break; 423 424 if (hw == pc->immd_nr * 4) 425 hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; 426 427 ctor_reg(r, P_IMMD, -1, hw); 428 return r; 429} 430 431static struct nv50_program_exec * 432exec(struct nv50_pc *pc) 433{ 434 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 435 436 e->param.index = -1; 437 return e; 438} 439 440static void 441emit(struct nv50_pc *pc, struct nv50_program_exec *e) 442{ 443 struct nv50_program *p = pc->p; 444 445 if (p->exec_tail) 446 p->exec_tail->next = e; 447 if (!p->exec_head) 448 p->exec_head = e; 449 p->exec_tail = e; 450 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 451 452 kill_temp_temp(pc, e); 453} 454 455static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 456 457static boolean 458is_long(struct nv50_program_exec *e) 459{ 460 if (e->inst[0] & 1) 461 return TRUE; 462 return FALSE; 463} 464 465static boolean 466is_immd(struct nv50_program_exec *e) 467{ 468 if (is_long(e) && (e->inst[1] & 3) == 3) 469 return TRUE; 470 return FALSE; 471} 472 473static boolean 474is_join(struct nv50_program_exec *e) 475{ 476 if (is_long(e) && (e->inst[1] & 3) == 2) 477 return TRUE; 478 return FALSE; 479} 480 481static INLINE boolean 482is_control_flow(struct nv50_program_exec *e) 483{ 484 return (e->inst[0] & 2); 485} 486 487static INLINE void 488set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 489 struct nv50_program_exec *e) 490{ 491 assert(!is_immd(e)); 492 set_long(pc, e); 493 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 494 e->inst[1] |= (pred << 7) | (idx << 12); 495} 496 497static INLINE void 498set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 499 struct nv50_program_exec *e) 500{ 501 set_long(pc, e); 502 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 503 e->inst[1] |= (idx << 4) | (on << 6); 504} 505 506static INLINE void 507set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 508{ 509 if (is_long(e)) 510 return; 511 512 e->inst[0] |= 1; 513 set_pred(pc, 0xf, 0, e); 514 set_pred_wr(pc, 0, 0, e); 515} 516 517static INLINE void 518set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 519{ 520 if (dst->type == P_RESULT) { 521 set_long(pc, e); 522 e->inst[1] |= 0x00000008; 523 } 524 525 alloc_reg(pc, dst); 526 if (dst->hw > 63) 527 set_long(pc, e); 528 e->inst[0] |= (dst->hw << 2); 529} 530 531static INLINE void 532set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 533{ 534 set_long(pc, e); 535 /* XXX: can't be predicated - bits overlap; cases where both 536 * are required should be avoided by using pc->allow32 */ 537 set_pred(pc, 0, 0, e); 
538 set_pred_wr(pc, 0, 0, e); 539 540 e->inst[1] |= 0x00000002 | 0x00000001; 541 e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; 542 e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2; 543} 544 545static INLINE void 546set_addr(struct nv50_program_exec *e, struct nv50_reg *a) 547{ 548 assert(a->type == P_ADDR); 549 550 assert(!(e->inst[0] & 0x0c000000)); 551 assert(!(e->inst[1] & 0x00000004)); 552 553 e->inst[0] |= (a->hw & 3) << 26; 554 e->inst[1] |= a->hw & 4; 555} 556 557static void 558emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t); 559 560static void 561emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int); 562 563static void 564emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst, 565 struct nv50_reg *src) 566{ 567 struct nv50_program_exec *e = exec(pc); 568 569 e->inst[1] = 0x40000000; 570 set_long(pc, e); 571 set_dst(pc, dst, e); 572 set_addr(e, src); 573 574 emit(pc, e); 575} 576 577static void 578emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, 579 struct nv50_reg *src0, uint16_t src1_val) 580{ 581 struct nv50_program_exec *e = exec(pc); 582 583 e->inst[0] = 0xd0000000 | (src1_val << 9); 584 e->inst[1] = 0x20000000; 585 set_long(pc, e); 586 e->inst[0] |= dst->hw << 2; 587 if (src0) /* otherwise will add to $a0, which is always 0 */ 588 set_addr(e, src0); 589 590 emit(pc, e); 591} 592 593#define INTERP_LINEAR 0 594#define INTERP_FLAT 1 595#define INTERP_PERSPECTIVE 2 596#define INTERP_CENTROID 4 597 598/* interpolant index has been stored in dst->rhw */ 599static void 600emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 601 unsigned mode) 602{ 603 struct nv50_program_exec *e = exec(pc); 604 assert(dst->rhw != -1); 605 606 e->inst[0] |= 0x80000000; 607 set_dst(pc, dst, e); 608 e->inst[0] |= (dst->rhw << 16); 609 610 if (mode & INTERP_FLAT) { 611 e->inst[0] |= (1 << 8); 612 } else { 613 if (mode & INTERP_PERSPECTIVE) { 614 e->inst[0] |= (1 << 25); 615 alloc_reg(pc, iv); 
616 e->inst[0] |= (iv->hw << 9); 617 } 618 619 if (mode & INTERP_CENTROID) 620 e->inst[0] |= (1 << 24); 621 } 622 623 emit(pc, e); 624} 625 626static void 627set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 628 struct nv50_program_exec *e) 629{ 630 set_long(pc, e); 631 632 e->param.index = src->hw & 127; 633 e->param.shift = s; 634 e->param.mask = m << (s % 32); 635 636 if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */ 637 set_addr(e, get_address_reg(pc, src)); 638 else 639 if (src->acc < 0) { 640 assert(src->type == P_CONST); 641 set_addr(e, pc->addr[src->indirect[0]]); 642 } 643 644 e->inst[1] |= (src->buf_index << 22); 645} 646 647/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ 648static void 649emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 650{ 651 struct nv50_program_exec *e = exec(pc); 652 653 e->inst[0] = 0x10000000; 654 if (!pc->allow32) 655 set_long(pc, e); 656 657 set_dst(pc, dst, e); 658 659 if (!is_long(e) && src->type == P_IMMD) { 660 set_immd(pc, src, e); 661 /*XXX: 32-bit, but steals part of "half" reg space - need to 662 * catch and handle this case if/when we do half-regs 663 */ 664 } else 665 if (src->type == P_IMMD || src->type == P_CONST) { 666 set_long(pc, e); 667 set_data(pc, src, 0x7f, 9, e); 668 e->inst[1] |= 0x20000000; /* mov from c[] */ 669 } else { 670 if (src->type == P_ATTR) { 671 set_long(pc, e); 672 e->inst[1] |= 0x00200000; 673 674 if (src->vtx >= 0) { 675 /* indirect (vertex base + c) load from p[] */ 676 e->inst[0] |= 0x01800000; 677 set_addr(e, get_address_reg(pc, src)); 678 } 679 } 680 681 alloc_reg(pc, src); 682 if (src->hw > 63) 683 set_long(pc, e); 684 e->inst[0] |= (src->hw << 9); 685 } 686 687 if (is_long(e) && !is_immd(e)) { 688 e->inst[1] |= 0x04000000; /* 32-bit */ 689 e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ 690 if (!(e->inst[1] & 0x20000000)) 691 e->inst[1] |= 0x00030000; /* lane mask 2:3 */ 692 
} else 693 e->inst[0] |= 0x00008000; 694 695 emit(pc, e); 696} 697 698static INLINE void 699emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 700{ 701 struct nv50_reg *imm = alloc_immd(pc, f); 702 emit_mov(pc, dst, imm); 703 FREE(imm); 704} 705 706/* Assign the hw of the discarded temporary register src 707 * to the tgsi register dst and free src. 708 */ 709static void 710assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 711{ 712 assert(src->index == -1 && src->hw != -1); 713 714 if (pc->if_lvl || pc->loop_lvl || 715 (dst->type != P_TEMP) || 716 (src->hw < pc->result_nr * 4 && 717 pc->p->type == PIPE_SHADER_FRAGMENT) || 718 pc->p->info.opcode_count[TGSI_OPCODE_CAL] || 719 pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { 720 721 emit_mov(pc, dst, src); 722 free_temp(pc, src); 723 return; 724 } 725 726 if (dst->hw != -1) 727 pc->r_temp[dst->hw] = NULL; 728 pc->r_temp[src->hw] = dst; 729 dst->hw = src->hw; 730 731 FREE(src); 732} 733 734static void 735emit_nop(struct nv50_pc *pc) 736{ 737 struct nv50_program_exec *e = exec(pc); 738 739 e->inst[0] = 0xf0000000; 740 set_long(pc, e); 741 e->inst[1] = 0xe0000000; 742 emit(pc, e); 743} 744 745static boolean 746check_swap_src_0_1(struct nv50_pc *pc, 747 struct nv50_reg **s0, struct nv50_reg **s1) 748{ 749 struct nv50_reg *src0 = *s0, *src1 = *s1; 750 751 if (src0->type == P_CONST) { 752 if (src1->type != P_CONST) { 753 *s0 = src1; 754 *s1 = src0; 755 return TRUE; 756 } 757 } else 758 if (src1->type == P_ATTR) { 759 if (src0->type != P_ATTR) { 760 *s0 = src1; 761 *s1 = src0; 762 return TRUE; 763 } 764 } 765 766 return FALSE; 767} 768 769static void 770set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, 771 struct nv50_program_exec *e) 772{ 773 struct nv50_reg *temp; 774 775 if (src->type != P_TEMP) { 776 temp = temp_temp(pc, e); 777 emit_mov(pc, temp, src); 778 src = temp; 779 } 780 781 alloc_reg(pc, src); 782 if (src->hw > 63) 783 set_long(pc, e); 784 e->inst[0] |= 
(src->hw << 9); 785} 786 787static void 788set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 789{ 790 if (src->type == P_ATTR) { 791 set_long(pc, e); 792 e->inst[1] |= 0x00200000; 793 794 if (src->vtx >= 0) { 795 e->inst[0] |= 0x01800000; /* src from p[] */ 796 set_addr(e, get_address_reg(pc, src)); 797 } 798 } else 799 if (src->type == P_CONST || src->type == P_IMMD) { 800 struct nv50_reg *temp = temp_temp(pc, e); 801 802 emit_mov(pc, temp, src); 803 src = temp; 804 } 805 806 alloc_reg(pc, src); 807 if (src->hw > 63) 808 set_long(pc, e); 809 e->inst[0] |= (src->hw << 9); 810} 811 812static void 813set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 814{ 815 if (src->type == P_ATTR) { 816 struct nv50_reg *temp = temp_temp(pc, e); 817 818 emit_mov(pc, temp, src); 819 src = temp; 820 } else 821 if (src->type == P_CONST || src->type == P_IMMD) { 822 if (e->inst[0] & 0x01800000) { 823 struct nv50_reg *temp = temp_temp(pc, e); 824 825 emit_mov(pc, temp, src); 826 src = temp; 827 } else { 828 assert(!(e->inst[0] & 0x00800000)); 829 set_data(pc, src, 0x7f, 16, e); 830 e->inst[0] |= 0x00800000; 831 } 832 } 833 834 alloc_reg(pc, src); 835 if (src->hw > 63) 836 set_long(pc, e); 837 e->inst[0] |= ((src->hw & 127) << 16); 838} 839 840static void 841set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 842{ 843 set_long(pc, e); 844 845 if (src->type == P_ATTR) { 846 struct nv50_reg *temp = temp_temp(pc, e); 847 848 emit_mov(pc, temp, src); 849 src = temp; 850 } else 851 if (src->type == P_CONST || src->type == P_IMMD) { 852 if (e->inst[0] & 0x01800000) { 853 struct nv50_reg *temp = temp_temp(pc, e); 854 855 emit_mov(pc, temp, src); 856 src = temp; 857 } else { 858 assert(!(e->inst[0] & 0x01000000)); 859 set_data(pc, src, 0x7f, 32+14, e); 860 e->inst[0] |= 0x01000000; 861 } 862 } 863 864 alloc_reg(pc, src); 865 e->inst[1] |= ((src->hw & 127) << 14); 866} 867 868static void 
869set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh, 870 struct nv50_program_exec *e, int pos) 871{ 872 struct nv50_reg *r = src; 873 874 alloc_reg(pc, r); 875 if (r->type != P_TEMP) { 876 r = temp_temp(pc, e); 877 emit_mov(pc, r, src); 878 } 879 880 if (r->hw > (NV50_SU_MAX_TEMP / 2)) { 881 NOUVEAU_ERR("out of low GPRs\n"); 882 abort(); 883 } 884 885 e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32); 886} 887 888static void 889emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) 890{ 891 struct nv50_program_exec *e = exec(pc); 892 893 assert(dst->type == P_TEMP); 894 e->inst[1] = 0x20000000 | (pred << 12); 895 set_long(pc, e); 896 set_dst(pc, dst, e); 897 898 emit(pc, e); 899} 900 901static void 902emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) 903{ 904 struct nv50_program_exec *e = exec(pc); 905 906 e->inst[0] = 0x000001fc; 907 e->inst[1] = 0xa0000008; 908 set_long(pc, e); 909 set_pred_wr(pc, 1, pred, e); 910 set_src_0_restricted(pc, src, e); 911 912 emit(pc, e); 913} 914 915static void 916emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 917 struct nv50_reg *src1) 918{ 919 struct nv50_program_exec *e = exec(pc); 920 921 e->inst[0] |= 0xc0000000; 922 923 if (!pc->allow32) 924 set_long(pc, e); 925 926 check_swap_src_0_1(pc, &src0, &src1); 927 set_dst(pc, dst, e); 928 set_src_0(pc, src0, e); 929 if (src1->type == P_IMMD && !is_long(e)) { 930 if (src0->mod ^ src1->mod) 931 e->inst[0] |= 0x00008000; 932 set_immd(pc, src1, e); 933 } else { 934 set_src_1(pc, src1, e); 935 if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { 936 if (is_long(e)) 937 e->inst[1] |= 0x08000000; 938 else 939 e->inst[0] |= 0x00008000; 940 } 941 } 942 943 emit(pc, e); 944} 945 946static void 947emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 948 struct nv50_reg *src0, struct nv50_reg *src1) 949{ 950 struct nv50_program_exec *e = exec(pc); 951 952 e->inst[0] = 0xb0000000; 953 954 alloc_reg(pc, src1); 955 
check_swap_src_0_1(pc, &src0, &src1); 956 957 if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { 958 set_long(pc, e); 959 e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | 960 ((src1->mod & NV50_MOD_NEG) << 27); 961 } 962 963 set_dst(pc, dst, e); 964 set_src_0(pc, src0, e); 965 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 966 set_src_2(pc, src1, e); 967 else 968 if (src1->type == P_IMMD) 969 set_immd(pc, src1, e); 970 else 971 set_src_1(pc, src1, e); 972 973 emit(pc, e); 974} 975 976static void 977emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 978 uint8_t s) 979{ 980 struct nv50_program_exec *e = exec(pc); 981 982 set_long(pc, e); 983 e->inst[1] |= 0xc0000000; 984 985 e->inst[0] |= dst->hw << 2; 986 e->inst[0] |= s << 16; /* shift left */ 987 set_src_0(pc, src, e); 988 989 emit(pc, e); 990} 991 992static boolean 993address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r) 994{ 995 if (!r) 996 return FALSE; 997 998 if (r->vtx != a->vtx) 999 return FALSE; 1000 if (r->vtx >= 0) 1001 return (r->indirect[1] == a->indirect[1]); 1002 1003 if (r->hw < a->rhw || (r->hw - a->rhw) >= 128) 1004 return FALSE; 1005 1006 if (a->index >= 0) 1007 return (a->index == r->indirect[0]); 1008 return (a->indirect[0] == r->indirect[0]); 1009} 1010 1011static void 1012load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst, 1013 struct nv50_reg *a, int shift) 1014{ 1015 struct nv50_reg mem, *temp; 1016 1017 ctor_reg(&mem, P_ATTR, -1, dst->vtx); 1018 1019 assert(dst->type == P_ADDR); 1020 if (!a) { 1021 emit_arl(pc, dst, &mem, 0); 1022 return; 1023 } 1024 temp = alloc_temp(pc, NULL); 1025 1026 if (shift) { 1027 emit_mov_from_addr(pc, temp, a); 1028 if (shift < 0) 1029 emit_shl_imm(pc, temp, temp, shift); 1030 emit_arl(pc, dst, temp, MAX2(shift, 0)); 1031 } 1032 emit_mov(pc, temp, &mem); 1033 set_addr(pc->p->exec_tail, dst); 1034 1035 emit_arl(pc, dst, temp, 0); 1036 free_temp(pc, temp); 1037} 1038 1039/* case (ref == NULL): 
allocate address register for TGSI_FILE_ADDRESS 1040 * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX 1041 * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX 1042 * case (vtx < 0, acc >= 0): memory address too high to encode 1043 * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS 1044 */ 1045static struct nv50_reg * 1046get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref) 1047{ 1048 int i; 1049 struct nv50_reg *a_ref, *a = NULL; 1050 1051 for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { 1052 if (pc->r_addr[i].acc == 0) 1053 a = &pc->r_addr[i]; /* an unused address reg */ 1054 else 1055 if (address_reg_suitable(&pc->r_addr[i], ref)) { 1056 pc->r_addr[i].acc = pc->insn_cur; 1057 return &pc->r_addr[i]; 1058 } else 1059 if (!a && pc->r_addr[i].index < 0 && 1060 pc->r_addr[i].acc < pc->insn_cur) 1061 a = &pc->r_addr[i]; 1062 } 1063 if (!a) { 1064 /* We'll be able to spill address regs when this 1065 * mess is replaced with a proper compiler ... 1066 */ 1067 NOUVEAU_ERR("out of address regs\n"); 1068 abort(); 1069 return NULL; 1070 } 1071 1072 /* initialize and reserve for this TGSI instruction */ 1073 a->rhw = 0; 1074 a->index = a->indirect[0] = a->indirect[1] = -1; 1075 a->acc = pc->insn_cur; 1076 1077 if (!ref) { 1078 a->vtx = -1; 1079 return a; 1080 } 1081 a->vtx = ref->vtx; 1082 1083 /* now put in the correct value ... */ 1084 1085 if (ref->vtx >= 0) { 1086 a->indirect[1] = ref->indirect[1]; 1087 1088 /* For an indirect vertex index, we need to shift address right 1089 * by 2, the address register will contain vtx * 16, we need to 1090 * load from a[vtx * 4]. 1091 */ 1092 load_vertex_base(pc, a, (ref->acc < 0) ? 1093 pc->addr[ref->indirect[1]] : NULL, -2); 1094 } else { 1095 assert(ref->acc < 0 || ref->indirect[0] < 0); 1096 1097 a->rhw = ref->hw & ~0x7f; 1098 a->indirect[0] = ref->indirect[0]; 1099 a_ref = (ref->acc < 0) ? 
		pc->addr[ref->indirect[0]] : NULL;

		emit_add_addr_imm(pc, a, a_ref, a->rhw * 4);
	}
	return a;
}

/* MIN/MAX sub-opcodes; bit 11 distinguishes MAX from MIN and the low
 * bits select the operand type (f32/s32/u32) — NOTE(review): field
 * split inferred from emit_minmax() below, confirm against hw docs.
 */
#define NV50_MAX_F32 0x880
#define NV50_MAX_S32 0x08c
#define NV50_MAX_U32 0x084
#define NV50_MIN_F32 0x8a0
#define NV50_MIN_S32 0x0ac
#define NV50_MIN_U32 0x0a4

/* Emit a MIN or MAX; sub is one of the NV50_{MIN,MAX}_* values above.
 * Honours the ABS modifier on both sources.
 */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20);
	e->inst[1] |= (sub << 24);

	/* sources may be swapped to satisfy operand restrictions */
	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	if (src0->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;
	if (src1->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00080000;

	emit(pc, e);
}

/* SUB = ADD with src1 negated; the NEG modifier is toggled back
 * afterwards so the caller's register state is left untouched.
 */
static INLINE void
emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	src1->mod ^= NV50_MOD_NEG;
	emit_add(pc, dst, src0, src1);
	src1->mod ^= NV50_MOD_NEG;
}

/* Two-operand logic op (AND/OR/XOR).  Source modifiers are not
 * supported.  With an immediate src1 and a temp src0 the short
 * immediate form is used, otherwise the long 32 bit form.
 */
static void
emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	    struct nv50_reg *src1, unsigned op)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000;
	set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);

	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
	    op != TGSI_OPCODE_XOR)
		assert(!"invalid bit op");

	assert(!(src0->mod | src1->mod));

	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
		set_immd(pc, src1, e);
		if (op == TGSI_OPCODE_OR)
			e->inst[0] |= 0x0100;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[0] |= 0x8000;
	} else {
		set_src_1(pc, src1, e);
		e->inst[1] |= 0x04000000; /* 32 bit */
		if (op == TGSI_OPCODE_OR)
			e->inst[1] |= 0x4000;
		else
		if (op == TGSI_OPCODE_XOR)
			e->inst[1] |= 0x8000;
	}

	emit(pc, e);
}

/* Bitwise NOT of src into dst. */
static void
emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xd0000000;
	e->inst[1] = 0x0402c000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_1(pc, src, e);

	emit(pc, e);
}

/* Shift src0 by src1; dir selects SHL vs USHR/ISHR.  An immediate
 * shift count is inlined into the instruction word.
 */
static void
emit_shift(struct nv50_pc *pc, struct nv50_reg *dst,
	   struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x30000000;
	e->inst[1] = 0xc4000000;

	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);

	if (src1->type == P_IMMD) {
		e->inst[1] |= (1 << 20);
		e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16;
	} else
		set_src_1(pc, src1, e);

	if (dir != TGSI_OPCODE_SHL)
		e->inst[1] |= (1 << 29); /* shift right */

	if (dir == TGSI_OPCODE_ISHR)
		e->inst[1] |= (1 << 27); /* arithmetic (sign-extending) */

	emit(pc, e);
}

/* Shift left by the constant s; a negative s shifts right by -s. */
static void
emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst,
	     struct nv50_reg *src, int s)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x30000000;
	e->inst[1] = 0xc4100000;
	if (s < 0) {
		e->inst[1] |= 1 << 29;
		s = -s;
	}
	e->inst[1] |= ((s & 0x7f) << 16);

	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);

	emit(pc, e);
}

/* dst = src0 * src1 + src2.  The multiply is negated when exactly one
 * of src0/src1 carries NEG (XOR of the flags); src2's NEG sets the
 * separate addend-negate bit.
 */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src2->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}

/* MSB: dst = src0 * src1 - src2, done as MAD with src2 negated
 * (modifier restored afterwards).
 */
static INLINE void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	src2->mod ^= NV50_MOD_NEG;
	emit_mad(pc, dst, src0, src1, src2);
	src2->mod ^= NV50_MOD_NEG;
}

#define NV50_FLOP_RCP 0
#define NV50_FLOP_RSQ 2
#define NV50_FLOP_LG2 3
#define NV50_FLOP_SIN 4
#define NV50_FLOP_COS 5
#define NV50_FLOP_EX2 6

/* rcp, rsqrt, lg2 support neg and abs */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub || src->mod) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);
	set_src_0_restricted(pc, src, e);

	/* only sub < 4 (rcp/rsq/lg2) accept source modifiers */
	assert(!src->mod || sub < 4);

	if (src->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;

	emit(pc, e);
}

/* pre-EX2 scaling step; emit_pow() runs this before NV50_FLOP_EX2. */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	if (src->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;

	emit(pc, e);
}

/* pre-SIN/COS reduction step, run before the SIN/COS flops. */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	if (src->mod & NV50_MOD_NEG)
		e->inst[1] |= 0x04000000;
	if (src->mod & NV50_MOD_ABS)
		e->inst[1] |= 0x00100000;

	emit(pc, e);
}

/* CVT rounding modes / flags (ORed into inst[1]): */
#define CVT_RN    (0x00 << 16)
#define CVT_FLOOR (0x02 << 16)
#define CVT_CEIL  (0x04 << 16)
#define CVT_TRUNC (0x06 << 16)
#define CVT_SAT   (0x08 << 16)
#define CVT_ABS   (0x10 << 16)

/* CVT type combos: CVT_<dst>_<src> */
#define CVT_X32_X32 0x04004000
#define CVT_X32_S32 0x04014000
#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32)
#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32)
#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32)
#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32)
#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32)
#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32)
#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32)
#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32)

#define CVT_NEG 0x20000000
#define CVT_RI  0x08000000

/* Emit a CVT (convert/round).  Folds the source's NEG/ABS modifiers
 * into cvn.  If wp >= 0 also write condition flags to predicate wp;
 * passing dst == NULL discards the result so only the predicate
 * write remains (see emit_kil).
 */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, uint32_t cvn)
{
	struct nv50_program_exec *e;

	e = exec(pc);

	if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG;
	if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS;

	e->inst[0] = 0xa0000000;
	e->inst[1] = cvn;
	set_long(pc, e);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}

/* nv50 Condition codes:
 *  0x1 = LT
 *  0x2 = EQ
 *  0x3 = LE
 *  0x4 = GT
 *  0x5 = NE
 *  0x6 = GE
 *  0x7 = set condition code ?
 * (used before bra.lt/le/gt/ge)
 *  0x8 = unordered bit (allows NaN)
 *
 *  mode = 0x04 (u32), 0x0c (s32), 0x80 (f32)
 */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode)
{
	/* cc permutation applied when src0/src1 get swapped (LT<->GT etc.) */
	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = cc_swapped[ccode & 7] | (ccode & 8);

	/* SET writes a temp; stage through one if dst isn't P_TEMP */
	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	set_long(pc, e);
	e->inst[0] |= 0x30000000 | (mode << 24);
	e->inst[1] |= 0x60000000 | (ccode << 14);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no destination: discard result, keep flag/pred writes */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);

	if (rdst && mode == 0x80) /* convert to float ? */
		emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}

/* Map a TGSI comparison opcode to hw condition code and type mode. */
static INLINE void
map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty)
{
	switch (op) {
	case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break;
	case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break;
	case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break;
	case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break;
	case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break;
	case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break;

	case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break;
	case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break;
	case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break;
	case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break;
	case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break;
	case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break;
	default:
		assert(0);
		return;
	}
}

/* Integer b32 ADD.  The hw can negate only one operand, so when both
 * sources carry NEG, src1's negation is first materialized via a CVT
 * into a scratch temp.
 */
static void
emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst,
	     struct nv50_reg *src0, struct nv50_reg *rsrc1)
{
	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *src1;

	e->inst[0] = 0x20000000;

	alloc_reg(pc, rsrc1);
	check_swap_src_0_1(pc, &src0, &rsrc1);

	src1 = rsrc1;
	if (src0->mod & rsrc1->mod & NV50_MOD_NEG) {
		src1 = temp_temp(pc, e);
		emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32);
	}

	/* short form requires a low temp/immediate src1 and allow32 */
	if (!pc->allow32 || src1->hw > 63 ||
	    (src1->type != P_TEMP && src1->type != P_IMMD))
		set_long(pc, e);

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);

	if (is_long(e)) {
		e->inst[1] |= 1 << 26;
		set_src_2(pc, src1, e);
	} else {
		e->inst[0] |= 0x8000;
		if (src1->type == P_IMMD)
			set_immd(pc, src1, e);
		else
			set_src_1(pc, src1, e);
	}

	if (src0->mod & NV50_MOD_NEG)
		e->inst[0] |= 1 << 28;
	else
	if (src1->mod & NV50_MOD_NEG)
		e->inst[0] |= 1 << 22;

	emit(pc, e);
}

/* 16 bit multiply-add; lh_0/lh_1 select low or high halves of
 * src0/src1.  src2 may be omitted from the encoding when it aliases
 * dst in the short form.
 */
static void
emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst,
	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1,
	     struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x60000000;
	if (!pc->allow32)
		set_long(pc, e);
	set_dst(pc, dst, e);

	set_half_src(pc, src0, lh_0, e, 9);
	set_half_src(pc, src1, lh_1, e, 16);
	alloc_reg(pc, src2);
	if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw))
		set_src_2(pc, src2, e);

	emit(pc, e);
}

/* 16 x 16 bit multiply of the selected register halves. */
static void
emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst,
	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x40000000;
	set_long(pc, e);
	set_dst(pc, dst, e);

	set_half_src(pc, src0, lh_0, e, 9);
	set_half_src(pc, src1, lh_1, e, 16);

	emit(pc, e);
}

/* SAD: sum of absolute differences with accumulator src2. */
static void
emit_sad(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0x50000000;
	if (!pc->allow32)
		set_long(pc, e);
	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	alloc_reg(pc, src2);
	if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw))
		set_src_2(pc, src2, e);

	if (is_long(e))
		e->inst[1] |= 0x0c << 24;
	else
		e->inst[0] |= 0x81 << 8;

	emit(pc, e);
}

/* floor(src) via CVT with round-to-integer. */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI);
}

/* dst = v^e, computed as 2^(e * log2(v)). */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);

	emit_flop(pc, NV50_FLOP_LG2, temp, v);
	emit_mul(pc, temp, temp, e);
	emit_preex2(pc, temp, temp);
	emit_flop(pc, NV50_FLOP_EX2, dst, temp);

	free_temp(pc, temp);
}

/* clamp src to [0, 1] */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32);
}

/* TGSI LIT; only the components enabled in mask are produced.  The
 * exponent (src.w) is clamped to +/- 127.999999 before emit_pow, and
 * dst.z is overwritten with zero under predicate $p0 — NOTE(review):
 * $p0 is written by the preceding MAX via set_pred_wr on exec_tail;
 * confirm the cc value 3 selects the intended (src.x <= 0) case.
 */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);
	struct nv50_reg *tmp[4] = { 0 };
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc, NULL);
		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);

		/* clamp the exponent to +/- 128 */
		tmp[3] = temp_temp(pc, NULL);
		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}

/* KIL: discard the fragment.  With a src, the discard is predicated
 * on src < 0, the predicate being written by a flag-setting CVT.
 */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	e = exec(pc);
	e->inst[0] = 0x00000002; /* discard */
	set_long(pc, e); /* sets cond code to ALWAYS */

	if (src) {
		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
		/* write to predicate reg */
		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
	}

	emit(pc, e);
}

/* Emit a flow control instruction (op selects bra/call/ret/...),
 * optionally predicated on (pred, cc).  Returned so the caller can
 * patch param.index (the branch target) later.
 */
static struct nv50_program_exec *
emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = (op << 28) | 2;
	set_long(pc, e);
	if (pred >= 0)
		set_pred(pc, cc, pred, e);

	emit(pc, e);
	return e;
}

/* breakaddr: push the address BRK will jump to */
static INLINE struct nv50_program_exec *
emit_breakaddr(struct nv50_pc *pc)
{
	return emit_control_flow(pc, 0x4, -1, 0);
}

static INLINE void
emit_break(struct nv50_pc *pc, int pred, unsigned cc)
{
	emit_control_flow(pc, 0x5, pred, cc);
}

static INLINE struct nv50_program_exec *
emit_joinat(struct nv50_pc *pc)
{
	return emit_control_flow(pc, 0xa, -1, 0);
}

static INLINE struct nv50_program_exec *
emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
{
	return emit_control_flow(pc, 0x1, pred, cc);
}

static INLINE struct nv50_program_exec *
emit_call(struct nv50_pc *pc, int pred, unsigned cc)
{
	return emit_control_flow(pc, 0x2, pred, cc);
}

static INLINE void
emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
{
	emit_control_flow(pc, 0x3, pred, cc);
}

/* Primitive command — NOTE(review): presumably the geometry program
 * EMIT/RESTART encodings; confirm against the GP output docs.
 */
static void
emit_prim_cmd(struct nv50_pc *pc, unsigned cmd)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xf0000000 | (cmd << 9);
	e->inst[1] = 0xc0000000;
	set_long(pc, e);

	emit(pc, e);
}

/* per-lane quad-op sub-operations: */
#define QOP_ADD 0
#define QOP_SUBR 1
#define QOP_SUB 2
#define QOP_MOV_SRC1 3

/* For a quad of
 * threads / top left, top right, bottom left, bottom right
 * pixels, do a different operation, and take src0 from a specific thread.
 */
static void
emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xc0000000;
	e->inst[1] = 0x80000000;
	set_long(pc, e);
	e->inst[0] |= lane_src0 << 16;
	set_src_0(pc, src0, e);
	set_src_2(pc, src1, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		/* no destination: discard result, keep predicate write */
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	/* 2 bits of sub-op per lane, split across inst[0]/inst[1] */
	e->inst[0] |= (qop & 3) << 20;
	e->inst[1] |= (qop >> 2) << 22;

	emit(pc, e);
}

/* Cube map coordinate setup: divide all coordinates by the major axis,
 * t.xyz = src.xyz * rcp(max(|x|, |y|, |z|)).
 */
static void
load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, unsigned arg, boolean proj)
{
	/* save modifiers; ABS is forced on for the max() computation */
	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };

	src[0]->mod |= NV50_MOD_ABS;
	src[1]->mod |= NV50_MOD_ABS;
	src[2]->mod |= NV50_MOD_ABS;

	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);

	src[0]->mod = mod[0];
	src[1]->mod = mod[1];
	src[2]->mod = mod[2];

	if (proj && 0 /* looks more correct without this */)
		emit_mul(pc, t[2], t[2], src[3]);
	else
	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
		emit_mov(pc, t[3], src[3]);

	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);

	emit_mul(pc, t[0], src[0], t[2]);
	emit_mul(pc, t[1], src[1], t[2]);
	emit_mul(pc, t[2], src[2], t[2]);
}

/* Projective coordinates (divide by src.w).  For interpolated FP
 * inputs the division is folded into interpolation (multiply by the
 * interpolated 1/w); otherwise an explicit RCP + MUL sequence is used.
 */
static void
load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
		     struct nv50_reg **src, unsigned dim, unsigned arg)
{
	unsigned c, mode;

	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;

		t[3]->rhw = src[3]->rhw;
		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);

		for (c = 0; c < dim; ++c) {
			t[c]->rhw = src[c]->rhw;
			emit_interp(pc, t[c], t[3], mode);
		}
		if (arg != dim) { /* depth reference value */
			t[dim]->rhw = src[2]->rhw;
			emit_interp(pc, t[dim], t[3], mode);
		}
	} else {
		/* XXX: for some reason the blob sometimes uses MAD
		 * (mad f32 $rX $rY $rZ neg $r63)
		 */
		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
		for (c = 0; c < dim; ++c)
			emit_mul(pc, t[c], src[c], t[3]);
		if (arg != dim) /* depth reference value */
			emit_mul(pc, t[dim], src[2], t[3]);
	}
}

/* For a TGSI texture target, give the coordinate dimension (dim) and
 * the total argument count including a shadow reference (arg).
 */
static INLINE void
get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
{
	switch (type) {
	case TGSI_TEXTURE_1D:
		*arg = *dim = 1;
		break;
	case TGSI_TEXTURE_SHADOW1D:
		*dim = 1;
		*arg = 2;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_RECT:
		*arg = *dim = 2;
		break;
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT:
		*dim = 2;
		*arg = 3;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
		*dim = *arg = 3;
		break;
	default:
		assert(0);
		break;
	}
}

/* We shouldn't execute TEXLOD if any of the pixels in a quad have
 * different LOD values, so branch off groups of equal LOD.
 */
static void
emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
		     struct nv50_reg *src, struct nv50_program_exec *tex)
{
	struct nv50_program_exec *join_at;
	/* target = address just past the 9 long insns emitted below */
	unsigned i, target = pc->p->exec_size + 9 * 2;

	if (pc->p->type != PIPE_SHADER_FRAGMENT) {
		emit(pc, tex);
		return;
	}
	pc->allow32 = FALSE;

	/* Subtract lod of each pixel from lod of top left pixel, jump
	 * to the texlod insn if the result is 0, then repeat for the
	 * 2 other pixels.
	 */
	join_at = emit_joinat(pc);
	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
	emit_branch(pc, 0, 2)->param.index = target;

	for (i = 1; i < 4; ++i) {
		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
		emit_branch(pc, 0, 2)->param.index = target;
	}

	emit_mov(pc, tlod, src); /* target */
	emit(pc, tex); /* texlod */

	join_at->param.index = target + 2 * 2;
	JOIN_ON(emit_nop(pc)); /* join _after_ tex */
}

/* TXB with possibly different bias per pixel: group the quad's lanes
 * by equal bias and run the TEX once per group on a private register
 * set, so the implicitly computed derivatives stay correct.
 */
static void
emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
		      struct nv50_program_exec *tex)
{
	struct nv50_program_exec *e;
	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
	int r_pred = 0;
	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };

	pc->allow32 = FALSE;
	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);

	/* Subtract bias value of thread i from bias values of each thread,
	 * store result in r_pred, and set bit i in r_bits if result was 0.
	 */
	assert(arg < 4);
	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
		emit_mov(pc, r_bits, &imm_1248);
		set_pred(pc, 2, r_pred, pc->p->exec_tail);
	}
	emit_mov_to_pred(pc, r_pred, r_bits);

	/* The lanes of a quad are now grouped by the bit in r_pred they have
	 * set.  Put the input values for TEX into a new register set for each
	 * group and execute TEX only for a specific group.
	 * We cannot use the same register set for each group because we need
	 * the derivatives, which are implicitly calculated, to be correct.
	 */
	for (i = 1; i < 4; ++i) {
		alloc_temp4(pc, t123[i], 0);

		for (c = 0; c <= arg; ++c)
			emit_mov(pc, t123[i][c], t[c]);

		*(e = exec(pc)) = *(tex);
		e->inst[0] &= ~0x01fc;
		set_dst(pc, t123[i][0], e);
		set_pred(pc, cc[i], r_pred, e);
		emit(pc, e);
	}
	/* finally TEX on the original regs (where we kept the input) */
	set_pred(pc, cc[0], r_pred, tex);
	emit(pc, tex);

	/* put the 3 * n other results into regs for lane 0 */
	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
	for (i = 1; i < 4; ++i) {
		for (c = 0; c < n; ++c) {
			emit_mov(pc, t[c], t123[i][c]);
			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
		}
		free_temp4(pc, t123[i]);
	}

	emit_nop(pc);
	free_temp(pc, r_bits);
}

/* Emit TEX/TXB/TXL/TXP: stage the coordinates into a 128 bit aligned
 * quad of temps, emit the texture fetch, then move the enabled result
 * components into dst.
 */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type,
	 boolean proj, int bias_lod)
{
	struct nv50_reg *t[4];
	struct nv50_program_exec *e;
	unsigned c, dim, arg;

	/* t[i] must be within a single 128 bit super-reg */
	alloc_temp4(pc, t, 0);

	e = exec(pc);
	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	set_dst(pc, t[0], e);

	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
	e->inst[0] |= (unit << 9) /* | (unit << 17) */;

	/* live flag (don't set if TEX results affect input to another TEX): */
	/* e->inst[0] |= 0x00000004; */

	get_tex_dim(type, &dim, &arg);

	if (type == TGSI_TEXTURE_CUBE) {
		e->inst[0] |= 0x08000000;
		load_cube_tex_coords(pc, t, src, arg, proj);
	} else
	if (proj)
		load_proj_tex_coords(pc, t, src, dim, arg);
	else {
		for (c = 0; c < dim; c++)
			emit_mov(pc, t[c], src[c]);
		if (arg != dim) /* depth reference value (always src.z here) */
			emit_mov(pc, t[dim], src[2]);
	}

	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	if (!bias_lod) {
		e->inst[0] |= (arg - 1) << 22;
		emit(pc, e);
	} else
	if (bias_lod < 0) {
		assert(pc->p->type == PIPE_SHADER_FRAGMENT);
		e->inst[0] |= arg << 22;
		e->inst[1] |= 0x20000000; /* texbias */
		emit_mov(pc, t[arg], src[3]);
		emit_texbias_sequence(pc, t, arg, e);
	} else {
		e->inst[0] |= arg << 22;
		e->inst[1] |= 0x40000000; /* texlod */
		emit_mov(pc, t[arg], src[3]);
		emit_texlod_sequence(pc, t[arg], src[3], e);
	}

#if 1
	c = 0;
	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
	if (mask & 8) emit_mov(pc, dst[3], t[c]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}

/* DDX: x partial derivative; encoding differs for a negated source. */
static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000;
	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}

/* DDY: y partial derivative; see emit_ddx. */
static void
emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000;
	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}

/* Rewrite an already-built short (32 bit) instruction into its 64 bit
 * long form: m masks out short-form fields in inst[0], q holds their
 * relocated long-form positions for inst[1].
 */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x2:
	case 0x3:
		/* ADD, SUB, SUBR b32 */
		m = ~(0x8000 | (127 << 16));
		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
		break;
	case 0x5:
		/* SAD */
		m = ~(0x81 << 8);
		q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12);
		break;
	case 0x6:
		/* MAD u16 */
		q = (e->inst[0] & (0x7f << 2)) << 12;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++;

	e->inst[0] &= m;
	e->inst[1] |= q;
}

/* Some
 * operations support an optional negation flag. */
static int
get_supported_mods(const struct tgsi_full_instruction *insn, int i)
{
	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_ADD:
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_DDX:
	case TGSI_OPCODE_DDY:
	case TGSI_OPCODE_DP3:
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_KIL:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_MAD:
	case TGSI_OPCODE_MUL:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
	case TGSI_OPCODE_SCS:
	case TGSI_OPCODE_SIN:
	case TGSI_OPCODE_SUB:
		return NV50_MOD_NEG;
	case TGSI_OPCODE_MAX:
	case TGSI_OPCODE_MIN:
	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
		return NV50_MOD_ABS;
	case TGSI_OPCODE_CEIL:
	case TGSI_OPCODE_FLR:
	case TGSI_OPCODE_TRUNC:
		return NV50_MOD_NEG | NV50_MOD_ABS;
	case TGSI_OPCODE_F2I:
	case TGSI_OPCODE_F2U:
	case TGSI_OPCODE_I2F:
	case TGSI_OPCODE_U2F:
		return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
	case TGSI_OPCODE_UADD:
		return NV50_MOD_NEG | NV50_MOD_I32;
	case TGSI_OPCODE_SAD:
	case TGSI_OPCODE_SHL:
	case TGSI_OPCODE_IMAX:
	case TGSI_OPCODE_IMIN:
	case TGSI_OPCODE_ISHR:
	case TGSI_OPCODE_NOT:
	case TGSI_OPCODE_UMAD:
	case TGSI_OPCODE_UMAX:
	case TGSI_OPCODE_UMIN:
	case TGSI_OPCODE_UMUL:
	case TGSI_OPCODE_USHR:
		return NV50_MOD_I32;
	default:
		return 0;
	}
}

/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
	unsigned x, mask = insn->Dst[0].Register.WriteMask;

	switch (insn->Instruction.Opcode) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_SIN:
		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
	case TGSI_OPCODE_DP3:
		return 0x7;
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_KIL: /* WriteMask ignored */
		return 0xf;
	case TGSI_OPCODE_DST:
		return mask & (c ? 0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_EXP:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_LOG:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		return 0x1;
	case TGSI_OPCODE_IF:
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_texture *tex;

		assert(insn->Instruction.Texture);
		tex = &insn->Texture;

		mask = 0x7;
		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
			mask |= 0x8; /* bias, lod or proj */

		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_SHADOW1D:
			mask &= 0x5;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* cross product: each dst component reads two others */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}

/* Resolve TGSI destination register component c to an nv50_reg. */
static struct nv50_reg *
tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
{
	switch (dst->Register.File) {
	case TGSI_FILE_TEMPORARY:
		return &pc->temp[dst->Register.Index * 4 + c];
	case TGSI_FILE_OUTPUT:
		return &pc->result[dst->Register.Index * 4 + c];
	case TGSI_FILE_ADDRESS:
	{
		/* address regs are allocated lazily on first write */
		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
		if (!r) {
			r = get_address_reg(pc, NULL);
			r->index = dst->Register.Index * 4 + c;
			pc->addr[r->index] = r;
		}
		assert(r);
		return r;
	}
	case TGSI_FILE_NULL:
		return NULL;
	case TGSI_FILE_SYSTEM_VALUE:
		assert(pc->sysval[dst->Register.Index].type == P_RESULT);
		assert(c == 0);
		return &pc->sysval[dst->Register.Index];
	default:
		break;
	}

	return NULL;
}

/* Resolve TGSI source register channel chan to an nv50_reg, applying
 * the register's sign mode.  mod is the set of modifiers the consuming
 * op supports; unsupported modifiers are folded in via an explicit CVT
 * into a scratch temp.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 int mod)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp = NULL;
	unsigned sgn, c, swz, cvn;

	if (src->Register.File != TGSI_FILE_CONSTANT)
		assert(!src->Register.Indirect);

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_swizzle(src, chan);
	switch (c) {
	case TGSI_SWIZZLE_X:
	case TGSI_SWIZZLE_Y:
	case TGSI_SWIZZLE_Z:
	case TGSI_SWIZZLE_W:
		switch (src->Register.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->Register.Index * 4 + c];

			if (!src->Dimension.Dimension)
				break;
			/* per-vertex GP input: record the vertex index */
			r = reg_instance(pc, r);
			r->vtx = src->Dimension.Index;

			if (!src->Dimension.Indirect)
				break;
			swz = tgsi_util_get_src_register_swizzle(
				&src->DimIndirect, 0);
			r->acc = -1;
			r->indirect[1] = src->DimIndirect.Index * 4 + swz;
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			if (!src->Register.Indirect) {
				r = &pc->param[src->Register.Index * 4 + c];
				break;
			}
			/* Indicate indirection by setting r->acc < 0 and
			 * use the index field to select the address reg.
			 */
			r = reg_instance(pc, NULL);
			ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c);

			swz = tgsi_util_get_src_register_swizzle(
				&src->Indirect, 0);
			r->acc = -1;
			r->indirect[0] = src->Indirect.Index * 4 + swz;
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->Register.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			return NULL;
		case TGSI_FILE_ADDRESS:
			r = pc->addr[src->Register.Index * 4 + c];
			assert(r);
			break;
		case TGSI_FILE_SYSTEM_VALUE:
			assert(c == 0);
			r = &pc->sysval[src->Register.Index];
			break;
		default:
			assert(0);
			break;
		}
		break;
	default:
		assert(0);
		break;
	}

	cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32;

	switch (sgn) {
	case TGSI_UTIL_SIGN_CLEAR:
		r->mod = NV50_MOD_ABS;
		break;
	case TGSI_UTIL_SIGN_SET:
		r->mod = NV50_MOD_NEG_ABS;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		r->mod = NV50_MOD_NEG;
		break;
	default:
		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
		break;
	}

	/* apply modifiers the op can't encode via an explicit convert */
	if ((r->mod & mod) != r->mod) {
		temp = temp_temp(pc, NULL);
		emit_cvt(pc, temp, r, -1, cvn);
		r->mod = 0;
		r = temp;
	} else
		r->mod |= mod & NV50_MOD_I32;

	assert(r);
	if (r->acc >= 0 && r->vtx < 0 && r != temp)
		return reg_instance(pc, r); /* will clear r->mod */
	return r;
}

/* return TRUE for ops that produce only a single result */
static boolean
is_scalar_op(unsigned op)
{
	switch (op) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_DP2:
	case TGSI_OPCODE_DP3:
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SIN:
	/*
	case TGSI_OPCODE_KIL:
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	*/
		return TRUE;
	default:
		return FALSE;
	}
}

/* Returns a bitmask indicating which dst components depend
 * on source s, component c (reverse of nv50_tgsi_src_mask).
 */
static unsigned
nv50_tgsi_dst_revdep(unsigned op, int s, int c)
{
	if (is_scalar_op(op))
		return 0x1;

	switch (op) {
	case TGSI_OPCODE_DST:
		return (1 << c) & (s ? 0xa : 0x6);
	case TGSI_OPCODE_XPD:
		switch (c) {
		case 0: return 0x6;
		case 1: return 0x5;
		case 2: return 0x3;
		case 3: return 0x0;
		default:
			assert(0);
			return 0x0;
		}
	case TGSI_OPCODE_EXP:
	case TGSI_OPCODE_LOG:
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXP:
		/* these take care of dangerous swizzles themselves */
		return 0x0;
	case TGSI_OPCODE_IF:
	case TGSI_OPCODE_KIL:
		/* don't call this function for these ops */
		assert(0);
		return 0;
	default:
		/* linear vector instruction */
		return (1 << c);
	}
}

/* Does long (non-immediate) instruction e carry predicate cc ? */
static INLINE boolean
has_pred(struct nv50_program_exec *e, unsigned cc)
{
	if (!is_long(e) || is_immd(e))
		return FALSE;
	return ((e->inst[1] & 0x780) == (cc << 7));
}

/* on ENDIF see if we can do "@p0.neu single_op" instead of:
 * join_at ENDIF
 * @p0.eq bra ENDIF
 * single_op
 * ENDIF: nop.join
 */
static boolean
nv50_kill_branch(struct nv50_pc *pc)
{
	int lvl = pc->if_lvl;

	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
		return FALSE;
	if (is_immd(pc->p->exec_tail))
		return FALSE;

	/* if ccode == 'true', the BRA is from an ELSE and the predicate
	 * reg may no longer be valid, since we currently always use $p0
	 */
	if (has_pred(pc->if_insn[lvl], 0xf))
		return FALSE;
	assert(pc->if_insn[lvl] && pc->if_join[lvl]);

	/*
We'll use the exec allocated for JOIN_AT (we can't easily 2517 * access nv50_program_exec's prev). 2518 */ 2519 pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ 2520 2521 *pc->if_join[lvl] = *pc->p->exec_tail; 2522 2523 FREE(pc->if_insn[lvl]); 2524 FREE(pc->p->exec_tail); 2525 2526 pc->p->exec_tail = pc->if_join[lvl]; 2527 pc->p->exec_tail->next = NULL; 2528 set_pred(pc, 0xd, 0, pc->p->exec_tail); 2529 2530 return TRUE; 2531} 2532 2533static void 2534nv50_fp_move_results(struct nv50_pc *pc) 2535{ 2536 struct nv50_reg reg; 2537 unsigned i; 2538 2539 ctor_reg(®, P_TEMP, -1, -1); 2540 2541 for (i = 0; i < pc->result_nr * 4; ++i) { 2542 if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) 2543 continue; 2544 if (pc->result[i].rhw != pc->result[i].hw) { 2545 reg.hw = pc->result[i].rhw; 2546 emit_mov(pc, ®, &pc->result[i]); 2547 } 2548 } 2549} 2550 2551static boolean 2552nv50_program_tx_insn(struct nv50_pc *pc, 2553 const struct tgsi_full_instruction *inst) 2554{ 2555 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 2556 unsigned mask, sat, unit = 0; 2557 int i, c; 2558 2559 mask = inst->Dst[0].Register.WriteMask; 2560 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 2561 2562 memset(src, 0, sizeof(src)); 2563 2564 for (c = 0; c < 4; c++) { 2565 if ((mask & (1 << c)) && !pc->r_dst[c]) 2566 dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); 2567 else 2568 dst[c] = pc->r_dst[c]; 2569 rdst[c] = dst[c]; 2570 } 2571 2572 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 2573 const struct tgsi_full_src_register *fs = &inst->Src[i]; 2574 unsigned src_mask; 2575 int mod_supp; 2576 2577 src_mask = nv50_tgsi_src_mask(inst, i); 2578 mod_supp = get_supported_mods(inst, i); 2579 2580 if (fs->Register.File == TGSI_FILE_SAMPLER) 2581 unit = fs->Register.Index; 2582 2583 for (c = 0; c < 4; c++) 2584 if (src_mask & (1 << c)) 2585 src[i][c] = tgsi_src(pc, c, fs, mod_supp); 2586 } 2587 2588 brdc = temp = pc->r_brdc; 2589 if (brdc && brdc->type != P_TEMP) { 2590 temp = 
temp_temp(pc, NULL); 2591 if (sat) 2592 brdc = temp; 2593 } else 2594 if (sat) { 2595 for (c = 0; c < 4; c++) { 2596 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 2597 continue; 2598 /* rdst[c] = dst[c]; */ /* done above */ 2599 dst[c] = temp_temp(pc, NULL); 2600 } 2601 } 2602 2603 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 2604 2605 switch (inst->Instruction.Opcode) { 2606 case TGSI_OPCODE_ABS: 2607 for (c = 0; c < 4; c++) { 2608 if (!(mask & (1 << c))) 2609 continue; 2610 emit_cvt(pc, dst[c], src[0][c], -1, 2611 CVT_ABS | CVT_F32_F32); 2612 } 2613 break; 2614 case TGSI_OPCODE_ADD: 2615 for (c = 0; c < 4; c++) { 2616 if (!(mask & (1 << c))) 2617 continue; 2618 emit_add(pc, dst[c], src[0][c], src[1][c]); 2619 } 2620 break; 2621 case TGSI_OPCODE_AND: 2622 case TGSI_OPCODE_XOR: 2623 case TGSI_OPCODE_OR: 2624 for (c = 0; c < 4; c++) { 2625 if (!(mask & (1 << c))) 2626 continue; 2627 emit_bitop2(pc, dst[c], src[0][c], src[1][c], 2628 inst->Instruction.Opcode); 2629 } 2630 break; 2631 case TGSI_OPCODE_ARL: 2632 temp = temp_temp(pc, NULL); 2633 for (c = 0; c < 4; c++) { 2634 if (!(mask & (1 << c))) 2635 continue; 2636 emit_cvt(pc, temp, src[0][c], -1, 2637 CVT_FLOOR | CVT_S32_F32); 2638 emit_arl(pc, dst[c], temp, 4); 2639 } 2640 break; 2641 case TGSI_OPCODE_BGNLOOP: 2642 pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); 2643 pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; 2644 terminate_mbb(pc); 2645 break; 2646 case TGSI_OPCODE_BGNSUB: 2647 assert(!pc->in_subroutine); 2648 pc->in_subroutine = TRUE; 2649 /* probably not necessary, but align to 8 byte boundary */ 2650 if (!is_long(pc->p->exec_tail)) 2651 convert_to_long(pc, pc->p->exec_tail); 2652 break; 2653 case TGSI_OPCODE_BRK: 2654 assert(pc->loop_lvl > 0); 2655 emit_break(pc, -1, 0); 2656 break; 2657 case TGSI_OPCODE_CAL: 2658 assert(inst->Label.Label < pc->insn_nr); 2659 emit_call(pc, -1, 0)->param.index = inst->Label.Label; 2660 /* replaced by actual offset in nv50_program_fixup_insns */ 2661 
break; 2662 case TGSI_OPCODE_CEIL: 2663 for (c = 0; c < 4; c++) { 2664 if (!(mask & (1 << c))) 2665 continue; 2666 emit_cvt(pc, dst[c], src[0][c], -1, 2667 CVT_CEIL | CVT_F32_F32 | CVT_RI); 2668 } 2669 break; 2670 case TGSI_OPCODE_CMP: 2671 pc->allow32 = FALSE; 2672 for (c = 0; c < 4; c++) { 2673 if (!(mask & (1 << c))) 2674 continue; 2675 emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); 2676 emit_mov(pc, dst[c], src[1][c]); 2677 set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ 2678 emit_mov(pc, dst[c], src[2][c]); 2679 set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ 2680 } 2681 break; 2682 case TGSI_OPCODE_CONT: 2683 assert(pc->loop_lvl > 0); 2684 emit_branch(pc, -1, 0)->param.index = 2685 pc->loop_pos[pc->loop_lvl - 1]; 2686 break; 2687 case TGSI_OPCODE_COS: 2688 if (mask & 8) { 2689 emit_precossin(pc, temp, src[0][3]); 2690 emit_flop(pc, NV50_FLOP_COS, dst[3], temp); 2691 if (!(mask &= 7)) 2692 break; 2693 if (temp == dst[3]) 2694 temp = brdc = temp_temp(pc, NULL); 2695 } 2696 emit_precossin(pc, temp, src[0][0]); 2697 emit_flop(pc, NV50_FLOP_COS, brdc, temp); 2698 break; 2699 case TGSI_OPCODE_DDX: 2700 for (c = 0; c < 4; c++) { 2701 if (!(mask & (1 << c))) 2702 continue; 2703 emit_ddx(pc, dst[c], src[0][c]); 2704 } 2705 break; 2706 case TGSI_OPCODE_DDY: 2707 for (c = 0; c < 4; c++) { 2708 if (!(mask & (1 << c))) 2709 continue; 2710 emit_ddy(pc, dst[c], src[0][c]); 2711 } 2712 break; 2713 case TGSI_OPCODE_DP3: 2714 emit_mul(pc, temp, src[0][0], src[1][0]); 2715 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2716 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 2717 break; 2718 case TGSI_OPCODE_DP4: 2719 emit_mul(pc, temp, src[0][0], src[1][0]); 2720 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2721 emit_mad(pc, temp, src[0][2], src[1][2], temp); 2722 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 2723 break; 2724 case TGSI_OPCODE_DPH: 2725 emit_mul(pc, temp, src[0][0], src[1][0]); 2726 emit_mad(pc, temp, src[0][1], src[1][1], temp); 2727 emit_mad(pc, temp, 
src[0][2], src[1][2], temp); 2728 emit_add(pc, brdc, src[1][3], temp); 2729 break; 2730 case TGSI_OPCODE_DST: 2731 if (mask & (1 << 1)) 2732 emit_mul(pc, dst[1], src[0][1], src[1][1]); 2733 if (mask & (1 << 2)) 2734 emit_mov(pc, dst[2], src[0][2]); 2735 if (mask & (1 << 3)) 2736 emit_mov(pc, dst[3], src[1][3]); 2737 if (mask & (1 << 0)) 2738 emit_mov_immdval(pc, dst[0], 1.0f); 2739 break; 2740 case TGSI_OPCODE_ELSE: 2741 emit_branch(pc, -1, 0); 2742 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2743 pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; 2744 terminate_mbb(pc); 2745 break; 2746 case TGSI_OPCODE_EMIT: 2747 emit_prim_cmd(pc, 1); 2748 break; 2749 case TGSI_OPCODE_ENDIF: 2750 pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; 2751 2752 /* try to replace branch over 1 insn with a predicated insn */ 2753 if (nv50_kill_branch(pc) == TRUE) 2754 break; 2755 2756 if (pc->if_join[pc->if_lvl]) { 2757 pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; 2758 pc->if_join[pc->if_lvl] = NULL; 2759 } 2760 terminate_mbb(pc); 2761 /* emit a NOP as join point, we could set it on the next 2762 * one, but would have to make sure it is long and !immd 2763 */ 2764 JOIN_ON(emit_nop(pc)); 2765 break; 2766 case TGSI_OPCODE_ENDLOOP: 2767 emit_branch(pc, -1, 0)->param.index = 2768 pc->loop_pos[--pc->loop_lvl]; 2769 pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; 2770 terminate_mbb(pc); 2771 break; 2772 case TGSI_OPCODE_ENDPRIM: 2773 emit_prim_cmd(pc, 2); 2774 break; 2775 case TGSI_OPCODE_ENDSUB: 2776 assert(pc->in_subroutine); 2777 terminate_mbb(pc); 2778 pc->in_subroutine = FALSE; 2779 break; 2780 case TGSI_OPCODE_EX2: 2781 emit_preex2(pc, temp, src[0][0]); 2782 emit_flop(pc, NV50_FLOP_EX2, brdc, temp); 2783 break; 2784 case TGSI_OPCODE_EXP: 2785 { 2786 struct nv50_reg *t[2]; 2787 2788 assert(!temp); 2789 t[0] = temp_temp(pc, NULL); 2790 t[1] = temp_temp(pc, NULL); 2791 2792 if (mask & 0x6) 2793 emit_mov(pc, t[0], src[0][0]); 2794 if (mask & 0x3) 
2795 emit_flr(pc, t[1], src[0][0]); 2796 2797 if (mask & (1 << 1)) 2798 emit_sub(pc, dst[1], t[0], t[1]); 2799 if (mask & (1 << 0)) { 2800 emit_preex2(pc, t[1], t[1]); 2801 emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); 2802 } 2803 if (mask & (1 << 2)) { 2804 emit_preex2(pc, t[0], t[0]); 2805 emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); 2806 } 2807 if (mask & (1 << 3)) 2808 emit_mov_immdval(pc, dst[3], 1.0f); 2809 } 2810 break; 2811 case TGSI_OPCODE_F2I: 2812 for (c = 0; c < 4; c++) { 2813 if (!(mask & (1 << c))) 2814 continue; 2815 emit_cvt(pc, dst[c], src[0][c], -1, 2816 CVT_TRUNC | CVT_S32_F32); 2817 } 2818 break; 2819 case TGSI_OPCODE_F2U: 2820 for (c = 0; c < 4; c++) { 2821 if (!(mask & (1 << c))) 2822 continue; 2823 emit_cvt(pc, dst[c], src[0][c], -1, 2824 CVT_TRUNC | CVT_U32_F32); 2825 } 2826 break; 2827 case TGSI_OPCODE_FLR: 2828 for (c = 0; c < 4; c++) { 2829 if (!(mask & (1 << c))) 2830 continue; 2831 emit_flr(pc, dst[c], src[0][c]); 2832 } 2833 break; 2834 case TGSI_OPCODE_FRC: 2835 temp = temp_temp(pc, NULL); 2836 for (c = 0; c < 4; c++) { 2837 if (!(mask & (1 << c))) 2838 continue; 2839 emit_flr(pc, temp, src[0][c]); 2840 emit_sub(pc, dst[c], src[0][c], temp); 2841 } 2842 break; 2843 case TGSI_OPCODE_I2F: 2844 for (c = 0; c < 4; c++) { 2845 if (!(mask & (1 << c))) 2846 continue; 2847 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); 2848 } 2849 break; 2850 case TGSI_OPCODE_IF: 2851 assert(pc->if_lvl < NV50_MAX_COND_NESTING); 2852 emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); 2853 pc->if_join[pc->if_lvl] = emit_joinat(pc); 2854 pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; 2855 terminate_mbb(pc); 2856 break; 2857 case TGSI_OPCODE_IMAX: 2858 for (c = 0; c < 4; c++) { 2859 if (!(mask & (1 << c))) 2860 continue; 2861 emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]); 2862 } 2863 break; 2864 case TGSI_OPCODE_IMIN: 2865 for (c = 0; c < 4; c++) { 2866 if (!(mask & (1 << c))) 2867 continue; 2868 emit_minmax(pc, 0x0ac, dst[c], src[0][c], 
src[1][c]); 2869 } 2870 break; 2871 case TGSI_OPCODE_INEG: 2872 for (c = 0; c < 4; c++) { 2873 if (!(mask & (1 << c))) 2874 continue; 2875 emit_cvt(pc, dst[c], src[0][c], -1, 2876 CVT_S32_S32 | CVT_NEG); 2877 } 2878 break; 2879 case TGSI_OPCODE_KIL: 2880 assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); 2881 emit_kil(pc, src[0][0]); 2882 emit_kil(pc, src[0][1]); 2883 emit_kil(pc, src[0][2]); 2884 emit_kil(pc, src[0][3]); 2885 break; 2886 case TGSI_OPCODE_KILP: 2887 emit_kil(pc, NULL); 2888 break; 2889 case TGSI_OPCODE_LIT: 2890 emit_lit(pc, &dst[0], mask, &src[0][0]); 2891 break; 2892 case TGSI_OPCODE_LG2: 2893 emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); 2894 break; 2895 case TGSI_OPCODE_LOG: 2896 { 2897 struct nv50_reg *t[2]; 2898 2899 t[0] = temp_temp(pc, NULL); 2900 if (mask & (1 << 1)) 2901 t[1] = temp_temp(pc, NULL); 2902 else 2903 t[1] = t[0]; 2904 2905 emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); 2906 emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); 2907 if (mask & (1 << 2)) 2908 emit_mov(pc, dst[2], t[1]); 2909 emit_flr(pc, t[1], t[1]); 2910 if (mask & (1 << 0)) 2911 emit_mov(pc, dst[0], t[1]); 2912 if (mask & (1 << 1)) { 2913 t[1]->mod = NV50_MOD_NEG; 2914 emit_preex2(pc, t[1], t[1]); 2915 t[1]->mod = 0; 2916 emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); 2917 emit_mul(pc, dst[1], t[0], t[1]); 2918 } 2919 if (mask & (1 << 3)) 2920 emit_mov_immdval(pc, dst[3], 1.0f); 2921 } 2922 break; 2923 case TGSI_OPCODE_LRP: 2924 temp = temp_temp(pc, NULL); 2925 for (c = 0; c < 4; c++) { 2926 if (!(mask & (1 << c))) 2927 continue; 2928 emit_sub(pc, temp, src[1][c], src[2][c]); 2929 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 2930 } 2931 break; 2932 case TGSI_OPCODE_MAD: 2933 for (c = 0; c < 4; c++) { 2934 if (!(mask & (1 << c))) 2935 continue; 2936 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 2937 } 2938 break; 2939 case TGSI_OPCODE_MAX: 2940 for (c = 0; c < 4; c++) { 2941 if (!(mask & (1 << c))) 2942 continue; 2943 emit_minmax(pc, 0x880, 
dst[c], src[0][c], src[1][c]); 2944 } 2945 break; 2946 case TGSI_OPCODE_MIN: 2947 for (c = 0; c < 4; c++) { 2948 if (!(mask & (1 << c))) 2949 continue; 2950 emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]); 2951 } 2952 break; 2953 case TGSI_OPCODE_MOV: 2954 for (c = 0; c < 4; c++) { 2955 if (!(mask & (1 << c))) 2956 continue; 2957 emit_mov(pc, dst[c], src[0][c]); 2958 } 2959 break; 2960 case TGSI_OPCODE_MUL: 2961 for (c = 0; c < 4; c++) { 2962 if (!(mask & (1 << c))) 2963 continue; 2964 emit_mul(pc, dst[c], src[0][c], src[1][c]); 2965 } 2966 break; 2967 case TGSI_OPCODE_NOT: 2968 for (c = 0; c < 4; c++) { 2969 if (!(mask & (1 << c))) 2970 continue; 2971 emit_not(pc, dst[c], src[0][c]); 2972 } 2973 break; 2974 case TGSI_OPCODE_POW: 2975 emit_pow(pc, brdc, src[0][0], src[1][0]); 2976 break; 2977 case TGSI_OPCODE_RCP: 2978 if (!sat && popcnt4(mask) == 1) 2979 brdc = dst[ffs(mask) - 1]; 2980 emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); 2981 break; 2982 case TGSI_OPCODE_RET: 2983 if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) 2984 nv50_fp_move_results(pc); 2985 emit_ret(pc, -1, 0); 2986 break; 2987 case TGSI_OPCODE_RSQ: 2988 if (!sat && popcnt4(mask) == 1) 2989 brdc = dst[ffs(mask) - 1]; 2990 src[0][0]->mod |= NV50_MOD_ABS; 2991 emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); 2992 break; 2993 case TGSI_OPCODE_SAD: 2994 for (c = 0; c < 4; c++) { 2995 if (!(mask & (1 << c))) 2996 continue; 2997 emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 2998 } 2999 break; 3000 case TGSI_OPCODE_SCS: 3001 temp = temp_temp(pc, NULL); 3002 if (mask & 3) 3003 emit_precossin(pc, temp, src[0][0]); 3004 if (mask & (1 << 0)) 3005 emit_flop(pc, NV50_FLOP_COS, dst[0], temp); 3006 if (mask & (1 << 1)) 3007 emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); 3008 if (mask & (1 << 2)) 3009 emit_mov_immdval(pc, dst[2], 0.0); 3010 if (mask & (1 << 3)) 3011 emit_mov_immdval(pc, dst[3], 1.0); 3012 break; 3013 case TGSI_OPCODE_SHL: 3014 case TGSI_OPCODE_ISHR: 3015 case 
TGSI_OPCODE_USHR: 3016 for (c = 0; c < 4; c++) { 3017 if (!(mask & (1 << c))) 3018 continue; 3019 emit_shift(pc, dst[c], src[0][c], src[1][c], 3020 inst->Instruction.Opcode); 3021 } 3022 break; 3023 case TGSI_OPCODE_SIN: 3024 if (mask & 8) { 3025 emit_precossin(pc, temp, src[0][3]); 3026 emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); 3027 if (!(mask &= 7)) 3028 break; 3029 if (temp == dst[3]) 3030 temp = brdc = temp_temp(pc, NULL); 3031 } 3032 emit_precossin(pc, temp, src[0][0]); 3033 emit_flop(pc, NV50_FLOP_SIN, brdc, temp); 3034 break; 3035 case TGSI_OPCODE_SLT: 3036 case TGSI_OPCODE_SGE: 3037 case TGSI_OPCODE_SEQ: 3038 case TGSI_OPCODE_SGT: 3039 case TGSI_OPCODE_SLE: 3040 case TGSI_OPCODE_SNE: 3041 case TGSI_OPCODE_ISLT: 3042 case TGSI_OPCODE_ISGE: 3043 case TGSI_OPCODE_USEQ: 3044 case TGSI_OPCODE_USGE: 3045 case TGSI_OPCODE_USLT: 3046 case TGSI_OPCODE_USNE: 3047 { 3048 uint8_t cc, ty; 3049 3050 map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty); 3051 3052 for (c = 0; c < 4; c++) { 3053 if (!(mask & (1 << c))) 3054 continue; 3055 emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty); 3056 } 3057 } 3058 break; 3059 case TGSI_OPCODE_SUB: 3060 for (c = 0; c < 4; c++) { 3061 if (!(mask & (1 << c))) 3062 continue; 3063 emit_sub(pc, dst[c], src[0][c], src[1][c]); 3064 } 3065 break; 3066 case TGSI_OPCODE_TEX: 3067 emit_tex(pc, dst, mask, src[0], unit, 3068 inst->Texture.Texture, FALSE, 0); 3069 break; 3070 case TGSI_OPCODE_TXB: 3071 emit_tex(pc, dst, mask, src[0], unit, 3072 inst->Texture.Texture, FALSE, -1); 3073 break; 3074 case TGSI_OPCODE_TXL: 3075 emit_tex(pc, dst, mask, src[0], unit, 3076 inst->Texture.Texture, FALSE, 1); 3077 break; 3078 case TGSI_OPCODE_TXP: 3079 emit_tex(pc, dst, mask, src[0], unit, 3080 inst->Texture.Texture, TRUE, 0); 3081 break; 3082 case TGSI_OPCODE_TRUNC: 3083 for (c = 0; c < 4; c++) { 3084 if (!(mask & (1 << c))) 3085 continue; 3086 emit_cvt(pc, dst[c], src[0][c], -1, 3087 CVT_TRUNC | CVT_F32_F32 | CVT_RI); 3088 } 3089 break; 3090 case 
TGSI_OPCODE_U2F: 3091 for (c = 0; c < 4; c++) { 3092 if (!(mask & (1 << c))) 3093 continue; 3094 emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); 3095 } 3096 break; 3097 case TGSI_OPCODE_UADD: 3098 for (c = 0; c < 4; c++) { 3099 if (!(mask & (1 << c))) 3100 continue; 3101 emit_add_b32(pc, dst[c], src[0][c], src[1][c]); 3102 } 3103 break; 3104 case TGSI_OPCODE_UMAX: 3105 for (c = 0; c < 4; c++) { 3106 if (!(mask & (1 << c))) 3107 continue; 3108 emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]); 3109 } 3110 break; 3111 case TGSI_OPCODE_UMIN: 3112 for (c = 0; c < 4; c++) { 3113 if (!(mask & (1 << c))) 3114 continue; 3115 emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]); 3116 } 3117 break; 3118 case TGSI_OPCODE_UMAD: 3119 { 3120 assert(!temp); 3121 temp = temp_temp(pc, NULL); 3122 for (c = 0; c < 4; c++) { 3123 if (!(mask & (1 << c))) 3124 continue; 3125 emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); 3126 emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, 3127 temp); 3128 emit_shl_imm(pc, temp, temp, 16); 3129 emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0, 3130 temp); 3131 emit_add_b32(pc, dst[c], temp, src[2][c]); 3132 } 3133 } 3134 break; 3135 case TGSI_OPCODE_UMUL: 3136 { 3137 assert(!temp); 3138 temp = temp_temp(pc, NULL); 3139 for (c = 0; c < 4; c++) { 3140 if (!(mask & (1 << c))) 3141 continue; 3142 emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); 3143 emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, 3144 temp); 3145 emit_shl_imm(pc, temp, temp, 16); 3146 emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0, 3147 temp); 3148 } 3149 } 3150 break; 3151 case TGSI_OPCODE_XPD: 3152 temp = temp_temp(pc, NULL); 3153 if (mask & (1 << 0)) { 3154 emit_mul(pc, temp, src[0][2], src[1][1]); 3155 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 3156 } 3157 if (mask & (1 << 1)) { 3158 emit_mul(pc, temp, src[0][0], src[1][2]); 3159 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 3160 } 3161 if (mask & (1 << 2)) { 3162 emit_mul(pc, temp, src[0][1], 
src[1][0]); 3163 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 3164 } 3165 if (mask & (1 << 3)) 3166 emit_mov_immdval(pc, dst[3], 1.0); 3167 break; 3168 case TGSI_OPCODE_END: 3169 if (pc->p->type == PIPE_SHADER_FRAGMENT) 3170 nv50_fp_move_results(pc); 3171 3172 /* last insn must be long so it can have the exit bit set */ 3173 if (!is_long(pc->p->exec_tail)) 3174 convert_to_long(pc, pc->p->exec_tail); 3175 else 3176 if (is_immd(pc->p->exec_tail) || 3177 is_join(pc->p->exec_tail) || 3178 is_control_flow(pc->p->exec_tail)) 3179 emit_nop(pc); 3180 3181 pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ 3182 3183 terminate_mbb(pc); 3184 break; 3185 default: 3186 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 3187 return FALSE; 3188 } 3189 3190 if (brdc) { 3191 if (sat) 3192 emit_sat(pc, brdc, brdc); 3193 for (c = 0; c < 4; c++) 3194 if ((mask & (1 << c)) && dst[c] != brdc) 3195 emit_mov(pc, dst[c], brdc); 3196 } else 3197 if (sat) { 3198 for (c = 0; c < 4; c++) { 3199 if (!(mask & (1 << c))) 3200 continue; 3201 /* In this case we saturate later, and dst[c] won't 3202 * be another temp_temp (and thus lost), since rdst 3203 * already is TEMP (see above). 
*/ 3204 if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) 3205 continue; 3206 emit_sat(pc, rdst[c], dst[c]); 3207 } 3208 } 3209 3210 kill_temp_temp(pc, NULL); 3211 pc->reg_instance_nr = 0; 3212 3213 return TRUE; 3214} 3215 3216static void 3217prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) 3218{ 3219 struct nv50_reg *r, *reg = NULL; 3220 const struct tgsi_full_src_register *src; 3221 const struct tgsi_dst_register *dst; 3222 unsigned i, c, k, mask; 3223 3224 dst = &insn->Dst[0].Register; 3225 mask = dst->WriteMask; 3226 3227 if (dst->File == TGSI_FILE_TEMPORARY) 3228 reg = pc->temp; 3229 else 3230 if (dst->File == TGSI_FILE_OUTPUT) { 3231 reg = pc->result; 3232 3233 if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && 3234 dst->Index == pc->edgeflag_out && 3235 insn->Src[0].Register.File == TGSI_FILE_INPUT) 3236 pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; 3237 } 3238 3239 if (reg) { 3240 for (c = 0; c < 4; c++) { 3241 if (!(mask & (1 << c))) 3242 continue; 3243 reg[dst->Index * 4 + c].acc = pc->insn_nr; 3244 } 3245 } 3246 3247 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 3248 src = &insn->Src[i]; 3249 3250 if (src->Register.File == TGSI_FILE_TEMPORARY) 3251 reg = pc->temp; 3252 else 3253 if (src->Register.File == TGSI_FILE_INPUT) 3254 reg = pc->attr; 3255 else 3256 continue; 3257 3258 mask = nv50_tgsi_src_mask(insn, i); 3259 3260 for (c = 0; c < 4; c++) { 3261 if (!(mask & (1 << c))) 3262 continue; 3263 k = tgsi_util_get_full_src_register_swizzle(src, c); 3264 3265 r = ®[src->Register.Index * 4 + k]; 3266 3267 /* If used before written, pre-allocate the reg, 3268 * lest we overwrite results from a subroutine. 3269 */ 3270 if (!r->acc && r->type == P_TEMP) 3271 alloc_reg(pc, r); 3272 3273 r->acc = pc->insn_nr; 3274 } 3275 } 3276} 3277 3278/* Returns a bitmask indicating which dst components need to be 3279 * written to temporaries first to avoid 'corrupting' sources. 
 *
 * m[i] (out) indicate component to write in the i-th position
 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
 */
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
	unsigned i, c, x, unsafe = 0;

	/* start from the identity write order */
	for (c = 0; c < 4; c++)
		m[c] = c;

	/* Swap as long as a dst component written earlier is depended on
	 * by one written later, but the next one isn't depended on by it.
	 */
	for (c = 0; c < 3; c++) {
		if (rdep[m[c + 1]] & (1 << m[c]))
			continue; /* if next one is depended on by us */
		for (i = c + 1; i < 4; i++)
			/* if we are depended on by a later one */
			if (rdep[m[c]] & (1 << m[i]))
				break;
		if (i == 4)
			continue;
		/* now, swap */
		x = m[c];
		m[c] = m[c + 1];
		m[c + 1] = x;

		/* restart */
		c = 0;
	}

	/* mark dependencies that could not be resolved by reordering */
	for (i = 0; i < 3; ++i)
		for (c = i + 1; c < 4; ++c)
			if (rdep[m[i]] & (1 << m[c]))
				unsafe |= (1 << i);

	/* NOTE: $unsafe is with respect to order, not component */
	return unsafe;
}

/* Select a suitable dst register for broadcasting scalar results,
 * or return NULL if we have to allocate an extra TEMP.
 *
 * If e.g. only 1 component is written, we may also emit the final
 * result to a write-only register.
 */
static struct nv50_reg *
tgsi_broadcast_dst(struct nv50_pc *pc,
		   const struct tgsi_full_dst_register *fd, unsigned mask)
{
	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
		/* pick a written component that is not itself a source */
		int c = ffs(~mask & fd->Register.WriteMask);
		if (c)
			return tgsi_dst(pc, c - 1, fd);
	} else {
		/* non-TEMP dst: only usable if it is the sole component
		 * written */
		int c = ffs(fd->Register.WriteMask) - 1;
		if ((1 << c) == fd->Register.WriteMask)
			return tgsi_dst(pc, c, fd);
	}

	return NULL;
}

/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reoder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->Dst[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		int ms = get_supported_mods(insn, i);

		fs = &insn->Src[i];
		/* only a src aliasing the dst register can be corrupted */
		if (fs->Register.File != fd->Register.File ||
		    fs->Register.Index != fd->Register.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			if (!(fd->Register.WriteMask & (1 << c)))
				continue;

			/* if the op can't take the sign mode directly,
			 * tgsi_src loads through a temporary anyway, so
			 * there is no aliasing hazard for this component
			 */
			if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG))
				continue;
			if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS))
				continue;
			if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}

static boolean
nv50_tgsi_insn(struct nv50_pc
 *pc, const union tgsi_full_token *tok)
{
	/* Wrapper around nv50_program_tx_insn: handles scalar-result
	 * broadcasting and splits instructions whose sources would be
	 * corrupted by earlier component writes (per scan_swizzle).
	 */
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.Dst[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc, NULL);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	/* no aliasing hazards: emit the instruction as-is */
	if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3]))
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	/* emit one component at a time in the reordered sequence,
	 * redirecting still-unsafe components into temporaries */
	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.Dst[0].Register.WriteMask =
			fd->Register.WriteMask & (1 << m[i]);

		if (!insn.Dst[0].Register.WriteMask)
			continue;

		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* copy redirected results to their real destinations */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}

/* Emit the interpolation instruction for one fragment input component,
 * lazily setting up the shared 1/w (or centroid) interpolant divisor.
 */
static void
load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *iv, **ppiv;
	unsigned mode = pc->interp_mode[reg->index];

	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
	iv = *ppiv;

	if ((mode & INTERP_PERSPECTIVE) && !iv) {
		iv = *ppiv = alloc_temp(pc, NULL);
		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;

		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
		emit_flop(pc, NV50_FLOP_RCP, iv, iv);

		/* XXX: when loading interpolants dynamically, move these
		 * to the program head, or make sure it can't be skipped.
		 */
	}

	emit_interp(pc, reg, iv, mode);
}

/* The face input is always at v[255] (varying space), with a
 * value of 0 for back-facing, and 0xffffffff for front-facing.
 */
static void
load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);
	int r_pred = 0;

	temp->rhw = 255;
	emit_interp(pc, temp, NULL, INTERP_FLAT);

	/* convert to float; the predicated second cvt patches up the
	 * back-facing (zero) case */
	emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32);

	emit_not(pc, temp, temp);
	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
	emit_cvt(pc, sv, temp, -1, CVT_F32_S32);
	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);

	free_temp(pc, temp);
}

/* Bias the instance-id system value by startInstance (read from the
 * aux constant buffer, buf_index 2, offset 24).
 */
static void
load_instance_id(struct nv50_pc *pc, unsigned index)
{
	struct nv50_reg reg, mem;

	ctor_reg(&reg, P_TEMP, -1, -1);
	ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */
	mem.buf_index = 2;

	emit_add_b32(pc, &reg, &pc->sysval[index], &mem);
	pc->sysval[index] = reg;
}

/* Fill in semantic name/index for all in/out slots from the TGSI
 * scan info, using each slot's recorded declaration id.
 */
static void
copy_semantic_info(struct nv50_program *p)
{
	unsigned i, id;

	for (i = 0; i < p->cfg.in_nr; ++i) {
		id = p->cfg.in[i].id;
		p->cfg.in[i].sn = p->info.input_semantic_name[id];
		p->cfg.in[i].si = p->info.input_semantic_index[id];
	}

	for (i = 0; i < p->cfg.out_nr; ++i) {
		id = p->cfg.out[i].id;
		p->cfg.out[i].sn = p->info.output_semantic_name[id];
		p->cfg.out[i].si = p->info.output_semantic_index[id];
	}
}

static boolean
/* First pass over the program's TGSI token stream: collect immediates,
 * record declaration info (interpolation modes, semantic slots, system
 * values), count instructions, then assign hardware input/output register
 * slots according to the shader type (VP/GP vs. FP).
 * Returns TRUE on success, FALSE on an unsupported declaration file.
 */
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			/* stash each immediate as a 4 x f32 constant */
			ctor_immd_4f32(pc, imm->u[0].Float,
				       imm->u[1].Float,
				       imm->u[2].Float,
				       imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->Range.First;
			last = d->Range.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				/* only VP/GP outputs with semantic info
				 * need special slot bookkeeping here
				 */
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.Index;
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_BCOLOR:
					/* back colors are appended after the
					 * generic outputs, shrink out_nr
					 */
					p->cfg.two_side[si].hw = first;
					if (p->cfg.out_nr > first)
						p->cfg.out_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.out_nr > first)
						p->cfg.out_nr = first;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					pc->edgeflag_out = first;
					break;
					/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
					*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				/* interpolation mode only matters for FP */
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					/* request FCRD.w for perspective
					 * division (regs[1] bits 24+)
					 */
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_SYSTEM_VALUE:
				assert(d->Declaration.Semantic);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_FACE:
					assert(p->type == PIPE_SHADER_FRAGMENT);
					load_frontfacing(pc,
							 &pc->sysval[first]);
					break;
				case TGSI_SEMANTIC_INSTANCEID:
					assert(p->type == PIPE_SHADER_VERTEX);
					instance_id = first;
					/* bit 4: instance id read enable */
					p->cfg.regs[0] |= (1 << 4);
					break;
				case TGSI_SEMANTIC_PRIMID:
					assert(p->type != PIPE_SHADER_VERTEX);
					p->cfg.prim_id = first;
					break;
					/*
				case TGSI_SEMANTIC_PRIMIDIN:
					assert(p->type == PIPE_SHADER_GEOMETRY);
					pc->sysval[first].hw = 6;
					p->cfg.regs[0] |= (1 << 8);
					break;
				case TGSI_SEMANTIC_VERTEXID:
					assert(p->type == PIPE_SHADER_VERTEX);
					vertex_id = first;
					p->cfg.regs[0] |= (1 << 12) | (1 << 0);
					break;
					*/
				}
				break;
			case TGSI_FILE_ADDRESS:
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) {
		int rid = 0;

		if (p->type == PIPE_SHADER_GEOMETRY) {
			/* GP: pack accessed input components tightly,
			 * recording the per-vec4 component masks
			 */
			for (i = 0; i < pc->attr_nr; ++i) {
				p->cfg.in[i].hw = rid;
				p->cfg.in[i].id = i;

				for (c = 0; c < 4; ++c) {
					int n = i * 4 + c;
					if (!pc->attr[n].acc)
						continue;
					pc->attr[n].hw = rid++;
					p->cfg.in[i].mask |= 1 << c;
				}
			}
		} else {
			/* VP: one bit per accessed input component in
			 * the VP_ATTR_EN bitmasks
			 */
			for (i = 0; i < pc->attr_nr * 4; ++i) {
				if (pc->attr[i].acc) {
					pc->attr[i].hw = rid++;
					p->cfg.attr[i / 32] |= 1 << (i % 32);
				}
			}
			/* bit 0 / bit 4 set above for VERTEXID/INSTANCEID */
			if (p->cfg.regs[0] & (1 << 0))
				pc->sysval[vertex_id].hw = rid++;
			if (p->cfg.regs[0] & (1 << 4)) {
				pc->sysval[instance_id].hw = rid++;
				load_instance_id(pc, instance_id);
			}
		}

		/* pack accessed result components and build output masks */
		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.out[i].hw = rid;
			p->cfg.out[i].id = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.out[i].mask |= 1 << c;
			}
		}
		if (p->cfg.prim_id < 0x40) {
			/* GP has to write to PrimitiveID */
			ctor_reg(&pc->sysval[p->cfg.prim_id],
				 P_RESULT, p->cfg.prim_id, rid);
			p->cfg.prim_id = rid++;
		}

		/* resolve BCOLOR/PSIZE TGSI indices to hw output regs */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.out[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw;

		copy_semantic_info(p);
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid = 0, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		pc->allow32 = TRUE;

		/* do we read FragCoord ? */
		if (pc->attr_nr &&
		    p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
			/* select FCRD components we want accessible */
			for (c = 0; c < 4; ++c)
				if (pc->attr[c].acc)
					p->cfg.regs[1] |= 1 << (24 + c);
			aid = 0;
		} else /* offset by 1 if FCRD.w is needed for pinterp */
			aid = popcnt4(p->cfg.regs[1] >> 24);

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT)
				p->cfg.in[m++].id = i;
			else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.in[n].linear = TRUE;
				p->cfg.in[n++].id = i;
			}
		}
		copy_semantic_info(p);

		/* assign interpolant hw ids in sorted order and emit the
		 * corresponding interpolation instructions
		 */
		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.in[n].hw = rid = aid;
			i = p->cfg.in[n].id;

			if (p->info.input_semantic_name[i] ==
			    TGSI_SEMANTIC_FACE) {
				load_frontfacing(pc, &pc->attr[i * 4]);
				continue;
			}

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.in[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.in[n].mask);
		}

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.in[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x80;
		for (i = 0; i < p->cfg.in_nr; i++) {
			if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) {
				n = MIN2(n, p->cfg.in[i].hw - m);
				p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i];

				p->cfg.regs[0] += /* increase colour count */
					popcnt4(p->cfg.in[i].mask) << 16;
			}
		}
		if (n < 0x80)
			p->cfg.regs[0] += n;

		if (p->cfg.prim_id < 0x40) {
			/* PrimitiveID is loaded as a flat interpolant */
			pc->sysval[p->cfg.prim_id].rhw = rid++;
			emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL,
				    INTERP_FLAT);
			/* increase FP_INTERPOLANT_CTRL_COUNT */
			p->cfg.regs[1] += 1;
		}

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid++;

		p->cfg.high_result = rid;

		/* separate/different colour results for MRTs ? */
		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
			p->cfg.regs[2] |= 1;
	}

	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* release temporaries used by interpolation setup helpers */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}

/* Release all per-translation allocations and the context itself.
 * NOTE(review): pc->addr (CALLOC'd in ctor_nv50_pc) is never freed here —
 * looks like a leak; confirm ownership.  Also, Mesa's FREE() tolerates
 * NULL, so the if-guards below are presumably redundant — verify.
 */
static void
free_nv50_pc(struct nv50_pc *pc)
{
	if (pc->immd)
		FREE(pc->immd);
	if (pc->param)
		FREE(pc->param);
	if (pc->result)
		FREE(pc->result);
	if (pc->attr)
		FREE(pc->attr);
	if (pc->temp)
		FREE(pc->temp);
	if (pc->sysval)
		FREE(pc->sysval);
	if (pc->insn_pos)
		FREE(pc->insn_pos);

	FREE(pc);
}

/* Map a PIPE_PRIM_* geometry shader output primitive to the NV50TCL
 * enum; aborts on anything the hardware method doesn't accept.
 */
static INLINE uint32_t
nv50_map_gs_output_prim(unsigned pprim)
{
	switch (pprim) {
	case PIPE_PRIM_POINTS:
		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
	case PIPE_PRIM_LINE_STRIP:
		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
	case PIPE_PRIM_TRIANGLE_STRIP:
		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
	default:
		NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim);
		abort();
		return 0; /* unreachable; keeps the compiler quiet */
	}
}

/* Initialize the translation context from the shader's TGSI scan info:
 * register-file sizes, per-shader-type cfg defaults, and the backing
 * arrays of nv50_reg descriptors.  Returns FALSE on allocation failure
 * (caller frees via free_nv50_pc).
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
	assert(pc->addr_nr <= 2);
	pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;

	p->cfg.high_temp = 4;

	/* 0x40 == "not present" sentinel for VP, 0x80 for GP (see below) */
	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;
	p->cfg.prim_id = 0x40;

	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;

	for (i = 0; i < p->info.num_properties; ++i) {
		unsigned *data = &p->info.properties[i].data[0];

		switch (p->info.properties[i].name) {
		case TGSI_PROPERTY_GS_OUTPUT_PRIM:
			p->cfg.prim_type = nv50_map_gs_output_prim(data[0]);
			break;
		case TGSI_PROPERTY_GS_MAX_VERTICES:
			p->cfg.vert_count = data[0];
			break;
		default:
			break;
		}
	}

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.psiz = 0x40;
		p->cfg.clpd = 0x40;
		p->cfg.out_nr = pc->result_nr;
		break;
	case PIPE_SHADER_GEOMETRY:
		assert(p->cfg.prim_type);
		assert(p->cfg.vert_count);

		p->cfg.psiz = 0x80;
		p->cfg.clpd = 0x80;
		p->cfg.prim_id = 0x80;
		p->cfg.out_nr = pc->result_nr;
		p->cfg.in_nr = pc->attr_nr;

		p->cfg.two_side[0].hw = 0x80;
		p->cfg.two_side[1].hw = 0x80;
		break;
	case PIPE_SHADER_FRAGMENT:
		/* FP inputs and outputs live in temps */
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.in_nr = pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	if (pc->addr_nr) {
		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
		if (!pc->addr)
			return FALSE;
	}
	/* hw address regs $a1..$a4 ($a0 id is reserved) */
	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
		ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1);

	if (pc->sysval_nr) {
		/* NOTE(review): elements are used as full struct nv50_reg
		 * (ctor_reg(&pc->sysval[i], ...), .hw accesses in tx_prep),
		 * yet only sizeof(struct nv50_reg *) is allocated per
		 * element — looks like an under-allocation; verify the
		 * declaration of pc->sysval and fix the sizeof.
		 */
		pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *));
		if (!pc->sysval)
			return FALSE;
		/* will only ever use SYSTEM_VALUE[i].x (hopefully) */
		for (i = 0; i < pc->sysval_nr; ++i)
			ctor_reg(&pc->sysval[i], rtype[0], i, -1);
	}

	return TRUE;
}

/* Pad the instruction stream so no 32-bit instruction ends up unpaired,
 * adjusting branch targets and per-TGSI-insn positions as instructions
 * grow, then resolve CALL targets to final positions.
 * NOTE(review): the CALLOC result is used unchecked — a failed
 * allocation would crash below; consider handling it.
 */
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
	struct nv50_program_exec *e, **bra_list;
	unsigned i, n, pos;

	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));

	/* Collect branch instructions, we need to adjust their offsets
	 * when converting 32 bit instructions to 64 bit ones
	 */
	for (n = 0, e = pc->p->exec_head; e; e = e->next)
		if (e->param.index >= 0 && !e->param.mask)
			bra_list[n++] = e;

	/* Make sure we don't have any single 32 bit instructions. */
	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
		pos += is_long(e) ? 2 : 1;

		if ((pos & 1) && (!e->next || is_long(e->next))) {
			/* widening shifts everything at/after pos by one */
			for (i = 0; i < n; ++i)
				if (bra_list[i]->param.index >= pos)
					bra_list[i]->param.index += 1;
			for (i = 0; i < pc->insn_nr; ++i)
				if (pc->insn_pos[i] >= pos)
					pc->insn_pos[i] += 1;
			convert_to_long(pc, e);
			++pos;
		}
	}

	FREE(bra_list);

	if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL])
		return;

	/* fill in CALL offsets */
	for (e = pc->p->exec_head; e; e = e->next) {
		if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2)
			e->param.index = pc->insn_pos[e->param.index];
	}
}

/* Translate a TGSI program into nv50 machine code: build the context,
 * run the prep pass, then emit each instruction, recording its start
 * offset for later CALL fixup.
 * NOTE(review): the insn_pos MALLOC result is not checked before use.
 */
static boolean
nv50_program_tx(struct nv50_program *p)
{
	struct tgsi_parse_context parse;
	struct nv50_pc *pc;
	boolean ret;

	pc = CALLOC_STRUCT(nv50_pc);
	if (!pc)
		return FALSE;

	ret = ctor_nv50_pc(pc, p);
	if (ret == FALSE)
		goto out_cleanup;

	ret = nv50_program_tx_prep(pc);
	if (ret == FALSE)
		goto out_cleanup;

	pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned));

	tgsi_parse_init(&parse, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&parse)) {
		const union tgsi_full_token *tok = &parse.FullToken;

		/* previously allow32 was FALSE for first & last instruction */
		pc->allow32 = TRUE;

		tgsi_parse_token(&parse);

		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_pos[pc->insn_cur] = pc->p->exec_size;
			++pc->insn_cur;
			ret = nv50_tgsi_insn(pc, tok);
			if (ret == FALSE)
				goto out_err;
			break;
		default:
			break;
		}
	}

	nv50_program_fixup_insns(pc);

	/* hand immediates / constant counts over to the program;
	 * pc->immd_buf ownership transfers to p->immd
	 */
	p->param_nr = pc->param_nr * 4;
	p->immd_nr = pc->immd_nr * 4;
	p->immd = pc->immd_buf;

out_err:
	tgsi_parse_free(&parse);

out_cleanup:
	free_nv50_pc(pc);
	return ret;
}

/* Translate on first use.
 * NOTE(review): on failure this only assert(0)s — in NDEBUG builds
 * translated is still set TRUE and the broken program gets used.
 */
static void
nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
{
	if (nv50_program_tx(p) == FALSE)
		assert(0);
	p->translated = TRUE;
}

/* Push 'count' 32-bit words into constant buffer 'cbuf' starting at
 * element 'start', in chunks of at most 2047 words per CB_DATA method.
 */
static void
nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
			 unsigned start, unsigned count, unsigned cbuf)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nouveau_grobj *tesla = nv50->screen->tesla;

	while (count) {
		unsigned nr = count > 2047 ? 2047 : count;

		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
		OUT_RING (chan, (cbuf << 0) | (start << 8));
		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
		OUT_RINGp (chan, map, nr);

		map += nr;
		start += nr;
		count -= nr;
	}
}

/* Upload the program's immediates (allocating — and if necessary
 * evicting other programs' — space on the shared immediate heap) and
 * its constant buffer contents.
 */
static void
nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
{
	struct pipe_context *pipe = &nv50->pipe;
	struct pipe_transfer *transfer;

	if (!p->data[0] && p->immd_nr) {
		struct nouveau_resource *heap = nv50->screen->immd_heap[0];

		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
			/* evict other programs until enough space frees up */
			while (heap->next && heap->size < p->immd_nr) {
				struct nv50_program *evict = heap->next->priv;
				nouveau_resource_free(&evict->data[0]);
			}

			if (nouveau_resource_alloc(heap, p->immd_nr, p,
						   &p->data[0]))
				assert(0);
		}

		/* immediates only need to be uploaded again when freed */
		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
					 p->immd_nr, NV50_CB_PMISC);
	}

	assert(p->param_nr <= 512);

	if (p->param_nr) {
		unsigned cb;
		uint32_t *map = pipe_buffer_map(pipe,
						nv50->constbuf[p->type],
						PIPE_TRANSFER_READ,
						&transfer);
		switch (p->type) {
		case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break;
		case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break;
		default:
			cb = NV50_CB_PVP;
			assert(p->type == PIPE_SHADER_VERTEX);
			break;
		}

		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
		pipe_buffer_unmap(pipe, nv50->constbuf[p->type],
				  transfer);
	}
}

/* Patch constant-buffer offsets and branch targets into the encoded
 * instructions and upload the program code via the 2D SIFC path.
 * Re-uploads only when the bo is new or the immediate block moved.
 * NOTE(review): the MALLOC of 'up' is used unchecked.
 */
static void
nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nv50_program_exec *e;
	uint32_t *up, i;
	boolean upload = FALSE;

	if (!p->bo) {
		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
			       p->exec_size * 4, &p->bo);
		upload = TRUE;
	}

	if (p->data[0] && p->data[0]->start != p->data_start[0])
		upload = TRUE;

	if (!upload)
		return;

	up = MALLOC(p->exec_size * 4);

	for (i = 0, e = p->exec_head; e; e = e->next) {
		unsigned ei, ci, bs;

		if (e->param.index >= 0 && e->param.mask) {
			/* non-zero mask: param is a c[] index to relocate */
			bs = (e->inst[1] >> 22) & 0x07;
			assert(bs < 2);
			ei = e->param.shift >> 5;
			ci = e->param.index;
			if (bs == 0)
				ci += p->data[bs]->start;

			e->inst[ei] &= ~e->param.mask;
			e->inst[ei] |= (ci << e->param.shift);
		} else
		if (e->param.index >= 0) {
			/* zero mask means param is a jump/branch offset */
			assert(!(e->param.index & 1));
			/* seem to be 8 byte steps */
			ei = (e->param.index >> 1) + 0 /* START_ID */;

			e->inst[0] &= 0xf0000fff;
			e->inst[0] |= ei << 12;
		}

		up[i++] = e->inst[0];
		if (is_long(e))
			up[i++] = e->inst[1];
	}
	assert(i == p->exec_size);

	if (p->data[0])
		p->data_start[0] = p->data[0]->start;

#ifdef NV50_PROGRAM_DUMP
	NOUVEAU_ERR("-------\n");
	for (e = p->exec_head; e; e = e->next) {
		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
		if (is_long(e))
			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
	}
#endif
	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
			 0, 0, p->exec_size * 4, 1, 1);

	FREE(up);
}

/* Validate the bound vertex program and build its state object
 * (address, attribute enables, register allocation, start id).
 * Returns NULL when NV50_NEW_VERTPROG is not dirty.
 */
struct nouveau_stateobj *
nv50_vertprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->vertprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	if (!(nv50->dirty & NV50_NEW_VERTPROG))
		return NULL;

	so = so_new(5, 7, 2);
	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
	so_data (so, p->cfg.attr[0]);
	so_data (so, p->cfg.attr[1]);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
	so_data (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1);
	so_data (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
	so_data (so, 0); /* program start offset */
	return so;
}

/* Validate the bound fragment program and build its state object.
 * Returns NULL when NV50_NEW_FRAGPROG is not dirty.
 */
struct nouveau_stateobj *
nv50_fragprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->fragprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	if (!(nv50->dirty & NV50_NEW_FRAGPROG))
		return NULL;

	so = so_new(6, 7, 2);
	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
	so_data (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
	so_data (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
	so_data (so, p->cfg.regs[2]);
	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
	so_data (so, p->cfg.regs[3]);
	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
	so_data (so, 0); /* program start offset */
	return so;
}

/* Validate the bound geometry program and build its state object.
 * Returns NULL when NV50_NEW_GEOMPROG is not dirty.
 */
struct nouveau_stateobj *
nv50_geomprog_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *p = nv50->geomprog;
	struct nouveau_stateobj *so;

	if (!p->translated) {
		nv50_program_validate(nv50, p);
		if (!p->translated)
			assert(0);
	}

	nv50_program_validate_data(nv50, p);
	nv50_program_validate_code(nv50, p);

	if (!(nv50->dirty & NV50_NEW_GEOMPROG))
		return NULL;

	so = so_new(6, 7, 2);
	so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_HIGH, 0, 0);
	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
		  NOUVEAU_BO_LOW, 0, 0);
	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1);
	so_data (so, p->cfg.high_temp);
	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1);
	so_data (so, p->cfg.high_result);
	so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1);
	so_data (so, p->cfg.prim_type);
	so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1);
	so_data (so, p->cfg.vert_count);
	so_method(so, tesla, NV50TCL_GP_START_ID, 1);
	so_data (so, 0); /* program start offset */
	return so;
}

/* Build the POINT_COORD_REPLACE_MAP for point sprites: for each FP
 * generic input that is either unwritten by the VP/GP or whose sprite
 * coordinate is enabled in the rasterizer state, mark its hw slots
 * (starting at 'base') for replacement.  Returns the origin bit for
 * POINT_SPRITE_CTRL.
 */
static uint32_t
nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
{
	struct nv50_program *vp;
	struct nv50_program *fp = nv50->fragprog;
	unsigned i, c, m = base;
	uint32_t origin = 0x00000010;

	vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog;

	/* XXX: this might not work correctly in all cases yet - we'll
	 * just assume that an FP generic input that is not written in
	 * the VP is PointCoord.
	 */
	memset(pntc, 0, 8 * sizeof(uint32_t));

	for (i = 0; i < fp->cfg.in_nr; i++) {
		unsigned j, n = popcnt4(fp->cfg.in[i].mask);

		if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) {
			m += n;
			continue;
		}

		/* find the matching VP/GP output, if any */
		for (j = 0; j < vp->cfg.out_nr; ++j)
			if (vp->cfg.out[j].sn == fp->cfg.in[i].sn &&
			    vp->cfg.out[j].si == fp->cfg.in[i].si)
				break;

		if (j < vp->info.num_outputs) {
			ubyte enable =
				(nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1;

			if (enable == 0) {
				m += n;
				continue;
			}
		}

		/* this is either PointCoord or replaced by sprite coords */
		for (c = 0; c < 4; c++) {
			if (!(fp->cfg.in[i].mask & (1 << c)))
				continue;
			/* 4 bits per slot: component index + 1 */
			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
			++m;
		}
	}
	return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin);
}

/* Append the byte-wise result map entries for one FP input vec4:
 * matched VP output components get their hw id, unmatched ones get the
 * filler id 'zval' (zval + 1 for .w, presumably to source 1.0 instead
 * of 0.0 — confirm against hw docs).  Also records linear interpolants
 * in 'lin'.  Returns the updated map index.
 */
static int
nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4],
	      struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
{
	int c;
	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
	uint8_t *map = (uint8_t *)map32;

	for (c = 0; c < 4; ++c) {
		if (mf & 1) {
			if (fpi->linear == TRUE)
				lin[mid / 32] |= 1 << (mid % 32);
			if (mv & 1)
				map[mid] = oid;
			else
				map[mid] = (c == 3) ? (zval + 1) : zval;
			++mid;
		}

		oid += mv & 1;
		mf >>= 1;
		mv >>= 1;
	}

	return mid;
}

/* Link VP/GP outputs to FP inputs: build the result map (position,
 * clip distances, two-side colors, generic interpolants, PrimitiveID,
 * point size), the semantic map registers and the no-perspective
 * bitmap, and emit them as a state object.
 */
struct nouveau_stateobj *
nv50_fp_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *fp = nv50->fragprog;
	struct nouveau_stateobj *so;
	struct nv50_sreg4 dummy;
	int i, n, c, m = 0;
	uint32_t map[16], lin[4], reg[6], pcrd[8];
	uint8_t zval = 0x40;

	/* with a GP bound, the FP links against the GP's outputs */
	if (nv50->geomprog) {
		vp = nv50->geomprog;
		zval = 0x80;
	}
	memset(map, 0, sizeof(map));
	memset(lin, 0, sizeof(lin));

	reg[1] = 0x00000004; /* low and high clip distance map ids */
	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
	reg[3] = 0x00000000; /* point size map id & enable */
	reg[5] = 0x00000000; /* primitive ID map slot */
	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
	reg[4] = fp->cfg.regs[1]; /* interpolant info */

	dummy.linear = FALSE;
	dummy.mask = 0xf; /* map all components of HPOS */
	m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]);

	dummy.mask = 0x0;

	if (vp->cfg.clpd < 0x40) {
		for (c = 0; c < vp->cfg.clpd_nr; ++c) {
			map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8);
			++m;
		}
		reg[1] = (m << 8);
	}

	reg[0] |= m << 8; /* adjust BFC0 id */

	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
	if (nv50->rasterizer->pipe.light_twoside) {
		struct nv50_sreg4 *vpo = &vp->cfg.two_side[0];
		struct nv50_sreg4 *fpi = &fp->cfg.two_side[0];

		m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]);
		m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]);
	}

	reg[0] += m - 4; /* adjust FFC0 id */
	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */

	for (i = 0; i < fp->cfg.in_nr; i++) {
		/* maybe even remove these from cfg.io */
		if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION ||
		    fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE)
			continue;

		for (n = 0; n < vp->cfg.out_nr; ++n)
			if (vp->cfg.out[n].sn == fp->cfg.in[i].sn &&
			    vp->cfg.out[n].si == fp->cfg.in[i].si)
				break;

		m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i],
				  (n < vp->cfg.out_nr) ?
				  &vp->cfg.out[n] : &dummy);
	}
	/* PrimitiveID either is replaced by the system value, or
	 * written by the geometry shader into an output register
	 */
	if (fp->cfg.prim_id < 0x40) {
		map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8);
		reg[5] = m++;
	}

	if (nv50->rasterizer->pipe.point_size_per_vertex) {
		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
		reg[3] = (m++ << 4) | 1;
	}

	/* now fill the stateobj (at most 28 so_data) */
	so = so_new(10, 54, 0);

	n = (m + 3) / 4;
	assert(m <= 64);
	if (vp->type == PIPE_SHADER_GEOMETRY) {
		so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1);
		so_data (so, m);
		so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n);
		so_datap (so, map, n);
	} else {
		so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
		so_data (so, vp->cfg.regs[0]);

		so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1);
		so_data (so, reg[5]);

		so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
		so_data (so, m);
		so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
		so_datap (so, map, n);
	}

	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
	so_datap (so, reg, 4);

	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
	so_data (so, reg[4]);

	so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
	so_datap (so, lin, 4);

	if (nv50->rasterizer->pipe.sprite_coord_enable) {
		so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
		so_data (so,
			 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff));

		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
		so_datap (so, pcrd, 8);
	}

	so_method(so, tesla, NV50TCL_GP_ENABLE, 1);
	so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0);

	return so;
}

/* Build the VP result map that feeds the GP: for each GP input
 * component, emit the matching VP output hw id, or the filler ids
 * 0x40/0x41 (0x41 for .w) when the VP does not write it.
 * Returns the updated map index.
 */
static int
construct_vp_gp_mapping(uint32_t *map32, int m,
			struct nv50_program *vp, struct nv50_program *gp)
{
	uint8_t *map = (uint8_t *)map32;
	int i, j, c;

	for (i = 0; i < gp->cfg.in_nr; ++i) {
		uint8_t oid = 0, mv = 0, mg = gp->cfg.in[i].mask;

		for (j = 0; j < vp->cfg.out_nr; ++j) {
			if (vp->cfg.out[j].sn == gp->cfg.in[i].sn &&
			    vp->cfg.out[j].si == gp->cfg.in[i].si) {
				mv = vp->cfg.out[j].mask;
				oid = vp->cfg.out[j].hw;
				break;
			}
		}

		for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) {
			if (mg & mv & 1)
				map[m++] = oid;
			else
			if (mg & 1)
				map[m++] = (c == 3) ? 0x41 : 0x40;
			oid += mv & 1;
		}
	}
	return m;
}

/* Link VP outputs to GP inputs and emit the VP result map state.
 * Returns NULL when no geometry program is bound.
 */
struct nouveau_stateobj *
nv50_gp_linkage_validate(struct nv50_context *nv50)
{
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct nouveau_stateobj *so;
	struct nv50_program *vp = nv50->vertprog;
	struct nv50_program *gp = nv50->geomprog;
	uint32_t map[16];
	int m = 0;

	if (!gp)
		return NULL;
	memset(map, 0, sizeof(map));

	m = construct_vp_gp_mapping(map, m, vp, gp);

	so = so_new(3, 24 - 3, 0);

	so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
	so_data (so, vp->cfg.regs[0] | gp->cfg.regs[0]);

	assert(m <= 32);
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
	so_data (so, m);

	m = (m + 3) / 4;
	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m);
	so_datap (so, map, m);

	return so;
}

/* Free a program's translation results (exec list, code bo, immediates,
 * immediate heap slot) and mark it for re-translation.  The nv50_program
 * struct itself is owned by the state tracker and not freed here.
 */
void
nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
{
	while (p->exec_head) {
		struct nv50_program_exec *e = p->exec_head;

		p->exec_head = e->next;
		FREE(e);
	}
	p->exec_tail = NULL;
	p->exec_size = 0;

	nouveau_bo_ref(NULL, &p->bo);

	FREE(p->immd);
	nouveau_resource_free(&p->data[0]);

	p->translated = 0;
}