nv50_program.c revision f204eb184237b387432413212a3a20d83c87594b
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 64 35//#define NV50_PROGRAM_DUMP 36 37/* ARL - gallium craps itself on progs/vp/arl.txt 38 * 39 * MSB - Like MAD, but MUL+SUB 40 * - Fuck it off, introduce a way to negate args for ops that 41 * support it. 42 * 43 * Look into inlining IMMD for ops other than MOV (make it general?) 44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this 46 * 47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 48 * case, if the emit_src() causes the inst to suddenly become long. 
 * Verify half-insns work where expected - and force disable them where they
 * don't work - MUL has it forcibly disabled atm as it fixes POW..
 *
 * NOTE: watch dst==src vectors, can overwrite components that are needed.
 * ie. SUB R0, R0.yzxw, R0
 *
 * Things to check with renouveau:
 * FP attr/result assignment - how?
 * attrib
 * - 0x16bc maps vp output onto fp hpos
 * - 0x16c0 maps vp output onto fp col0
 * result
 * - colr always 0-3
 * - depr always 4
 * 0x16bc->0x16e8 --> some binding between vp/fp regs
 * 0x16b8 --> VP output count
 *
 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
 *            "MOV rcol.x, fcol.y" = 0x00000004
 * 0x19a8 --> as above but 0x00000100 and 0x00000000
 * - 0x00100000 used when KIL used
 * 0x196c --> as above but 0x00000011 and 0x00000000
 *
 * 0x1988 --> 0xXXNNNNNN
 * - XX == FP high something
 */

/* A single scalar register operand as tracked by the code generator.
 * It may denote a hw temporary, a shader input (attr), a shader output
 * (result), a constant-buffer slot, or an immediate. */
struct nv50_reg {
	enum {
		P_TEMP,
		P_ATTR,
		P_RESULT,
		P_CONST,
		P_IMMD
	} type;
	int index; /* TGSI-level register index; -1 for internal temporaries */

	int hw;  /* allocated hw register index; -1 while unallocated */
	int neg; /* request source negation on ops that support it */

	int rhw; /* result hw for FP outputs, or interpolant index */
	int acc; /* instruction where this reg is last read (first insn == 1) */
};

/* arbitrary limits */
#define MAX_IF_DEPTH 4
#define MAX_LOOP_DEPTH 4

/* Per-shader translation context: hw register file state plus the
 * nv50_reg arrays mirroring the TGSI declaration ranges. */
struct nv50_pc {
	struct nv50_program *p;

	/* hw resources */
	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; /* hw temp ownership map */

	/* tgsi resources */
	struct nv50_reg *temp;
	int temp_nr;
	struct nv50_reg *attr;
	int attr_nr;
	struct nv50_reg *result;
	int result_nr;
	struct nv50_reg *param;
	int param_nr;
	struct nv50_reg *immd;
	float *immd_buf; /* flat array of immediates, 4 floats per entry */
	int immd_nr;

	/* short-lived scratch temps, released after each TGSI instruction */
	struct nv50_reg *temp_temp[16];
	unsigned temp_temp_nr;

	/* broadcast and destination replacement regs */
	struct nv50_reg *r_brdc;
	struct nv50_reg *r_dst[4];

	unsigned interp_mode[32];
	/* perspective interpolation registers */
	struct nv50_reg *iv_p;
	struct nv50_reg *iv_c;

	struct nv50_program_exec *if_cond;
	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
	int if_lvl, loop_lvl;
	unsigned loop_pos[MAX_LOOP_DEPTH];

	/* current instruction and total number of insns */
	unsigned insn_cur;
	unsigned insn_nr;

	boolean allow32; /* permit 32-bit (short) instruction encodings */
};

/* Initialize reg; hw == -1 means "not yet allocated". */
static INLINE void
ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
{
	reg->type = type;
	reg->index = index;
	reg->hw = hw;
	reg->neg = 0;
	reg->rhw = -1;
	reg->acc = 0;
}

/* Population count of the low 4 bits (write-mask helper). */
static INLINE unsigned
popcnt4(uint32_t val)
{
	static const unsigned cnt[16]
	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
	return cnt[val & 0xf];
}

/* Bind reg to a hw temporary if it doesn't have one yet, and keep
 * cfg.high_temp / cfg.high_result up to date with the highest slot used. */
static void
alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
{
	int i = 0;

	if (reg->type == P_RESULT) {
		if (pc->p->cfg.high_result < (reg->hw + 1))
			pc->p->cfg.high_result = reg->hw + 1;
	}

	if (reg->type != P_TEMP)
		return;

	if (reg->hw >= 0) {
		/*XXX: do this here too to catch FP temp-as-attr usage..
		 * not clean, but works */
		if (pc->p->cfg.high_temp < (reg->hw + 1))
			pc->p->cfg.high_temp = reg->hw + 1;
		return;
	}

	if (reg->rhw != -1) {
		/* try to allocate temporary with index rhw first */
		if (!(pc->r_temp[reg->rhw])) {
			pc->r_temp[reg->rhw] = reg;
			reg->hw = reg->rhw;
			if (pc->p->cfg.high_temp < (reg->rhw + 1))
				pc->p->cfg.high_temp = reg->rhw + 1;
			return;
		}
		/* make sure we don't get things like $r0 needs to go
		 * in $r1 and $r1 in $r0
		 */
		i = pc->result_nr * 4;
	}

	for (; i < NV50_SU_MAX_TEMP; i++) {
		if (!(pc->r_temp[i])) {
			pc->r_temp[i] = reg;
			reg->hw = i;
			if (pc->p->cfg.high_temp < (i + 1))
				pc->p->cfg.high_temp = i + 1;
			return;
		}
	}

	assert(0);
}

/* XXX: For shaders that aren't executed linearly (e.g. shaders that
 * contain loops), we need to assign all hw regs to TGSI TEMPs early,
 * lest we risk temp_temps overwriting regs alloc'd "later".
 */
/* Return a fresh hw temporary. If dst is an unallocated TGSI temp it is
 * returned directly; otherwise a new internal (index == -1) reg is made.
 * Caller releases internal temps via free_temp()/assimilate_temp(). */
static struct nv50_reg *
alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
{
	struct nv50_reg *r;
	int i;

	if (dst && dst->type == P_TEMP && dst->hw == -1)
		return dst;

	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
		if (!pc->r_temp[i]) {
			r = MALLOC_STRUCT(nv50_reg);
			ctor_reg(r, P_TEMP, -1, i);
			pc->r_temp[i] = r;
			return r;
		}
	}

	assert(0);
	return NULL;
}
238 */ 239static void 240assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 241{ 242 assert(src->index == -1 && src->hw != -1); 243 244 if (dst->hw != -1) 245 pc->r_temp[dst->hw] = NULL; 246 pc->r_temp[src->hw] = dst; 247 dst->hw = src->hw; 248 249 FREE(src); 250} 251 252/* release the hardware resource held by r */ 253static void 254release_hw(struct nv50_pc *pc, struct nv50_reg *r) 255{ 256 assert(r->type == P_TEMP); 257 if (r->hw == -1) 258 return; 259 260 assert(pc->r_temp[r->hw] == r); 261 pc->r_temp[r->hw] = NULL; 262 263 r->acc = 0; 264 if (r->index == -1) 265 FREE(r); 266} 267 268static void 269free_temp(struct nv50_pc *pc, struct nv50_reg *r) 270{ 271 if (r->index == -1) { 272 unsigned hw = r->hw; 273 274 FREE(pc->r_temp[hw]); 275 pc->r_temp[hw] = NULL; 276 } 277} 278 279static int 280alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 281{ 282 int i; 283 284 if ((idx + 4) >= NV50_SU_MAX_TEMP) 285 return 1; 286 287 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 288 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 289 return alloc_temp4(pc, dst, idx + 4); 290 291 for (i = 0; i < 4; i++) { 292 dst[i] = MALLOC_STRUCT(nv50_reg); 293 ctor_reg(dst[i], P_TEMP, -1, idx + i); 294 pc->r_temp[idx + i] = dst[i]; 295 } 296 297 return 0; 298} 299 300static void 301free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 302{ 303 int i; 304 305 for (i = 0; i < 4; i++) 306 free_temp(pc, reg[i]); 307} 308 309static struct nv50_reg * 310temp_temp(struct nv50_pc *pc) 311{ 312 if (pc->temp_temp_nr >= 16) 313 assert(0); 314 315 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 316 return pc->temp_temp[pc->temp_temp_nr++]; 317} 318 319static void 320kill_temp_temp(struct nv50_pc *pc) 321{ 322 int i; 323 324 for (i = 0; i < pc->temp_temp_nr; i++) 325 free_temp(pc, pc->temp_temp[i]); 326 pc->temp_temp_nr = 0; 327} 328 329static int 330ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) 331{ 332 pc->immd_buf = 
REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)), 333 (pc->immd_nr + 1) * 4 * sizeof(float)); 334 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 335 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 336 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 337 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 338 339 return pc->immd_nr++; 340} 341 342static struct nv50_reg * 343alloc_immd(struct nv50_pc *pc, float f) 344{ 345 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 346 unsigned hw; 347 348 for (hw = 0; hw < pc->immd_nr * 4; hw++) 349 if (pc->immd_buf[hw] == f) 350 break; 351 352 if (hw == pc->immd_nr * 4) 353 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; 354 355 ctor_reg(r, P_IMMD, -1, hw); 356 return r; 357} 358 359static struct nv50_program_exec * 360exec(struct nv50_pc *pc) 361{ 362 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 363 364 e->param.index = -1; 365 return e; 366} 367 368static void 369emit(struct nv50_pc *pc, struct nv50_program_exec *e) 370{ 371 struct nv50_program *p = pc->p; 372 373 if (p->exec_tail) 374 p->exec_tail->next = e; 375 if (!p->exec_head) 376 p->exec_head = e; 377 p->exec_tail = e; 378 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 379} 380 381static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 382 383static boolean 384is_long(struct nv50_program_exec *e) 385{ 386 if (e->inst[0] & 1) 387 return TRUE; 388 return FALSE; 389} 390 391static boolean 392is_immd(struct nv50_program_exec *e) 393{ 394 if (is_long(e) && (e->inst[1] & 3) == 3) 395 return TRUE; 396 return FALSE; 397} 398 399static INLINE void 400set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 401 struct nv50_program_exec *e) 402{ 403 set_long(pc, e); 404 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 405 e->inst[1] |= (pred << 7) | (idx << 12); 406} 407 408static INLINE void 409set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 410 struct nv50_program_exec *e) 411{ 412 set_long(pc, e); 413 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 414 e->inst[1] |= (idx << 4) | (on << 6); 415} 416 417static INLINE void 418set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 419{ 420 if (is_long(e)) 421 return; 422 423 e->inst[0] |= 1; 424 set_pred(pc, 0xf, 0, e); 425 set_pred_wr(pc, 0, 0, e); 426} 427 428static INLINE void 429set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 430{ 431 if (dst->type == P_RESULT) { 432 set_long(pc, e); 433 e->inst[1] |= 0x00000008; 434 } 435 436 alloc_reg(pc, dst); 437 e->inst[0] |= (dst->hw << 2); 438} 439 440static INLINE void 441set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 442{ 443 float f = pc->immd_buf[imm->hw]; 444 unsigned val = fui(imm->neg ? -f : f); 445 446 set_long(pc, e); 447 /*XXX: can't be predicated - bits overlap.. catch cases where both 448 * are required and avoid them. 
*/ 449 set_pred(pc, 0, 0, e); 450 set_pred_wr(pc, 0, 0, e); 451 452 e->inst[1] |= 0x00000002 | 0x00000001; 453 e->inst[0] |= (val & 0x3f) << 16; 454 e->inst[1] |= (val >> 6) << 2; 455} 456 457 458#define INTERP_LINEAR 0 459#define INTERP_FLAT 1 460#define INTERP_PERSPECTIVE 2 461#define INTERP_CENTROID 4 462 463/* interpolant index has been stored in dst->rhw */ 464static void 465emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 466 unsigned mode) 467{ 468 assert(dst->rhw != -1); 469 struct nv50_program_exec *e = exec(pc); 470 471 e->inst[0] |= 0x80000000; 472 set_dst(pc, dst, e); 473 e->inst[0] |= (dst->rhw << 16); 474 475 if (mode & INTERP_FLAT) { 476 e->inst[0] |= (1 << 8); 477 } else { 478 if (mode & INTERP_PERSPECTIVE) { 479 e->inst[0] |= (1 << 25); 480 alloc_reg(pc, iv); 481 e->inst[0] |= (iv->hw << 9); 482 } 483 484 if (mode & INTERP_CENTROID) 485 e->inst[0] |= (1 << 24); 486 } 487 488 emit(pc, e); 489} 490 491static void 492set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 493 struct nv50_program_exec *e) 494{ 495 set_long(pc, e); 496 497 e->param.index = src->hw; 498 e->param.shift = s; 499 e->param.mask = m << (s % 32); 500 501 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 502} 503 504static void 505emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 506{ 507 struct nv50_program_exec *e = exec(pc); 508 509 e->inst[0] |= 0x10000000; 510 511 set_dst(pc, dst, e); 512 513 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { 514 set_immd(pc, src, e); 515 /*XXX: 32-bit, but steals part of "half" reg space - need to 516 * catch and handle this case if/when we do half-regs 517 */ 518 } else 519 if (src->type == P_IMMD || src->type == P_CONST) { 520 set_long(pc, e); 521 set_data(pc, src, 0x7f, 9, e); 522 e->inst[1] |= 0x20000000; /* src0 const? 
/* MOV an immediate value into dst; the temporary P_IMMD reg is freed. */
static INLINE void
emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
{
	struct nv50_reg *imm = alloc_immd(pc, f);
	emit_mov(pc, dst, imm);
	FREE(imm);
}

/* Swap the two sources when that yields a legal operand placement
 * (consts can't be src0, attrs prefer src0). Returns TRUE if swapped,
 * so commutative-comparison callers can mirror their condition code. */
static boolean
check_swap_src_0_1(struct nv50_pc *pc,
		   struct nv50_reg **s0, struct nv50_reg **s1)
{
	struct nv50_reg *src0 = *s0, *src1 = *s1;

	if (src0->type == P_CONST) {
		if (src1->type != P_CONST) {
			*s0 = src1;
			*s1 = src0;
			return TRUE;
		}
	} else
	if (src1->type == P_ATTR) {
		if (src0->type != P_ATTR) {
			*s0 = src1;
			*s1 = src0;
			return TRUE;
		}
	}

	alloc_reg(pc, *s0); /* unreachable in practice; kept as in original */
	return FALSE;
}

/* Encode src as operand 0; anything that is not already a hw temp is
 * first copied into a scratch temp (used by flops that only take regs). */
static void
set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
		     struct nv50_program_exec *e)
{
	struct nv50_reg *temp;

	if (src->type != P_TEMP) {
		temp = temp_temp(pc);
		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 9);
}

/* Encode src as operand 0; attrs are flagged inline, const/immd values
 * are staged through a scratch temp (src0 cannot be a const slot). */
static void
set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		set_long(pc, e);
		e->inst[1] |= 0x00200000;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 9);
}

/* Encode src as operand 1; only one const operand fits per insn, so a
 * second const/immd source falls back to a scratch temp. */
static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x00800000));
		if (e->inst[0] & 0x01000000) {
			/* const slot already claimed by src2 */
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 16, e);
			e->inst[0] |= 0x00800000;
		}
	}

	alloc_reg(pc, src);
	e->inst[0] |= (src->hw << 16);
}

/* Encode src as operand 2 (forces the long encoding); mirrors the
 * single-const-slot restriction of set_src_1. */
static void
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
	set_long(pc, e);

	if (src->type == P_ATTR) {
		struct nv50_reg *temp = temp_temp(pc);

		emit_mov(pc, temp, src);
		src = temp;
	} else
	if (src->type == P_CONST || src->type == P_IMMD) {
		assert(!(e->inst[0] & 0x01000000));
		if (e->inst[0] & 0x00800000) {
			/* const slot already claimed by src1 */
			struct nv50_reg *temp = temp_temp(pc);

			emit_mov(pc, temp, src);
			src = temp;
		} else {
			set_data(pc, src, 0x7f, 32+14, e);
			e->inst[0] |= 0x01000000;
		}
	}

	alloc_reg(pc, src);
	e->inst[1] |= (src->hw << 14);
}

/* dst = src0 * src1; negation of either source is folded into the
 * encoding (sign bit position differs between short and long forms). */
static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xc0000000;

	if (!pc->allow32)
		set_long(pc, e);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_IMMD && !is_long(e)) {
		if (src0->neg)
			e->inst[0] |= 0x00008000;
		set_immd(pc, src1, e);
	} else {
		set_src_1(pc, src1, e);
		if (src0->neg ^ src1->neg) {
			if (is_long(e))
				e->inst[1] |= 0x08000000;
			else
				e->inst[0] |= 0x00008000;
		}
	}

	emit(pc, e);
}

/* dst = src0 + src1; source negation forces the long encoding. */
static void
emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	check_swap_src_0_1(pc, &src0, &src1);

	if (!pc->allow32 || src0->neg || src1->neg) {
		set_long(pc, e);
		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
	}

	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
		set_src_2(pc, src1, e);
	else
	if (src1->type == P_IMMD)
		set_immd(pc, src1, e);
	else
		set_src_1(pc, src1, e);

	emit(pc, e);
}

/* dst = min/max(src0, src1); 'sub' selects the sub-operation. */
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
	    struct nv50_reg *src0, struct nv50_reg *src1)
{
	struct nv50_program_exec *e = exec(pc);

	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= (sub << 29);

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);
}

/* dst = src0 - src1, via ADD with src1 temporarily negated. */
static INLINE void
emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1)
{
	src1->neg ^= 1;
	emit_add(pc, dst, src0, src1);
	src1->neg ^= 1;
}

/* dst = src0 * src1 + src2. */
static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xe0000000;

	check_swap_src_0_1(pc, &src0, &src1);
	set_dst(pc, dst, e);
	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);
	set_src_2(pc, src2, e);

	if (src0->neg ^ src1->neg)
		e->inst[1] |= 0x04000000;
	if (src2->neg)
		e->inst[1] |= 0x08000000;

	emit(pc, e);
}

/* dst = src0 * src1 - src2, via MAD with src2 temporarily negated. */
static INLINE void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
	 struct nv50_reg *src1, struct nv50_reg *src2)
{
	src2->neg ^= 1;
	emit_mad(pc, dst, src0, src1, src2);
	src2->neg ^= 1;
}

/* Scalar "flop" (RCP/RSQ/LG2/SIN/COS/EX2...) selected by 'sub';
 * sub 0 and 2 only accept register sources. */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
	  struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0x90000000;
	if (sub) {
		set_long(pc, e);
		e->inst[1] |= (sub << 29);
	}

	set_dst(pc, dst, e);

	if (sub == 0 || sub == 2)
		set_src_0_restricted(pc, src, e);
	else
		set_src_0(pc, src, e);

	emit(pc, e);
}

/* Pre-op for EX2: range-reduce src into dst. */
static void
emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29) | 0x00004000;

	emit(pc, e);
}

/* Pre-op for SIN/COS: range-reduce src into dst. */
static void
emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] |= 0xb0000000;

	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_long(pc, e);
	e->inst[1] |= (6 << 29);

	emit(pc, e);
}

#define CVTOP_RN	0x01
#define CVTOP_FLOOR	0x03
#define CVTOP_CEIL	0x05
#define CVTOP_TRUNC	0x07
#define CVTOP_SAT	0x08
#define CVTOP_ABS	0x10

/* 0x04 == 32 bit dst */
/* 0x40 == dst is float */
/* 0x80 == src is float */
#define CVT_F32_F32 0xc4
#define CVT_F32_S32 0x44
#define CVT_F32_U32 0x64
#define CVT_S32_F32 0x8c
#define CVT_S32_S32 0x0c
#define CVT_NEG     0x20
#define CVT_RI      0x08

/* Emit a CVT: rounding/abs/sat op 'cvn' with format 'fmt'; optionally
 * write a predicate reg (wp >= 0) and/or discard the data result
 * (dst == NULL encodes the bit-bucket destination). */
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
	 int wp, unsigned cvn, unsigned fmt)
{
	struct nv50_program_exec *e;

	e = exec(pc);
	set_long(pc, e);

	e->inst[0] |= 0xa0000000;
	e->inst[1] |= 0x00004000; /* 32 bit src */
	e->inst[1] |= (cvn << 16);
	e->inst[1] |= (fmt << 24);
	set_src_0(pc, src, e);

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);

	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	emit(pc, e);
}
/* nv50 Condition codes:
 *  0x1 = LT
 *  0x2 = EQ
 *  0x3 = LE
 *  0x4 = GT
 *  0x5 = NE
 *  0x6 = GE
 *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
 *  0x8 = unordered bit (allows NaN)
 */
/* Emit a SET comparison of src0 vs src1 with condition 'ccode'; the
 * integer result is converted to float via CVT (dst may be NULL to only
 * write predicate reg wp). Records exec_tail in pc->if_cond for IF. */
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
	 struct nv50_reg *src0, struct nv50_reg *src1)
{
	/* mirrored condition codes for when the operands get swapped */
	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

	struct nv50_program_exec *e = exec(pc);
	struct nv50_reg *rdst;

	assert(ccode < 16);
	if (check_swap_src_0_1(pc, &src0, &src1))
		ccode = cc_swapped[ccode & 7] | (ccode & 8);

	rdst = dst;
	if (dst && dst->type != P_TEMP)
		dst = alloc_temp(pc, NULL);

	/* set.u32 */
	set_long(pc, e);
	e->inst[0] |= 0xb0000000;
	e->inst[1] |= 0x60000000 | (ccode << 14);

	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
	 * that doesn't seem to match what the hw actually does
	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
	 */

	if (wp >= 0)
		set_pred_wr(pc, 1, wp, e);
	if (dst)
		set_dst(pc, dst, e);
	else {
		e->inst[0] |= 0x000001fc;
		e->inst[1] |= 0x00000008;
	}

	set_src_0(pc, src0, e);
	set_src_1(pc, src1, e);

	emit(pc, e);
	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */

	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
	if (rdst)
		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
	if (rdst && rdst != dst)
		free_temp(pc, dst);
}

/* Map a TGSI SET* opcode to the hw condition code used by emit_set(). */
static INLINE unsigned
map_tgsi_setop_cc(unsigned op)
{
	switch (op) {
	case TGSI_OPCODE_SLT: return 0x1;
	case TGSI_OPCODE_SGE: return 0x6;
	case TGSI_OPCODE_SEQ: return 0x2;
	case TGSI_OPCODE_SGT: return 0x4;
	case TGSI_OPCODE_SLE: return 0x3;
	case TGSI_OPCODE_SNE: return 0xd; /* NE with unordered bit */
	default:
		assert(0);
		return 0;
	}
}

/* dst = floor(src). */
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
}

/* dst = v^e, computed as 2^(e * log2(v)). */
static void
emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
	 struct nv50_reg *v, struct nv50_reg *e)
{
	struct nv50_reg *temp = alloc_temp(pc, NULL);

	emit_flop(pc, 3, temp, v);    /* lg2 */
	emit_mul(pc, temp, temp, e);
	emit_preex2(pc, temp, temp);
	emit_flop(pc, 6, dst, temp);  /* ex2 */

	free_temp(pc, temp);
}

/* dst = |src|. */
static INLINE void
emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
}

/* dst = clamp(src, 0, 1). */
static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
}

/* Expand TGSI LIT: dst.x = 1, dst.y = max(src.x, 0),
 * dst.z = (src.x > 0) ? max(src.y, 0)^clamp(src.w, -128, 128) : 0,
 * dst.w = 1. 'mask' selects which components are written. */
static void
emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src)
{
	struct nv50_reg *one = alloc_immd(pc, 1.0);
	struct nv50_reg *zero = alloc_immd(pc, 0.0);
	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
	struct nv50_reg *tmp[4];
	boolean allow32 = pc->allow32;

	pc->allow32 = FALSE;

	if (mask & (3 << 1)) {
		tmp[0] = alloc_temp(pc, NULL);
		emit_minmax(pc, 4, tmp[0], src[0], zero);
	}

	if (mask & (1 << 2)) {
		/* predicate the z computation on src.x > 0 */
		set_pred_wr(pc, 1, 0, pc->p->exec_tail);

		tmp[1] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[1], src[1], zero);

		tmp[3] = temp_temp(pc);
		emit_minmax(pc, 4, tmp[3], src[3], neg128);
		emit_minmax(pc, 5, tmp[3], tmp[3], pos128);

		emit_pow(pc, dst[2], tmp[1], tmp[3]);
		emit_mov(pc, dst[2], zero);
		set_pred(pc, 3, 0, pc->p->exec_tail);
	}

	if (mask & (1 << 1))
		assimilate_temp(pc, dst[1], tmp[0]);
	else
	if (mask & (1 << 2))
		free_temp(pc, tmp[0]);

	pc->allow32 = allow32;

	/* do this last, in case src[i,j] == dst[0,3] */
	if (mask & (1 << 0))
		emit_mov(pc, dst[0], one);

	if (mask & (1 << 3))
		emit_mov(pc, dst[3], one);

	FREE(pos128);
	FREE(neg128);
	FREE(zero);
	FREE(one);
}

/* dst = -src. */
static INLINE void
emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
}

/* Emit KIL: compare src against 0 into predicate reg 1, then discard
 * the fragment when the predicate holds. */
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
	struct nv50_program_exec *e;
	const int r_pred = 1;

	/* Sets predicate reg ? */
	e = exec(pc);
	e->inst[0] = 0xa00001fd;
	e->inst[1] = 0xc4014788;
	set_src_0(pc, src, e);
	set_pred_wr(pc, 1, r_pred, e);
	if (src->neg)
		e->inst[1] |= 0x20000000;
	emit(pc, e);

	/* This is probably KILP */
	e = exec(pc);
	e->inst[0] = 0x000001fe;
	set_long(pc, e);
	set_pred(pc, 1 /* LT? */, r_pred, e);
	emit(pc, e);
}
/* Emit a texture fetch: move/interpolate the coordinates into a quad of
 * consecutive temps (t[0] aligned to 4), emit the TEX instruction for
 * sampler 'unit', then copy the fetched components selected by 'mask'
 * into dst. 'proj' enables projective division by the w coordinate. */
static void
emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
{
	struct nv50_reg *temp, *t[4];
	struct nv50_program_exec *e;

	unsigned c, mode, dim;

	switch (type) {
	case TGSI_TEXTURE_1D:
		dim = 1;
		break;
	case TGSI_TEXTURE_UNKNOWN:
	case TGSI_TEXTURE_2D:
	case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
	case TGSI_TEXTURE_RECT:
		dim = 2;
		break;
	case TGSI_TEXTURE_3D:
	case TGSI_TEXTURE_CUBE:
	case TGSI_TEXTURE_SHADOW2D:
	case TGSI_TEXTURE_SHADOWRECT: /* XXX */
		dim = 3;
		break;
	default:
		assert(0);
		break;
	}

	/* some cards need t[0]'s hw index to be a multiple of 4 */
	alloc_temp4(pc, t, 0);

	if (proj) {
		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
			/* coordinates are FP inputs: fold the perspective
			 * divide into the interpolation itself */
			mode = pc->interp_mode[src[0]->index];

			t[3]->rhw = src[3]->rhw;
			emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
			emit_flop(pc, 0, t[3], t[3]); /* rcp(w) */

			for (c = 0; c < dim; c++) {
				t[c]->rhw = src[c]->rhw;
				emit_interp(pc, t[c], t[3],
					    (mode | INTERP_PERSPECTIVE));
			}
		} else {
			emit_flop(pc, 0, t[3], src[3]);
			for (c = 0; c < dim; c++)
				emit_mul(pc, t[c], src[c], t[3]);

			/* XXX: for some reason the blob sometimes uses MAD:
			 * emit_mad(pc, t[c], src[0][c], t[3], t[3])
			 * pc->p->exec_tail->inst[1] |= 0x080fc000;
			 */
		}
	} else {
		if (type == TGSI_TEXTURE_CUBE) {
			/* normalize by the largest absolute component */
			temp = temp_temp(pc);
			emit_minmax(pc, 4, temp, src[0], src[1]);
			emit_minmax(pc, 4, temp, temp, src[2]);
			emit_flop(pc, 0, temp, temp);
			for (c = 0; c < 3; c++)
				emit_mul(pc, t[c], src[c], temp);
		} else {
			for (c = 0; c < dim; c++)
				emit_mov(pc, t[c], src[c]);
		}
	}

	e = exec(pc);
	set_long(pc, e);
	e->inst[0] |= 0xf0000000;
	e->inst[1] |= 0x00000004;
	set_dst(pc, t[0], e);
	e->inst[0] |= (unit << 9);

	if (dim == 2)
		e->inst[0] |= 0x00400000;
	else
	if (dim == 3)
		e->inst[0] |= 0x00800000;

	e->inst[0] |= (mask & 0x3) << 25;
	e->inst[1] |= (mask & 0xc) << 12;

	emit(pc, e);

#if 1
	c = 0;
	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
	if (mask & 8) emit_mov(pc, dst[3], t[c]);

	free_temp4(pc, t);
#else
	/* XXX: if p.e. MUL is used directly after TEX, it would still use
	 * the texture coordinates, not the fetched values: latency ? */

	for (c = 0; c < 4; c++) {
		if (mask & (1 << c))
			assimilate_temp(pc, dst[c], t[c]);
		else
			free_temp(pc, t[c]);
	}
#endif
}

/* Emit a branch, optionally predicated on (cc, pred), optionally
 * preceded by a JOIN whose exec record is returned through *join so the
 * caller can patch the target later. */
static void
emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
	    struct nv50_program_exec **join)
{
	struct nv50_program_exec *e = exec(pc);

	if (join) {
		set_long(pc, e);
		e->inst[0] |= 0xa0000002;
		emit(pc, e);
		*join = e;
		e = exec(pc);
	}

	set_long(pc, e);
	e->inst[0] |= 0x10000002;
	if (pred >= 0)
		set_pred(pc, cc, pred, e);
	emit(pc, e);
}

/* Emit a NOP (long encoding). */
static void
emit_nop(struct nv50_pc *pc)
{
	struct nv50_program_exec *e = exec(pc);

	e->inst[0] = 0xf0000000;
	set_long(pc, e);
	e->inst[1] = 0xe0000000;
	emit(pc, e);
}

/* dst = d/dx(src); src must already live in a hw temp. */
static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	e->inst[0] = 0xc0140000;
	e->inst[1] = 0x89800000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}

/* dst = d/dy(src); src must already live in a hw temp.
 * NOTE(review): when src->neg is unset this negates src IN PLACE before
 * the op (the hw DDY apparently wants the negated value) — destructive
 * to the source temp; confirm callers never reuse it afterwards. */
static void
emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
	struct nv50_program_exec *e = exec(pc);

	assert(src->type == P_TEMP);

	if (!src->neg) /* ! double negation */
		emit_neg(pc, src, src);

	e->inst[0] = 0xc0150000;
	e->inst[1] = 0x8a400000;
	set_long(pc, e);
	set_dst(pc, dst, e);
	set_src_0(pc, src, e);
	set_src_2(pc, src, e);

	emit(pc, e);
}

/* Rewrite a short (32-bit) encoding into the equivalent long one,
 * relocating the opcode-specific bits (q added to inst[1], m masking
 * inst[0]). Only opcodes listed below are supported. */
static void
convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
{
	unsigned q = 0, m = ~0;

	assert(!is_long(e));

	switch (e->inst[0] >> 28) {
	case 0x1:
		/* MOV */
		q = 0x0403c000;
		m = 0xffff7fff;
		break;
	case 0x8:
		/* INTERP (move centroid, perspective and flat bits) */
		m = ~0x03000100;
		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
		break;
	case 0x9:
		/* RCP */
		break;
	case 0xB:
		/* ADD */
		m = ~(127 << 16);
		q = ((e->inst[0] & (~m)) >> 2);
		break;
	case 0xC:
		/* MUL */
		m = ~0x00008000;
		q = ((e->inst[0] & (~m)) << 12);
		break;
	case 0xE:
		/* MAD (if src2 == dst) */
		q = ((e->inst[0] & 0x1fc) << 12);
		break;
	default:
		assert(0);
		break;
	}

	set_long(pc, e);
	pc->p->exec_size++; /* long insns occupy two words */

	e->inst[0] &= m;
	e->inst[1] |= q;
}

/* Some operations support an optional negation flag. */
*/ 1297static boolean 1298negate_supported(const struct tgsi_full_instruction *insn, int i) 1299{ 1300 int s; 1301 1302 switch (insn->Instruction.Opcode) { 1303 case TGSI_OPCODE_DDY: 1304 case TGSI_OPCODE_DP3: 1305 case TGSI_OPCODE_DP4: 1306 case TGSI_OPCODE_MUL: 1307 case TGSI_OPCODE_KIL: 1308 case TGSI_OPCODE_ADD: 1309 case TGSI_OPCODE_SUB: 1310 case TGSI_OPCODE_MAD: 1311 break; 1312 case TGSI_OPCODE_POW: 1313 if (i == 1) 1314 break; 1315 return FALSE; 1316 default: 1317 return FALSE; 1318 } 1319 1320 /* Watch out for possible multiple uses of an nv50_reg, we 1321 * can't use nv50_reg::neg in these cases. 1322 */ 1323 for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) { 1324 if (s == i) 1325 continue; 1326 if ((insn->FullSrcRegisters[s].SrcRegister.Index == 1327 insn->FullSrcRegisters[i].SrcRegister.Index) && 1328 (insn->FullSrcRegisters[s].SrcRegister.File == 1329 insn->FullSrcRegisters[i].SrcRegister.File)) 1330 return FALSE; 1331 } 1332 1333 return TRUE; 1334} 1335 1336/* Return a read mask for source registers deduced from opcode & write mask. */ 1337static unsigned 1338nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1339{ 1340 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; 1341 1342 switch (insn->Instruction.Opcode) { 1343 case TGSI_OPCODE_COS: 1344 case TGSI_OPCODE_SIN: 1345 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1346 case TGSI_OPCODE_DP3: 1347 return 0x7; 1348 case TGSI_OPCODE_DP4: 1349 case TGSI_OPCODE_DPH: 1350 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1351 return 0xf; 1352 case TGSI_OPCODE_DST: 1353 return mask & (c ? 
			       0xa : 0x6);
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SCS:
		return 0x1;
	case TGSI_OPCODE_LIT:
		return 0xb;
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
	{
		const struct tgsi_instruction_ext_texture *tex;

		assert(insn->Instruction.Extended);
		tex = &insn->InstructionExtTexture;

		mask = 0x7;
		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
			mask |= 0x8; /* projective: w is read too */

		switch (tex->Texture) {
		case TGSI_TEXTURE_1D:
			mask &= 0x9;
			break;
		case TGSI_TEXTURE_2D:
			mask &= 0xb;
			break;
		default:
			break;
		}
	}
		return mask;
	case TGSI_OPCODE_XPD:
		/* cross product: each dst channel reads the two others */
		x = 0;
		if (mask & 1) x |= 0x6;
		if (mask & 2) x |= 0x5;
		if (mask & 4) x |= 0x3;
		return x;
	default:
		break;
	}

	return mask;
}

/* Map a TGSI destination register (file, index, component c) to the
 * corresponding pre-allocated nv50_reg, or NULL for TGSI_FILE_NULL
 * and unhandled files.
 */
static struct nv50_reg *
tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
{
	switch (dst->DstRegister.File) {
	case TGSI_FILE_TEMPORARY:
		return &pc->temp[dst->DstRegister.Index * 4 + c];
	case TGSI_FILE_OUTPUT:
		return &pc->result[dst->DstRegister.Index * 4 + c];
	case TGSI_FILE_NULL:
		return NULL;
	default:
		break;
	}

	return NULL;
}

/* Resolve channel chan of a TGSI source operand to an nv50_reg,
 * applying the extended swizzle and the sign mode (abs/neg).
 * If neg is TRUE the caller allows use of the hardware negate flag
 * (see negate_supported); otherwise a negation is emitted into a
 * temporary.
 */
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
	 boolean neg)
{
	struct nv50_reg *r = NULL;
	struct nv50_reg *temp;
	unsigned sgn, c;

	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);

	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
	switch (c) {
	case TGSI_EXTSWIZZLE_X:
	case TGSI_EXTSWIZZLE_Y:
	case TGSI_EXTSWIZZLE_Z:
	case TGSI_EXTSWIZZLE_W:
		switch (src->SrcRegister.File) {
		case TGSI_FILE_INPUT:
			r = &pc->attr[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_TEMPORARY:
			r = &pc->temp[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_CONSTANT:
			r = &pc->param[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_IMMEDIATE:
			r = &pc->immd[src->SrcRegister.Index * 4 + c];
			break;
		case TGSI_FILE_SAMPLER:
			/* sampler index is handled by the caller */
			break;
		default:
			assert(0);
			break;
		}
		break;
	case TGSI_EXTSWIZZLE_ZERO:
		r = alloc_immd(pc, 0.0);
		return r;
	case TGSI_EXTSWIZZLE_ONE:
		/* fold the sign mode into the immediate directly */
		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
			return alloc_immd(pc, -1.0);
		return alloc_immd(pc, 1.0);
	default:
		assert(0);
		break;
	}

	switch (sgn) {
	case TGSI_UTIL_SIGN_KEEP:
		break;
	case TGSI_UTIL_SIGN_CLEAR:
		/* abs(r) via a temporary */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		r = temp;
		break;
	case TGSI_UTIL_SIGN_TOGGLE:
		if (neg)
			r->neg = 1; /* hardware negate flag */
		else {
			temp = temp_temp(pc);
			emit_neg(pc, temp, r);
			r = temp;
		}
		break;
	case TGSI_UTIL_SIGN_SET:
		/* -abs(r) */
		temp = temp_temp(pc);
		emit_abs(pc, temp, r);
		if (neg)
			temp->neg = 1;
		else
			emit_neg(pc, temp, temp);
		r = temp;
		break;
	default:
		assert(0);
		break;
	}

	return r;
}

/* return TRUE for ops that produce only a single result */
static boolean
is_scalar_op(unsigned op)
{
	switch (op) {
	case TGSI_OPCODE_COS:
	case TGSI_OPCODE_DP2:
	case TGSI_OPCODE_DP3:
	case TGSI_OPCODE_DP4:
	case TGSI_OPCODE_DPH:
	case TGSI_OPCODE_EX2:
	case TGSI_OPCODE_LG2:
	case TGSI_OPCODE_POW:
	case TGSI_OPCODE_RCP:
	case TGSI_OPCODE_RSQ:
	case TGSI_OPCODE_SIN:
	/*
	case TGSI_OPCODE_KIL:
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	*/
		return TRUE;
	default:
		return FALSE;
	}
}

/* Returns a bitmask indicating which dst components depend
 * on source s, component c
   (reverse of nv50_tgsi_src_mask).
 */
static unsigned
nv50_tgsi_dst_revdep(unsigned op, int s, int c)
{
	if (is_scalar_op(op))
		return 0x1; /* broadcast from a single computation */

	switch (op) {
	case TGSI_OPCODE_DST:
		return (1 << c) & (s ? 0xa : 0x6);
	case TGSI_OPCODE_XPD:
		switch (c) {
		case 0: return 0x6;
		case 1: return 0x5;
		case 2: return 0x3;
		case 3: return 0x0;
		default:
			assert(0);
			return 0x0;
		}
	case TGSI_OPCODE_LIT:
	case TGSI_OPCODE_SCS:
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXP:
		/* these take care of dangerous swizzles themselves */
		return 0x0;
	case TGSI_OPCODE_IF:
	case TGSI_OPCODE_KIL:
		/* don't call this function for these ops */
		assert(0);
		return 0;
	default:
		/* linear vector instruction */
		return (1 << c);
	}
}

/* Return TRUE if long, non-immediate exec e is predicated on
 * condition code cc (bits 7..10 of inst[1]).
 */
static INLINE boolean
has_pred(struct nv50_program_exec *e, unsigned cc)
{
	if (!is_long(e) || is_immd(e))
		return FALSE;
	return ((e->inst[1] & 0x780) == (cc << 7));
}

/* on ENDIF see if we can do "@p0.neu single_op" instead of:
 * join_at ENDIF
 * @p0.eq bra ENDIF
 * single_op
 * ENDIF: nop.join
 */
static boolean
nv50_kill_branch(struct nv50_pc *pc)
{
	int lvl = pc->if_lvl;

	/* only applicable when exactly one insn sits between BRA and ENDIF */
	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
		return FALSE;

	/* if ccode == 'true', the BRA is from an ELSE and the predicate
	 * reg may no longer be valid, since we currently always use $p0
	 */
	if (has_pred(pc->if_insn[lvl], 0xf))
		return FALSE;
	assert(pc->if_insn[lvl] && pc->br_join[lvl]);

	/* We'll use the exec allocated for JOIN_AT (as we can't easily
	 * update prev's next); if exec_tail is BRK, update the pointer.
	 */
	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];

	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */

	/* move the single predicated op into the JOIN_AT slot */
	*pc->br_join[lvl] = *pc->p->exec_tail;

	FREE(pc->if_insn[lvl]);
	FREE(pc->p->exec_tail);

	pc->p->exec_tail = pc->br_join[lvl];
	pc->p->exec_tail->next = NULL;
	set_pred(pc, 0xd, 0, pc->p->exec_tail);

	return TRUE;
}

/* Translate a single TGSI instruction into nv50 code.
 *
 * Reads the write mask and saturation flag, resolves dst/src registers
 * (honouring pc->r_dst overrides and pc->r_brdc for scalar broadcast),
 * emits the opcode-specific instruction sequence, then broadcasts or
 * saturates results as needed and releases per-insn temporaries and
 * inlined immediates.  Returns FALSE on an unhandled opcode.
 */
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
		     const struct tgsi_full_instruction *inst)
{
	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
	unsigned mask, sat, unit;
	int i, c;

	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;

	memset(src, 0, sizeof(src));

	for (c = 0; c < 4; c++) {
		if ((mask & (1 << c)) && !pc->r_dst[c])
			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
		else
			dst[c] = pc->r_dst[c];
		rdst[c] = dst[c];
	}

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
		unsigned src_mask;
		boolean neg_supp;

		src_mask = nv50_tgsi_src_mask(inst, i);
		neg_supp = negate_supported(inst, i);

		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
			unit = fs->SrcRegister.Index;

		for (c = 0; c < 4; c++)
			if (src_mask & (1 << c))
				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
	}

	/* brdc: broadcast destination for scalar ops; compute into a
	 * temp first when the target can't be read back or must be
	 * saturated afterwards
	 */
	brdc = temp = pc->r_brdc;
	if (brdc && brdc->type != P_TEMP) {
		temp = temp_temp(pc);
		if (sat)
			brdc = temp;
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
				continue;
			rdst[c] = dst[c];
			dst[c] = temp_temp(pc);
		}
	}

	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ABS:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_abs(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_ADD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_add(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_BGNLOOP:
		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
		break;
	case TGSI_OPCODE_BRK:
		emit_branch(pc, -1, 0, NULL);
		assert(pc->loop_lvl > 0);
		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
		break;
	case TGSI_OPCODE_CEIL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
		}
		break;
	case TGSI_OPCODE_COS:
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 5, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 5, brdc, temp);
		break;
	case TGSI_OPCODE_DDX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_ddx(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_DDY:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_ddy(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_DP3:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
		break;
	case TGSI_OPCODE_DP4:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
		break;
	case TGSI_OPCODE_DPH:
		emit_mul(pc, temp, src[0][0], src[1][0]);
		emit_mad(pc, temp, src[0][1], src[1][1], temp);
		emit_mad(pc, temp, src[0][2], src[1][2], temp);
		emit_add(pc, brdc, src[1][3], temp);
		break;
	case TGSI_OPCODE_DST:
		if (mask & (1 << 1))
			emit_mul(pc, dst[1], src[0][1], src[1][1]);
		if (mask & (1 << 2))
			emit_mov(pc, dst[2], src[0][2]);
		if (mask & (1 << 3))
			emit_mov(pc, dst[3], src[1][3]);
		if (mask & (1 << 0))
			emit_mov_immdval(pc, dst[0], 1.0f);
		break;
	case TGSI_OPCODE_ELSE:
		emit_branch(pc, -1, 0, NULL);
		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
		break;
	case TGSI_OPCODE_ENDIF:
		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;

		/* try to replace branch over 1 insn with a predicated insn */
		if (nv50_kill_branch(pc) == TRUE)
			break;

		if (pc->br_join[pc->if_lvl]) {
			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
			pc->br_join[pc->if_lvl] = NULL;
		}
		/* emit a NOP as join point, we could set it on the next
		 * one, but would have to make sure it is long and !immd
		 */
		emit_nop(pc);
		pc->p->exec_tail->inst[1] |= 2;
		break;
	case TGSI_OPCODE_ENDLOOP:
		emit_branch(pc, -1, 0, NULL);
		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
		break;
	case TGSI_OPCODE_EX2:
		emit_preex2(pc, temp, src[0][0]);
		emit_flop(pc, 6, brdc, temp);
		break;
	case TGSI_OPCODE_FLR:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_FRC:
		/* frc(x) = x - floor(x) */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_flr(pc, temp, src[0][c]);
			emit_sub(pc, dst[c], src[0][c], temp);
		}
		break;
	case TGSI_OPCODE_IF:
		/* emitting a join_at may not be necessary */
		assert(pc->if_lvl < MAX_IF_DEPTH);
		set_pred_wr(pc, 1, 0, pc->if_cond);
		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
		break;
	case TGSI_OPCODE_KIL:
		emit_kil(pc, src[0][0]);
		emit_kil(pc, src[0][1]);
		emit_kil(pc, src[0][2]);
		emit_kil(pc, src[0][3]);
		break;
	case TGSI_OPCODE_LIT:
		emit_lit(pc, &dst[0], mask, &src[0][0]);
		break;
	case TGSI_OPCODE_LG2:
		emit_flop(pc, 3, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_LRP:
		/* lrp(a, x, y) = (x - y) * a + y */
		temp = temp_temp(pc);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, temp, src[1][c], src[2][c]);
			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAD:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
		}
		break;
	case TGSI_OPCODE_MAX:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MIN:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_MOV:
	case TGSI_OPCODE_SWZ:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mov(pc, dst[c], src[0][c]);
		}
		break;
	case TGSI_OPCODE_MUL:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_mul(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_POW:
		emit_pow(pc, brdc, src[0][0], src[1][0]);
		break;
	case TGSI_OPCODE_RCP:
		emit_flop(pc, 0, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_RSQ:
		emit_flop(pc, 2, brdc, src[0][0]);
		break;
	case TGSI_OPCODE_SCS:
		temp = temp_temp(pc);
		if (mask & 3)
			emit_precossin(pc, temp, src[0][0]);
		if (mask & (1 << 0))
			emit_flop(pc, 5, dst[0], temp);
		if (mask & (1 << 1))
			emit_flop(pc, 4, dst[1], temp);
		if (mask & (1 << 2))
			emit_mov_immdval(pc, dst[2], 0.0);
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_SIN:
		if (mask & 8) {
			emit_precossin(pc, temp, src[0][3]);
			emit_flop(pc, 4, dst[3], temp);
			if (!(mask &= 7))
				break;
			if (temp == dst[3])
				temp = brdc = temp_temp(pc);
		}
		emit_precossin(pc, temp, src[0][0]);
		emit_flop(pc, 4, brdc, temp);
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_SUB:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_sub(pc, dst[c], src[0][c], src[1][c]);
		}
		break;
	case TGSI_OPCODE_TEX:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionExtTexture.Texture, FALSE);
		break;
	case TGSI_OPCODE_TXP:
		emit_tex(pc, dst, mask, src[0], unit,
			 inst->InstructionExtTexture.Texture, TRUE);
		break;
	case TGSI_OPCODE_TRUNC:
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			emit_cvt(pc, dst[c], src[0][c], -1,
				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
		}
		break;
	case TGSI_OPCODE_XPD:
		temp = temp_temp(pc);
		if (mask & (1 << 0)) {
			emit_mul(pc, temp, src[0][2], src[1][1]);
			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
		}
		if (mask & (1 << 1)) {
			emit_mul(pc, temp, src[0][0], src[1][2]);
			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
		}
		if (mask & (1 << 2)) {
			emit_mul(pc, temp, src[0][1], src[1][0]);
			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
		}
		if (mask & (1 << 3))
			emit_mov_immdval(pc, dst[3], 1.0);
		break;
	case TGSI_OPCODE_END:
		break;
	default:
		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
		return FALSE;
	}

	/* broadcast a scalar result, or saturate per-component results */
	if (brdc) {
		if (sat)
			emit_sat(pc, brdc, brdc);
		for (c = 0; c < 4; c++)
			if ((mask & (1 << c)) && dst[c] != brdc)
				emit_mov(pc, dst[c], brdc);
	} else
	if (sat) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			/* in this case we saturate later */
			if (dst[c]->type == P_TEMP && dst[c]->index < 0)
				continue;
			emit_sat(pc, rdst[c], dst[c]);
		}
	}

	/* clear hw negate flags and free inlined immediates */
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		for (c = 0; c < 4; c++) {
			if (!src[i][c])
				continue;
			src[i][c]->neg = 0;
			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
				FREE(src[i][c]);
		}
	}

	kill_temp_temp(pc);
	return TRUE;
}

/* First pass over an instruction: record, per register component that
 * is written or read, the instruction number (pc->insn_nr) as last
 * access in nv50_reg::acc.
 */
static void
prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
{
	struct nv50_reg *reg = NULL;
	const struct tgsi_full_src_register *src;
	const struct tgsi_dst_register *dst;
	unsigned i, c, k, mask;

	dst = &insn->FullDstRegisters[0].DstRegister;
	mask = dst->WriteMask;

	if (dst->File == TGSI_FILE_TEMPORARY)
		reg = pc->temp;
	else
	if (dst->File == TGSI_FILE_OUTPUT)
		reg = pc->result;

	if (reg) {
		for (c = 0; c < 4; c++) {
			if (!(mask & (1 << c)))
				continue;
			reg[dst->Index * 4 + c].acc = pc->insn_nr;
		}
	}

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		src = &insn->FullSrcRegisters[i];

		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
			reg = pc->temp;
		else
		if (src->SrcRegister.File == TGSI_FILE_INPUT)
			reg = pc->attr;
		else
			continue;

		mask =
nv50_tgsi_src_mask(insn, i); 2026 2027 for (c = 0; c < 4; c++) { 2028 if (!(mask & (1 << c))) 2029 continue; 2030 k = tgsi_util_get_full_src_register_extswizzle(src, c); 2031 2032 if (k > TGSI_EXTSWIZZLE_W) 2033 continue; 2034 2035 reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr; 2036 } 2037 } 2038} 2039 2040/* Returns a bitmask indicating which dst components need to be 2041 * written to temporaries first to avoid 'corrupting' sources. 2042 * 2043 * m[i] (out) indicate component to write in the i-th position 2044 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source 2045 */ 2046static unsigned 2047nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) 2048{ 2049 unsigned i, c, x, unsafe; 2050 2051 for (c = 0; c < 4; c++) 2052 m[c] = c; 2053 2054 /* Swap as long as a dst component written earlier is depended on 2055 * by one written later, but the next one isn't depended on by it. 2056 */ 2057 for (c = 0; c < 3; c++) { 2058 if (rdep[m[c + 1]] & (1 << m[c])) 2059 continue; /* if next one is depended on by us */ 2060 for (i = c + 1; i < 4; i++) 2061 /* if we are depended on by a later one */ 2062 if (rdep[m[c]] & (1 << m[i])) 2063 break; 2064 if (i == 4) 2065 continue; 2066 /* now, swap */ 2067 x = m[c]; 2068 m[c] = m[c + 1]; 2069 m[c + 1] = x; 2070 2071 /* restart */ 2072 c = 0; 2073 } 2074 2075 /* mark dependencies that could not be resolved by reordering */ 2076 for (i = 0; i < 3; ++i) 2077 for (c = i + 1; c < 4; ++c) 2078 if (rdep[m[i]] & (1 << m[c])) 2079 unsafe |= (1 << i); 2080 2081 /* NOTE: $unsafe is with respect to order, not component */ 2082 return unsafe; 2083} 2084 2085/* Select a suitable dst register for broadcasting scalar results, 2086 * or return NULL if we have to allocate an extra TEMP. 2087 * 2088 * If e.g. only 1 component is written, we may also emit the final 2089 * result to a write-only register. 
 */
static struct nv50_reg *
tgsi_broadcast_dst(struct nv50_pc *pc,
		   const struct tgsi_full_dst_register *fd, unsigned mask)
{
	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
		/* pick a written component that is not also a source */
		int c = ffs(~mask & fd->DstRegister.WriteMask);
		if (c)
			return tgsi_dst(pc, c - 1, fd);
	} else {
		/* non-TEMP dst is usable only if exactly one component
		 * is written (it may not be readable for broadcasting)
		 */
		int c = ffs(fd->DstRegister.WriteMask) - 1;
		if ((1 << c) == fd->DstRegister.WriteMask)
			return tgsi_dst(pc, c, fd);
	}

	return NULL;
}

/* Scan source swizzles and return a bitmask indicating dst regs that
 * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
 */
static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
		       unsigned rdep[4])
{
	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
	const struct tgsi_full_src_register *fs;
	unsigned i, deqs = 0;

	for (i = 0; i < 4; ++i)
		rdep[i] = 0;

	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
		boolean neg_supp = negate_supported(insn, i);

		fs = &insn->FullSrcRegisters[i];
		if (fs->SrcRegister.File != fd->DstRegister.File ||
		    fs->SrcRegister.Index != fd->DstRegister.Index)
			continue;

		for (chn = 0; chn < 4; ++chn) {
			unsigned s, c;

			if (!(mask & (1 << chn))) /* src is not read */
				continue;
			c = tgsi_util_get_full_src_register_extswizzle(fs, chn);
			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);

			if (c > TGSI_EXTSWIZZLE_W ||
			    !(fd->DstRegister.WriteMask & (1 << c)))
				continue;

			/* no danger if src is copied to TEMP first */
			if ((s != TGSI_UTIL_SIGN_KEEP) &&
			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
				continue;

			rdep[c] |= nv50_tgsi_dst_revdep(
				insn->Instruction.Opcode, i, chn);
			deqs |= (1 << c);
		}
	}

	return deqs;
}

static boolean
/* Translate one TGSI instruction token, handling overlap between its
 * dst and src registers: scalar ops get a broadcast dst (pc->r_brdc);
 * vector ops with dst/src overlap are split into per-component emits
 * in a safe order, redirecting 'unsafe' components through temporaries
 * that are copied (and possibly saturated) back afterwards.
 */
nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
{
	struct tgsi_full_instruction insn = tok->FullInstruction;
	const struct tgsi_full_dst_register *fd;
	unsigned i, deqs, rdep[4], m[4];

	fd = &tok->FullInstruction.FullDstRegisters[0];
	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);

	if (is_scalar_op(insn.Instruction.Opcode)) {
		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
		if (!pc->r_brdc)
			pc->r_brdc = temp_temp(pc);
		return nv50_program_tx_insn(pc, &insn);
	}
	pc->r_brdc = NULL;

	if (!deqs) /* no dst/src overlap, emit as-is */
		return nv50_program_tx_insn(pc, &insn);

	deqs = nv50_revdep_reorder(m, rdep);

	for (i = 0; i < 4; ++i) {
		assert(pc->r_dst[m[i]] == NULL);

		insn.FullDstRegisters[0].DstRegister.WriteMask =
			fd->DstRegister.WriteMask & (1 << m[i]);

		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
			continue;

		if (deqs & (1 << i))
			pc->r_dst[m[i]] = alloc_temp(pc, NULL);

		if (!nv50_program_tx_insn(pc, &insn))
			return FALSE;
	}

	/* copy redirected components back to their real destination */
	for (i = 0; i < 4; i++) {
		struct nv50_reg *reg = pc->r_dst[i];
		if (!reg)
			continue;
		pc->r_dst[i] = NULL;

		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
		else
			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
		free_temp(pc, reg);
	}

	return TRUE;
}

/* Emit the interpolation instruction for input reg, allocating and
 * initializing the 1/w interpolant (iv) on first perspective use.
 */
static void
load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
{
	struct nv50_reg *iv, **ppiv;
	unsigned mode = pc->interp_mode[reg->index];

	ppiv = (mode & INTERP_CENTROID) ?
	       &pc->iv_c : &pc->iv_p;
	iv = *ppiv;

	if ((mode & INTERP_PERSPECTIVE) && !iv) {
		iv = *ppiv = alloc_temp(pc, NULL);
		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;

		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
		emit_flop(pc, 0, iv, iv); /* iv = 1 / w */

		/* XXX: when loading interpolants dynamically, move these
		 * to the program head, or make sure it can't be skipped.
		 */
	}

	emit_interp(pc, reg, iv, mode);
}

/* Pre-translation pass: walk all tokens to record immediates,
 * declarations (interpolation modes, output semantics) and register
 * usage, then assign hardware register IDs for inputs/outputs and
 * build the shader's I/O configuration (p->cfg).
 * Returns FALSE on malformed declarations or allocation failure.
 */
static boolean
nv50_program_tx_prep(struct nv50_pc *pc)
{
	struct tgsi_parse_context tp;
	struct nv50_program *p = pc->p;
	boolean ret = FALSE;
	unsigned i, c, flat_nr = 0;

	tgsi_parse_init(&tp, pc->p->pipe.tokens);
	while (!tgsi_parse_end_of_tokens(&tp)) {
		const union tgsi_full_token *tok = &tp.FullToken;

		tgsi_parse_token(&tp);
		switch (tok->Token.Type) {
		case TGSI_TOKEN_TYPE_IMMEDIATE:
		{
			const struct tgsi_full_immediate *imm =
				&tp.FullToken.FullImmediate;

			ctor_immd(pc, imm->u[0].Float,
				      imm->u[1].Float,
				      imm->u[2].Float,
				      imm->u[3].Float);
		}
			break;
		case TGSI_TOKEN_TYPE_DECLARATION:
		{
			const struct tgsi_full_declaration *d;
			unsigned si, last, first, mode;

			d = &tp.FullToken.FullDeclaration;
			first = d->DeclarationRange.First;
			last = d->DeclarationRange.Last;

			switch (d->Declaration.File) {
			case TGSI_FILE_TEMPORARY:
				break;
			case TGSI_FILE_OUTPUT:
				if (!d->Declaration.Semantic ||
				    p->type == PIPE_SHADER_FRAGMENT)
					break;

				si = d->Semantic.SemanticIndex;
				switch (d->Semantic.SemanticName) {
				case TGSI_SEMANTIC_BCOLOR:
					p->cfg.two_side[si].hw = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				case TGSI_SEMANTIC_PSIZE:
					p->cfg.psiz = first;
					if (p->cfg.io_nr > first)
						p->cfg.io_nr = first;
					break;
				/*
				case TGSI_SEMANTIC_CLIP_DISTANCE:
					p->cfg.clpd = MIN2(p->cfg.clpd, first);
					break;
				*/
				default:
					break;
				}
				break;
			case TGSI_FILE_INPUT:
			{
				if (p->type != PIPE_SHADER_FRAGMENT)
					break;

				switch (d->Declaration.Interpolate) {
				case TGSI_INTERPOLATE_CONSTANT:
					mode = INTERP_FLAT;
					flat_nr++;
					break;
				case TGSI_INTERPOLATE_PERSPECTIVE:
					mode = INTERP_PERSPECTIVE;
					p->cfg.regs[1] |= 0x08 << 24;
					break;
				default:
					mode = INTERP_LINEAR;
					break;
				}
				if (d->Declaration.Centroid)
					mode |= INTERP_CENTROID;

				assert(last < 32);
				for (i = first; i <= last; i++)
					pc->interp_mode[i] = mode;
			}
				break;
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_SAMPLER:
				break;
			default:
				NOUVEAU_ERR("bad decl file %d\n",
					    d->Declaration.File);
				goto out_err;
			}
		}
			break;
		case TGSI_TOKEN_TYPE_INSTRUCTION:
			pc->insn_nr++;
			prep_inspect_insn(pc, &tok->FullInstruction);
			break;
		default:
			break;
		}
	}

	if (p->type == PIPE_SHADER_VERTEX) {
		int rid = 0;

		/* assign hw slots to accessed attributes only */
		for (i = 0; i < pc->attr_nr * 4; ++i) {
			if (pc->attr[i].acc) {
				pc->attr[i].hw = rid++;
				p->cfg.attr[i / 32] |= 1 << (i % 32);
			}
		}

		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
			p->cfg.io[i].hw = rid;
			p->cfg.io[i].id_vp = i;

			for (c = 0; c < 4; ++c) {
				int n = i * 4 + c;
				if (!pc->result[n].acc)
					continue;
				pc->result[n].hw = rid++;
				p->cfg.io[i].mask |= 1 << c;
			}
		}

		/* resolve semantic indices (stored in .hw) to io entries */
		for (c = 0; c < 2; ++c)
			if (p->cfg.two_side[c].hw < 0x40)
				p->cfg.two_side[c] = p->cfg.io[
					p->cfg.two_side[c].hw];

		if (p->cfg.psiz < 0x40)
			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
	} else
	if (p->type == PIPE_SHADER_FRAGMENT) {
		int rid, aid;
		unsigned n = 0, m = pc->attr_nr - flat_nr;

		int base = (TGSI_SEMANTIC_POSITION ==
			    p->info.input_semantic_name[0]) ? 0 : 1;

		/* non-flat interpolants have to be mapped to
		 * the lower hardware IDs, so sort them:
		 */
		for (i = 0; i < pc->attr_nr; i++) {
			if (pc->interp_mode[i] == INTERP_FLAT) {
				p->cfg.io[m].id_vp = i + base;
				p->cfg.io[m++].id_fp = i;
			} else {
				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
					p->cfg.io[n].linear = TRUE;
				p->cfg.io[n].id_vp = i + base;
				p->cfg.io[n++].id_fp = i;
			}
		}

		if (!base) /* set w-coordinate mask from perspective interp */
			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;

		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);

		for (n = 0; n < pc->attr_nr; ++n) {
			p->cfg.io[n].hw = rid = aid;
			i = p->cfg.io[n].id_fp;

			for (c = 0; c < 4; ++c) {
				if (!pc->attr[i * 4 + c].acc)
					continue;
				pc->attr[i * 4 + c].rhw = rid++;
				p->cfg.io[n].mask |= 1 << c;

				load_interpolant(pc, &pc->attr[i * 4 + c]);
			}
			aid += popcnt4(p->cfg.io[n].mask);
		}

		if (!base)
			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;

		m = popcnt4(p->cfg.regs[1] >> 24);

		/* set count of non-position inputs and of non-flat
		 * non-position inputs for FP_INTERPOLANT_CTRL
		 */
		p->cfg.regs[1] |= aid - m;

		if (flat_nr) {
			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
			p->cfg.regs[1] |= (i - m) << 16;
		} else
			p->cfg.regs[1] |= p->cfg.regs[1] << 16;

		/* mark color semantic for light-twoside */
		n = 0x40;
		for (i = 0; i < pc->attr_nr; i++) {
			ubyte si, sn;

			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];

			if (sn == TGSI_SEMANTIC_COLOR) {
				p->cfg.two_side[si] = p->cfg.io[i];

				/* increase colour count */
				p->cfg.regs[0] += popcnt4(
					p->cfg.two_side[si].mask) << 16;

				n = MIN2(n, p->cfg.io[i].hw - m);
			}
		}
		if (n < 0x40)
			p->cfg.regs[0] += n;

		/* Initialize FP results:
		 * FragDepth is always first TGSI and last hw output
		 */
		i = p->info.writes_z ? 4 : 0;
		for (rid = 0; i < pc->result_nr * 4; i++)
			pc->result[i].rhw = rid++;
		if (p->info.writes_z)
			pc->result[2].rhw = rid;

		p->cfg.high_result = rid;
	}

	if (pc->immd_nr) {
		int rid = 0;

		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->immd)
			goto out_err;

		for (i = 0; i < pc->immd_nr; i++) {
			for (c = 0; c < 4; c++, rid++)
				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
		}
	}

	ret = TRUE;
out_err:
	/* interpolant temps were only needed while emitting interp insns */
	if (pc->iv_p)
		free_temp(pc, pc->iv_p);
	if (pc->iv_c)
		free_temp(pc, pc->iv_c);

	tgsi_parse_free(&tp);
	return ret;
}

/* Free all register arrays of the program context and the context
 * itself.
 */
static void
free_nv50_pc(struct nv50_pc *pc)
{
	if (pc->immd)
		FREE(pc->immd);
	if (pc->param)
		FREE(pc->param);
	if (pc->result)
		FREE(pc->result);
	if (pc->attr)
		FREE(pc->attr);
	if (pc->temp)
		FREE(pc->temp);

	FREE(pc);
}

/* Initialize the program context from the shader info: allocate the
 * per-file nv50_reg arrays (temp/attr/result/param) and set default
 * hw configuration values per shader type.
 * Returns FALSE on allocation failure (caller frees via free_nv50_pc).
 */
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
	int i, c;
	unsigned rtype[2] = { P_ATTR, P_RESULT };

	pc->p = p;
	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;

	p->cfg.high_temp = 4;

	p->cfg.two_side[0].hw = 0x40;
	p->cfg.two_side[1].hw = 0x40;

	switch (p->type) {
	case PIPE_SHADER_VERTEX:
		p->cfg.psiz = 0x40;
		p->cfg.clpd = 0x40;
		p->cfg.io_nr = pc->result_nr;
		break;
	case PIPE_SHADER_FRAGMENT:
		/* FP inputs/outputs live in temps (interpolated/moved) */
		rtype[0] = rtype[1] = P_TEMP;

		p->cfg.regs[0] = 0x01000004;
		p->cfg.io_nr =
			       pc->attr_nr;

		if (p->info.writes_z) {
			p->cfg.regs[2] |= 0x00000100;
			p->cfg.regs[3] |= 0x00000011;
		}
		if (p->info.uses_kill)
			p->cfg.regs[2] |= 0x00100000;
		break;
	}

	if (pc->temp_nr) {
		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->temp)
			return FALSE;

		for (i = 0; i < pc->temp_nr * 4; ++i)
			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
	}

	if (pc->attr_nr) {
		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->attr)
			return FALSE;

		for (i = 0; i < pc->attr_nr * 4; ++i)
			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
	}

	if (pc->result_nr) {
		unsigned nr = pc->result_nr * 4;

		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
		if (!pc->result)
			return FALSE;

		for (i = 0; i < nr; ++i)
			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
	}

	if (pc->param_nr) {
		int rid = 0;

		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
		if (!pc->param)
			return FALSE;

		for (i = 0; i < pc->param_nr; ++i)
			for (c = 0; c < 4; ++c, ++rid)
				ctor_reg(&pc->param[rid], P_CONST, i, rid);
	}

	return TRUE;
}

/* Move FP results from the registers they were computed in (hw) to
 * the hardware output registers (rhw) where they differ.
 */
static void
nv50_fp_move_results(struct nv50_pc *pc)
{
	struct nv50_reg reg;
	unsigned i;

	ctor_reg(&reg, P_TEMP, -1, -1);

	for (i = 0; i < pc->result_nr * 4; ++i) {
		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
			continue;
		if (pc->result[i].rhw != pc->result[i].hw) {
			reg.hw = pc->result[i].rhw;
			emit_mov(pc, &reg, &pc->result[i]);
		}
	}
}

/* Post-pass over the emitted instruction list: pad so no 32 bit
 * instruction ends up unpaired (adjusting branch targets), and set
 * the end bit on the (long) last instruction.
 */
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
	struct nv50_program_exec *e, *prev = NULL, **bra_list;
	unsigned i, n, pos;

	/* NOTE(review): CALLOC result is not checked before use below —
	 * confirm the project's OOM policy, or add a check.
	 */
	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));

	/* Collect branch instructions, we need to adjust
their offsets 2618 * when converting 32 bit instructions to 64 bit ones 2619 */ 2620 for (n = 0, e = pc->p->exec_head; e; e = e->next) 2621 if (e->param.index >= 0 && !e->param.mask) 2622 bra_list[n++] = e; 2623 2624 /* Make sure we don't have any single 32 bit instructions. */ 2625 for (e = pc->p->exec_head, pos = 0; e; e = e->next) { 2626 pos += is_long(e) ? 2 : 1; 2627 2628 if ((pos & 1) && (!e->next || is_long(e->next))) { 2629 for (i = 0; i < n; ++i) 2630 if (bra_list[i]->param.index >= pos) 2631 bra_list[i]->param.index += 1; 2632 convert_to_long(pc, e); 2633 ++pos; 2634 } 2635 if (e->next) 2636 prev = e; 2637 } 2638 2639 assert(!is_immd(pc->p->exec_head)); 2640 assert(!is_immd(pc->p->exec_tail)); 2641 2642 /* last instruction must be long so it can have the end bit set */ 2643 if (!is_long(pc->p->exec_tail)) { 2644 convert_to_long(pc, pc->p->exec_tail); 2645 if (prev) 2646 convert_to_long(pc, prev); 2647 } 2648 assert(!(pc->p->exec_tail->inst[1] & 2)); 2649 /* set the end-bit */ 2650 pc->p->exec_tail->inst[1] |= 1; 2651 2652 FREE(bra_list); 2653} 2654 2655static boolean 2656nv50_program_tx(struct nv50_program *p) 2657{ 2658 struct tgsi_parse_context parse; 2659 struct nv50_pc *pc; 2660 boolean ret; 2661 2662 pc = CALLOC_STRUCT(nv50_pc); 2663 if (!pc) 2664 return FALSE; 2665 2666 ret = ctor_nv50_pc(pc, p); 2667 if (ret == FALSE) 2668 goto out_cleanup; 2669 2670 ret = nv50_program_tx_prep(pc); 2671 if (ret == FALSE) 2672 goto out_cleanup; 2673 2674 tgsi_parse_init(&parse, pc->p->pipe.tokens); 2675 while (!tgsi_parse_end_of_tokens(&parse)) { 2676 const union tgsi_full_token *tok = &parse.FullToken; 2677 2678 /* don't allow half insn/immd on first and last instruction */ 2679 pc->allow32 = TRUE; 2680 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) 2681 pc->allow32 = FALSE; 2682 2683 tgsi_parse_token(&parse); 2684 2685 switch (tok->Token.Type) { 2686 case TGSI_TOKEN_TYPE_INSTRUCTION: 2687 ++pc->insn_cur; 2688 ret = nv50_tgsi_insn(pc, tok); 2689 if (ret 
== FALSE) 2690 goto out_err; 2691 break; 2692 default: 2693 break; 2694 } 2695 } 2696 2697 if (pc->p->type == PIPE_SHADER_FRAGMENT) 2698 nv50_fp_move_results(pc); 2699 2700 nv50_program_fixup_insns(pc); 2701 2702 p->param_nr = pc->param_nr * 4; 2703 p->immd_nr = pc->immd_nr * 4; 2704 p->immd = pc->immd_buf; 2705 2706out_err: 2707 tgsi_parse_free(&parse); 2708 2709out_cleanup: 2710 free_nv50_pc(pc); 2711 return ret; 2712} 2713 2714static void 2715nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) 2716{ 2717 if (nv50_program_tx(p) == FALSE) 2718 assert(0); 2719 p->translated = TRUE; 2720} 2721 2722static void 2723nv50_program_upload_data(struct nv50_context *nv50, float *map, 2724 unsigned start, unsigned count, unsigned cbuf) 2725{ 2726 struct nouveau_channel *chan = nv50->screen->base.channel; 2727 struct nouveau_grobj *tesla = nv50->screen->tesla; 2728 2729 while (count) { 2730 unsigned nr = count > 2047 ? 2047 : count; 2731 2732 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2733 OUT_RING (chan, (cbuf << 0) | (start << 8)); 2734 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2735 OUT_RINGp (chan, map, nr); 2736 2737 map += nr; 2738 start += nr; 2739 count -= nr; 2740 } 2741} 2742 2743static void 2744nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 2745{ 2746 struct pipe_screen *pscreen = nv50->pipe.screen; 2747 2748 if (!p->data[0] && p->immd_nr) { 2749 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 2750 2751 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 2752 while (heap->next && heap->size < p->immd_nr) { 2753 struct nv50_program *evict = heap->next->priv; 2754 nouveau_resource_free(&evict->data[0]); 2755 } 2756 2757 if (nouveau_resource_alloc(heap, p->immd_nr, p, 2758 &p->data[0])) 2759 assert(0); 2760 } 2761 2762 /* immediates only need to be uploaded again when freed */ 2763 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 2764 p->immd_nr, NV50_CB_PMISC); 
2765 } 2766 2767 assert(p->param_nr <= 128); 2768 2769 if (p->param_nr) { 2770 unsigned cb; 2771 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 2772 PIPE_BUFFER_USAGE_CPU_READ); 2773 2774 if (p->type == PIPE_SHADER_VERTEX) 2775 cb = NV50_CB_PVP; 2776 else 2777 cb = NV50_CB_PFP; 2778 2779 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); 2780 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 2781 } 2782} 2783 2784static void 2785nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 2786{ 2787 struct nouveau_channel *chan = nv50->screen->base.channel; 2788 struct nouveau_grobj *tesla = nv50->screen->tesla; 2789 struct nv50_program_exec *e; 2790 struct nouveau_stateobj *so; 2791 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; 2792 unsigned start, count, *up, *ptr; 2793 boolean upload = FALSE; 2794 2795 if (!p->bo) { 2796 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 2797 p->exec_size * 4, &p->bo); 2798 upload = TRUE; 2799 } 2800 2801 if (p->data[0] && p->data[0]->start != p->data_start[0]) 2802 upload = TRUE; 2803 2804 if (!upload) 2805 return; 2806 2807 for (e = p->exec_head; e; e = e->next) { 2808 unsigned ei, ci, bs; 2809 2810 if (e->param.index < 0) 2811 continue; 2812 2813 if (e->param.mask == 0) { 2814 assert(!(e->param.index & 1)); 2815 /* seem to be 8 byte steps */ 2816 ei = (e->param.index >> 1) + 0 /* START_ID */; 2817 2818 e->inst[0] &= 0xf0000fff; 2819 e->inst[0] |= ei << 12; 2820 continue; 2821 } 2822 2823 bs = (e->inst[1] >> 22) & 0x07; 2824 assert(bs < 2); 2825 ei = e->param.shift >> 5; 2826 ci = e->param.index; 2827 if (bs == 0) 2828 ci += p->data[bs]->start; 2829 2830 e->inst[ei] &= ~e->param.mask; 2831 e->inst[ei] |= (ci << e->param.shift); 2832 } 2833 2834 if (p->data[0]) 2835 p->data_start[0] = p->data[0]->start; 2836 2837#ifdef NV50_PROGRAM_DUMP 2838 NOUVEAU_ERR("-------\n"); 2839 for (e = p->exec_head; e; e = e->next) { 2840 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 2841 if (is_long(e)) 
2842 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 2843 } 2844#endif 2845 2846 up = ptr = MALLOC(p->exec_size * 4); 2847 for (e = p->exec_head; e; e = e->next) { 2848 *(ptr++) = e->inst[0]; 2849 if (is_long(e)) 2850 *(ptr++) = e->inst[1]; 2851 } 2852 2853 so = so_new(4,2); 2854 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); 2855 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); 2856 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); 2857 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); 2858 2859 start = 0; count = p->exec_size; 2860 while (count) { 2861 struct nouveau_channel *chan = nv50->screen->base.channel; 2862 unsigned nr; 2863 2864 so_emit(chan, so); 2865 2866 nr = MIN2(count, 2047); 2867 nr = MIN2(chan->pushbuf->remaining, nr); 2868 if (chan->pushbuf->remaining < (nr + 3)) { 2869 FIRE_RING(chan); 2870 continue; 2871 } 2872 2873 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2874 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); 2875 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2876 OUT_RINGp (chan, up + start, nr); 2877 2878 start += nr; 2879 count -= nr; 2880 } 2881 2882 FREE(up); 2883 so_ref(NULL, &so); 2884} 2885 2886void 2887nv50_vertprog_validate(struct nv50_context *nv50) 2888{ 2889 struct nouveau_grobj *tesla = nv50->screen->tesla; 2890 struct nv50_program *p = nv50->vertprog; 2891 struct nouveau_stateobj *so; 2892 2893 if (!p->translated) { 2894 nv50_program_validate(nv50, p); 2895 if (!p->translated) 2896 assert(0); 2897 } 2898 2899 nv50_program_validate_data(nv50, p); 2900 nv50_program_validate_code(nv50, p); 2901 2902 so = so_new(13, 2); 2903 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 2904 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2905 NOUVEAU_BO_HIGH, 0, 0); 2906 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2907 NOUVEAU_BO_LOW, 0, 0); 2908 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 2909 so_data (so, p->cfg.attr[0]); 2910 so_data (so, 
p->cfg.attr[1]); 2911 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 2912 so_data (so, p->cfg.high_result); 2913 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 2914 so_data (so, p->cfg.high_result); //8); 2915 so_data (so, p->cfg.high_temp); 2916 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 2917 so_data (so, 0); /* program start offset */ 2918 so_ref(so, &nv50->state.vertprog); 2919 so_ref(NULL, &so); 2920} 2921 2922void 2923nv50_fragprog_validate(struct nv50_context *nv50) 2924{ 2925 struct nouveau_grobj *tesla = nv50->screen->tesla; 2926 struct nv50_program *p = nv50->fragprog; 2927 struct nouveau_stateobj *so; 2928 2929 if (!p->translated) { 2930 nv50_program_validate(nv50, p); 2931 if (!p->translated) 2932 assert(0); 2933 } 2934 2935 nv50_program_validate_data(nv50, p); 2936 nv50_program_validate_code(nv50, p); 2937 2938 so = so_new(64, 2); 2939 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 2940 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2941 NOUVEAU_BO_HIGH, 0, 0); 2942 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2943 NOUVEAU_BO_LOW, 0, 0); 2944 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); 2945 so_data (so, p->cfg.high_temp); 2946 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 2947 so_data (so, p->cfg.high_result); 2948 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1); 2949 so_data (so, p->cfg.regs[2]); 2950 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 2951 so_data (so, p->cfg.regs[3]); 2952 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 2953 so_data (so, 0); /* program start offset */ 2954 so_ref(so, &nv50->state.fragprog); 2955 so_ref(NULL, &so); 2956} 2957 2958static void 2959nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) 2960{ 2961 struct nv50_program *fp = nv50->fragprog; 2962 struct nv50_program *vp = nv50->vertprog; 2963 unsigned i, c, m = base; 2964 2965 /* XXX: This can't work correctly in all cases yet, we either 2966 * have to create TGSI_SEMANTIC_PNTC or 
sprite_coord_mode has 2967 * to be per FP input instead of per VP output 2968 */ 2969 memset(pntc, 0, 8 * sizeof(uint32_t)); 2970 2971 for (i = 0; i < fp->cfg.io_nr; i++) { 2972 uint8_t sn, si; 2973 uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp; 2974 unsigned n = popcnt4(fp->cfg.io[i].mask); 2975 2976 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { 2977 m += n; 2978 continue; 2979 } 2980 2981 sn = vp->info.input_semantic_name[j]; 2982 si = vp->info.input_semantic_index[j]; 2983 2984 if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) { 2985 ubyte mode = 2986 nv50->rasterizer->pipe.sprite_coord_mode[si]; 2987 2988 if (mode == PIPE_SPRITE_COORD_NONE) { 2989 m += n; 2990 continue; 2991 } 2992 } 2993 2994 /* this is either PointCoord or replaced by sprite coords */ 2995 for (c = 0; c < 4; c++) { 2996 if (!(fp->cfg.io[i].mask & (1 << c))) 2997 continue; 2998 pntc[m / 8] |= (c + 1) << ((m % 8) * 4); 2999 ++m; 3000 } 3001 } 3002} 3003 3004static int 3005nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4], 3006 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) 3007{ 3008 int c; 3009 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; 3010 uint8_t *map = (uint8_t *)p_map; 3011 3012 for (c = 0; c < 4; ++c) { 3013 if (mf & 1) { 3014 if (fpi->linear == TRUE) 3015 lin[mid / 32] |= 1 << (mid % 32); 3016 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 
0x41 : 0x40); 3017 } 3018 3019 oid += mv & 1; 3020 mf >>= 1; 3021 mv >>= 1; 3022 } 3023 3024 return mid; 3025} 3026 3027void 3028nv50_linkage_validate(struct nv50_context *nv50) 3029{ 3030 struct nouveau_grobj *tesla = nv50->screen->tesla; 3031 struct nv50_program *vp = nv50->vertprog; 3032 struct nv50_program *fp = nv50->fragprog; 3033 struct nouveau_stateobj *so; 3034 struct nv50_sreg4 dummy, *vpo; 3035 int i, n, c, m = 0; 3036 uint32_t map[16], lin[4], reg[5], pcrd[8]; 3037 3038 memset(map, 0, sizeof(map)); 3039 memset(lin, 0, sizeof(lin)); 3040 3041 reg[1] = 0x00000004; /* low and high clip distance map ids */ 3042 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ 3043 reg[3] = 0x00000000; /* point size map id & enable */ 3044 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ 3045 reg[4] = fp->cfg.regs[1]; /* interpolant info */ 3046 3047 dummy.linear = FALSE; 3048 dummy.mask = 0xf; /* map all components of HPOS */ 3049 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); 3050 3051 dummy.mask = 0x0; 3052 3053 if (vp->cfg.clpd < 0x40) { 3054 for (c = 0; c < vp->cfg.clpd_nr; ++c) 3055 map[m++] = vp->cfg.clpd + c; 3056 reg[1] = (m << 8); 3057 } 3058 3059 reg[0] |= m << 8; /* adjust BFC0 id */ 3060 3061 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ 3062 if (nv50->rasterizer->pipe.light_twoside) { 3063 vpo = &vp->cfg.two_side[0]; 3064 3065 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); 3066 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); 3067 } 3068 3069 reg[0] += m - 4; /* adjust FFC0 id */ 3070 reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ 3071 3072 i = 0; 3073 if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) 3074 i = 1; 3075 for (; i < fp->cfg.io_nr; i++) { 3076 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp]; 3077 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp]; 3078 3079 n = fp->cfg.io[i].id_vp; 3080 if (n >= vp->cfg.io_nr || 
3081 vp->info.output_semantic_name[n] != sn || 3082 vp->info.output_semantic_index[n] != si) 3083 vpo = &dummy; 3084 else 3085 vpo = &vp->cfg.io[n]; 3086 3087 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); 3088 } 3089 3090 if (nv50->rasterizer->pipe.point_size_per_vertex) { 3091 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); 3092 reg[3] = (m++ << 4) | 1; 3093 } 3094 3095 /* now fill the stateobj */ 3096 so = so_new(64, 0); 3097 3098 n = (m + 3) / 4; 3099 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); 3100 so_data (so, m); 3101 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); 3102 so_datap (so, map, n); 3103 3104 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 3105 so_datap (so, reg, 4); 3106 3107 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); 3108 so_data (so, reg[4]); 3109 3110 so_method(so, tesla, 0x1540, 4); 3111 so_datap (so, lin, 4); 3112 3113 if (nv50->rasterizer->pipe.point_sprite) { 3114 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff); 3115 3116 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); 3117 so_datap (so, pcrd, 8); 3118 } 3119 3120 so_ref(so, &nv50->state.programs); 3121 so_ref(NULL, &so); 3122} 3123 3124void 3125nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 3126{ 3127 while (p->exec_head) { 3128 struct nv50_program_exec *e = p->exec_head; 3129 3130 p->exec_head = e->next; 3131 FREE(e); 3132 } 3133 p->exec_tail = NULL; 3134 p->exec_size = 0; 3135 3136 nouveau_bo_ref(NULL, &p->bo); 3137 3138 nouveau_resource_free(&p->data[0]); 3139 3140 p->translated = 0; 3141} 3142