nv50_program.c revision d3a9cf54c0a95fb60ac8921e100d51b53c44541b
/*
 * Copyright 2008 Ben Skeggs
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_inlines.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"

#include "nv50_context.h"

#define NV50_SU_MAX_TEMP 64
//#define NV50_PROGRAM_DUMP

/* ARL - gallium fails on progs/vp/arl.txt
 *
 * MSB - Like MAD, but MUL+SUB
 *	- Drop it; instead introduce a way to negate args for ops that
 *	  support it.
 *
 * Look into inlining IMMD for ops other than MOV (make it general?)
 *	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
 *	  but can emit to P_TEMP first - then MOV later.  NVIDIA does this
 *
 * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
 * case, if the emit_src() causes the inst to suddenly become long.
49 * 50 * Verify half-insns work where expected - and force disable them where they 51 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 52 * 53 * FUCK! watch dst==src vectors, can overwrite components that are needed. 54 * ie. SUB R0, R0.yzxw, R0 55 * 56 * Things to check with renouveau: 57 * FP attr/result assignment - how? 58 * attrib 59 * - 0x16bc maps vp output onto fp hpos 60 * - 0x16c0 maps vp output onto fp col0 61 * result 62 * - colr always 0-3 63 * - depr always 4 64 * 0x16bc->0x16e8 --> some binding between vp/fp regs 65 * 0x16b8 --> VP output count 66 * 67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 68 * "MOV rcol.x, fcol.y" = 0x00000004 69 * 0x19a8 --> as above but 0x00000100 and 0x00000000 70 * - 0x00100000 used when KIL used 71 * 0x196c --> as above but 0x00000011 and 0x00000000 72 * 73 * 0x1988 --> 0xXXNNNNNN 74 * - XX == FP high something 75 */ 76struct nv50_reg { 77 enum { 78 P_TEMP, 79 P_ATTR, 80 P_RESULT, 81 P_CONST, 82 P_IMMD 83 } type; 84 int index; 85 86 int hw; 87 int neg; 88 89 int rhw; /* result hw for FP outputs, or interpolant index */ 90 int acc; /* instruction where this reg is last read (first insn == 1) */ 91}; 92 93struct nv50_pc { 94 struct nv50_program *p; 95 96 /* hw resources */ 97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 98 99 /* tgsi resources */ 100 struct nv50_reg *temp; 101 int temp_nr; 102 struct nv50_reg *attr; 103 int attr_nr; 104 struct nv50_reg *result; 105 int result_nr; 106 struct nv50_reg *param; 107 int param_nr; 108 struct nv50_reg *immd; 109 float *immd_buf; 110 int immd_nr; 111 112 struct nv50_reg *temp_temp[16]; 113 unsigned temp_temp_nr; 114 115 /* broadcast and destination replacement regs */ 116 struct nv50_reg *r_brdc; 117 struct nv50_reg *r_dst[4]; 118 119 unsigned interp_mode[32]; 120 /* perspective interpolation registers */ 121 struct nv50_reg *iv_p; 122 struct nv50_reg *iv_c; 123 124 /* current instruction and total number of insns */ 125 unsigned insn_cur; 126 
unsigned insn_nr; 127 128 boolean allow32; 129}; 130 131static INLINE void 132ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) 133{ 134 reg->type = type; 135 reg->index = index; 136 reg->hw = hw; 137 reg->neg = 0; 138 reg->rhw = -1; 139 reg->acc = 0; 140} 141 142static INLINE unsigned 143popcnt4(uint32_t val) 144{ 145 static const unsigned cnt[16] 146 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 147 return cnt[val & 0xf]; 148} 149 150static void 151alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 152{ 153 int i = 0; 154 155 if (reg->type == P_RESULT) { 156 if (pc->p->cfg.high_result < (reg->hw + 1)) 157 pc->p->cfg.high_result = reg->hw + 1; 158 } 159 160 if (reg->type != P_TEMP) 161 return; 162 163 if (reg->hw >= 0) { 164 /*XXX: do this here too to catch FP temp-as-attr usage.. 165 * not clean, but works */ 166 if (pc->p->cfg.high_temp < (reg->hw + 1)) 167 pc->p->cfg.high_temp = reg->hw + 1; 168 return; 169 } 170 171 if (reg->rhw != -1) { 172 /* try to allocate temporary with index rhw first */ 173 if (!(pc->r_temp[reg->rhw])) { 174 pc->r_temp[reg->rhw] = reg; 175 reg->hw = reg->rhw; 176 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 177 pc->p->cfg.high_temp = reg->rhw + 1; 178 return; 179 } 180 /* make sure we don't get things like $r0 needs to go 181 * in $r1 and $r1 in $r0 182 */ 183 i = pc->result_nr * 4; 184 } 185 186 for (; i < NV50_SU_MAX_TEMP; i++) { 187 if (!(pc->r_temp[i])) { 188 pc->r_temp[i] = reg; 189 reg->hw = i; 190 if (pc->p->cfg.high_temp < (i + 1)) 191 pc->p->cfg.high_temp = i + 1; 192 return; 193 } 194 } 195 196 assert(0); 197} 198 199static struct nv50_reg * 200alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 201{ 202 struct nv50_reg *r; 203 int i; 204 205 if (dst && dst->type == P_TEMP && dst->hw == -1) 206 return dst; 207 208 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 209 if (!pc->r_temp[i]) { 210 r = MALLOC_STRUCT(nv50_reg); 211 ctor_reg(r, P_TEMP, -1, i); 212 pc->r_temp[i] = r; 213 return r; 214 } 215 } 216 217 
assert(0); 218 return NULL; 219} 220 221/* Assign the hw of the discarded temporary register src 222 * to the tgsi register dst and free src. 223 */ 224static void 225assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 226{ 227 assert(src->index == -1 && src->hw != -1); 228 229 if (dst->hw != -1) 230 pc->r_temp[dst->hw] = NULL; 231 pc->r_temp[src->hw] = dst; 232 dst->hw = src->hw; 233 234 FREE(src); 235} 236 237/* release the hardware resource held by r */ 238static void 239release_hw(struct nv50_pc *pc, struct nv50_reg *r) 240{ 241 assert(r->type == P_TEMP); 242 if (r->hw == -1) 243 return; 244 245 assert(pc->r_temp[r->hw] == r); 246 pc->r_temp[r->hw] = NULL; 247 248 r->acc = 0; 249 if (r->index == -1) 250 FREE(r); 251} 252 253static void 254free_temp(struct nv50_pc *pc, struct nv50_reg *r) 255{ 256 if (r->index == -1) { 257 unsigned hw = r->hw; 258 259 FREE(pc->r_temp[hw]); 260 pc->r_temp[hw] = NULL; 261 } 262} 263 264static int 265alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 266{ 267 int i; 268 269 if ((idx + 4) >= NV50_SU_MAX_TEMP) 270 return 1; 271 272 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 273 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 274 return alloc_temp4(pc, dst, idx + 4); 275 276 for (i = 0; i < 4; i++) { 277 dst[i] = MALLOC_STRUCT(nv50_reg); 278 ctor_reg(dst[i], P_TEMP, -1, idx + i); 279 pc->r_temp[idx + i] = dst[i]; 280 } 281 282 return 0; 283} 284 285static void 286free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 287{ 288 int i; 289 290 for (i = 0; i < 4; i++) 291 free_temp(pc, reg[i]); 292} 293 294static struct nv50_reg * 295temp_temp(struct nv50_pc *pc) 296{ 297 if (pc->temp_temp_nr >= 16) 298 assert(0); 299 300 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 301 return pc->temp_temp[pc->temp_temp_nr++]; 302} 303 304static void 305kill_temp_temp(struct nv50_pc *pc) 306{ 307 int i; 308 309 for (i = 0; i < pc->temp_temp_nr; i++) 310 free_temp(pc, pc->temp_temp[i]); 311 
pc->temp_temp_nr = 0; 312} 313 314static int 315ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) 316{ 317 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)), 318 (pc->immd_nr + 1) * 4 * sizeof(float)); 319 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 320 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 321 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 322 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 323 324 return pc->immd_nr++; 325} 326 327static struct nv50_reg * 328alloc_immd(struct nv50_pc *pc, float f) 329{ 330 struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); 331 unsigned hw; 332 333 for (hw = 0; hw < pc->immd_nr * 4; hw++) 334 if (pc->immd_buf[hw] == f) 335 break; 336 337 if (hw == pc->immd_nr * 4) 338 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; 339 340 ctor_reg(r, P_IMMD, -1, hw); 341 return r; 342} 343 344static struct nv50_program_exec * 345exec(struct nv50_pc *pc) 346{ 347 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 348 349 e->param.index = -1; 350 return e; 351} 352 353static void 354emit(struct nv50_pc *pc, struct nv50_program_exec *e) 355{ 356 struct nv50_program *p = pc->p; 357 358 if (p->exec_tail) 359 p->exec_tail->next = e; 360 if (!p->exec_head) 361 p->exec_head = e; 362 p->exec_tail = e; 363 p->exec_size += (e->inst[0] & 1) ? 
2 : 1; 364} 365 366static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 367 368static boolean 369is_long(struct nv50_program_exec *e) 370{ 371 if (e->inst[0] & 1) 372 return TRUE; 373 return FALSE; 374} 375 376static boolean 377is_immd(struct nv50_program_exec *e) 378{ 379 if (is_long(e) && (e->inst[1] & 3) == 3) 380 return TRUE; 381 return FALSE; 382} 383 384static INLINE void 385set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 386 struct nv50_program_exec *e) 387{ 388 set_long(pc, e); 389 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 390 e->inst[1] |= (pred << 7) | (idx << 12); 391} 392 393static INLINE void 394set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 395 struct nv50_program_exec *e) 396{ 397 set_long(pc, e); 398 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 399 e->inst[1] |= (idx << 4) | (on << 6); 400} 401 402static INLINE void 403set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 404{ 405 if (is_long(e)) 406 return; 407 408 e->inst[0] |= 1; 409 set_pred(pc, 0xf, 0, e); 410 set_pred_wr(pc, 0, 0, e); 411} 412 413static INLINE void 414set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 415{ 416 if (dst->type == P_RESULT) { 417 set_long(pc, e); 418 e->inst[1] |= 0x00000008; 419 } 420 421 alloc_reg(pc, dst); 422 e->inst[0] |= (dst->hw << 2); 423} 424 425static INLINE void 426set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 427{ 428 float f = pc->immd_buf[imm->hw]; 429 unsigned val = fui(imm->neg ? -f : f); 430 431 set_long(pc, e); 432 /*XXX: can't be predicated - bits overlap.. catch cases where both 433 * are required and avoid them. 
*/ 434 set_pred(pc, 0, 0, e); 435 set_pred_wr(pc, 0, 0, e); 436 437 e->inst[1] |= 0x00000002 | 0x00000001; 438 e->inst[0] |= (val & 0x3f) << 16; 439 e->inst[1] |= (val >> 6) << 2; 440} 441 442 443#define INTERP_LINEAR 0 444#define INTERP_FLAT 1 445#define INTERP_PERSPECTIVE 2 446#define INTERP_CENTROID 4 447 448/* interpolant index has been stored in dst->rhw */ 449static void 450emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 451 unsigned mode) 452{ 453 assert(dst->rhw != -1); 454 struct nv50_program_exec *e = exec(pc); 455 456 e->inst[0] |= 0x80000000; 457 set_dst(pc, dst, e); 458 e->inst[0] |= (dst->rhw << 16); 459 460 if (mode & INTERP_FLAT) { 461 e->inst[0] |= (1 << 8); 462 } else { 463 if (mode & INTERP_PERSPECTIVE) { 464 e->inst[0] |= (1 << 25); 465 alloc_reg(pc, iv); 466 e->inst[0] |= (iv->hw << 9); 467 } 468 469 if (mode & INTERP_CENTROID) 470 e->inst[0] |= (1 << 24); 471 } 472 473 emit(pc, e); 474} 475 476static void 477set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 478 struct nv50_program_exec *e) 479{ 480 set_long(pc, e); 481 482 e->param.index = src->hw; 483 e->param.shift = s; 484 e->param.mask = m << (s % 32); 485 486 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 487} 488 489static void 490emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 491{ 492 struct nv50_program_exec *e = exec(pc); 493 494 e->inst[0] |= 0x10000000; 495 496 set_dst(pc, dst, e); 497 498 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { 499 set_immd(pc, src, e); 500 /*XXX: 32-bit, but steals part of "half" reg space - need to 501 * catch and handle this case if/when we do half-regs 502 */ 503 } else 504 if (src->type == P_IMMD || src->type == P_CONST) { 505 set_long(pc, e); 506 set_data(pc, src, 0x7f, 9, e); 507 e->inst[1] |= 0x20000000; /* src0 const? 
*/ 508 } else { 509 if (src->type == P_ATTR) { 510 set_long(pc, e); 511 e->inst[1] |= 0x00200000; 512 } 513 514 alloc_reg(pc, src); 515 e->inst[0] |= (src->hw << 9); 516 } 517 518 if (is_long(e) && !is_immd(e)) { 519 e->inst[1] |= 0x04000000; /* 32-bit */ 520 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */ 521 if (!(e->inst[1] & 0x20000000)) 522 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */ 523 } else 524 e->inst[0] |= 0x00008000; 525 526 emit(pc, e); 527} 528 529static INLINE void 530emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 531{ 532 struct nv50_reg *imm = alloc_immd(pc, f); 533 emit_mov(pc, dst, imm); 534 FREE(imm); 535} 536 537static boolean 538check_swap_src_0_1(struct nv50_pc *pc, 539 struct nv50_reg **s0, struct nv50_reg **s1) 540{ 541 struct nv50_reg *src0 = *s0, *src1 = *s1; 542 543 if (src0->type == P_CONST) { 544 if (src1->type != P_CONST) { 545 *s0 = src1; 546 *s1 = src0; 547 return TRUE; 548 } 549 } else 550 if (src1->type == P_ATTR) { 551 if (src0->type != P_ATTR) { 552 *s0 = src1; 553 *s1 = src0; 554 return TRUE; 555 } 556 } 557 558 return FALSE; 559} 560 561static void 562set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 563{ 564 if (src->type == P_ATTR) { 565 set_long(pc, e); 566 e->inst[1] |= 0x00200000; 567 } else 568 if (src->type == P_CONST || src->type == P_IMMD) { 569 struct nv50_reg *temp = temp_temp(pc); 570 571 emit_mov(pc, temp, src); 572 src = temp; 573 } 574 575 alloc_reg(pc, src); 576 e->inst[0] |= (src->hw << 9); 577} 578 579static void 580set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 581{ 582 if (src->type == P_ATTR) { 583 struct nv50_reg *temp = temp_temp(pc); 584 585 emit_mov(pc, temp, src); 586 src = temp; 587 } else 588 if (src->type == P_CONST || src->type == P_IMMD) { 589 assert(!(e->inst[0] & 0x00800000)); 590 if (e->inst[0] & 0x01000000) { 591 struct nv50_reg *temp = temp_temp(pc); 592 593 emit_mov(pc, temp, src); 594 src = temp; 595 } 
else { 596 set_data(pc, src, 0x7f, 16, e); 597 e->inst[0] |= 0x00800000; 598 } 599 } 600 601 alloc_reg(pc, src); 602 e->inst[0] |= (src->hw << 16); 603} 604 605static void 606set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 607{ 608 set_long(pc, e); 609 610 if (src->type == P_ATTR) { 611 struct nv50_reg *temp = temp_temp(pc); 612 613 emit_mov(pc, temp, src); 614 src = temp; 615 } else 616 if (src->type == P_CONST || src->type == P_IMMD) { 617 assert(!(e->inst[0] & 0x01000000)); 618 if (e->inst[0] & 0x00800000) { 619 struct nv50_reg *temp = temp_temp(pc); 620 621 emit_mov(pc, temp, src); 622 src = temp; 623 } else { 624 set_data(pc, src, 0x7f, 32+14, e); 625 e->inst[0] |= 0x01000000; 626 } 627 } 628 629 alloc_reg(pc, src); 630 e->inst[1] |= (src->hw << 14); 631} 632 633static void 634emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 635 struct nv50_reg *src1) 636{ 637 struct nv50_program_exec *e = exec(pc); 638 639 e->inst[0] |= 0xc0000000; 640 641 if (!pc->allow32) 642 set_long(pc, e); 643 644 check_swap_src_0_1(pc, &src0, &src1); 645 set_dst(pc, dst, e); 646 set_src_0(pc, src0, e); 647 if (src1->type == P_IMMD && !is_long(e)) { 648 if (src0->neg) 649 e->inst[0] |= 0x00008000; 650 set_immd(pc, src1, e); 651 } else { 652 set_src_1(pc, src1, e); 653 if (src0->neg ^ src1->neg) { 654 if (is_long(e)) 655 e->inst[1] |= 0x08000000; 656 else 657 e->inst[0] |= 0x00008000; 658 } 659 } 660 661 emit(pc, e); 662} 663 664static void 665emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 666 struct nv50_reg *src0, struct nv50_reg *src1) 667{ 668 struct nv50_program_exec *e = exec(pc); 669 670 e->inst[0] |= 0xb0000000; 671 672 check_swap_src_0_1(pc, &src0, &src1); 673 674 if (!pc->allow32 || src0->neg || src1->neg) { 675 set_long(pc, e); 676 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27); 677 } 678 679 set_dst(pc, dst, e); 680 set_src_0(pc, src0, e); 681 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 682 
set_src_2(pc, src1, e); 683 else 684 if (src1->type == P_IMMD) 685 set_immd(pc, src1, e); 686 else 687 set_src_1(pc, src1, e); 688 689 emit(pc, e); 690} 691 692static void 693emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 694 struct nv50_reg *src0, struct nv50_reg *src1) 695{ 696 struct nv50_program_exec *e = exec(pc); 697 698 set_long(pc, e); 699 e->inst[0] |= 0xb0000000; 700 e->inst[1] |= (sub << 29); 701 702 check_swap_src_0_1(pc, &src0, &src1); 703 set_dst(pc, dst, e); 704 set_src_0(pc, src0, e); 705 set_src_1(pc, src1, e); 706 707 emit(pc, e); 708} 709 710static INLINE void 711emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 712 struct nv50_reg *src1) 713{ 714 src1->neg ^= 1; 715 emit_add(pc, dst, src0, src1); 716 src1->neg ^= 1; 717} 718 719static void 720emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 721 struct nv50_reg *src1, struct nv50_reg *src2) 722{ 723 struct nv50_program_exec *e = exec(pc); 724 725 e->inst[0] |= 0xe0000000; 726 727 check_swap_src_0_1(pc, &src0, &src1); 728 set_dst(pc, dst, e); 729 set_src_0(pc, src0, e); 730 set_src_1(pc, src1, e); 731 set_src_2(pc, src2, e); 732 733 if (src0->neg ^ src1->neg) 734 e->inst[1] |= 0x04000000; 735 if (src2->neg) 736 e->inst[1] |= 0x08000000; 737 738 emit(pc, e); 739} 740 741static INLINE void 742emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 743 struct nv50_reg *src1, struct nv50_reg *src2) 744{ 745 src2->neg ^= 1; 746 emit_mad(pc, dst, src0, src1, src2); 747 src2->neg ^= 1; 748} 749 750static void 751emit_flop(struct nv50_pc *pc, unsigned sub, 752 struct nv50_reg *dst, struct nv50_reg *src) 753{ 754 struct nv50_program_exec *e = exec(pc); 755 756 e->inst[0] |= 0x90000000; 757 if (sub) { 758 set_long(pc, e); 759 e->inst[1] |= (sub << 29); 760 } 761 762 set_dst(pc, dst, e); 763 set_src_0(pc, src, e); 764 765 emit(pc, e); 766} 767 768static void 769emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct 
nv50_reg *src) 770{ 771 struct nv50_program_exec *e = exec(pc); 772 773 e->inst[0] |= 0xb0000000; 774 775 set_dst(pc, dst, e); 776 set_src_0(pc, src, e); 777 set_long(pc, e); 778 e->inst[1] |= (6 << 29) | 0x00004000; 779 780 emit(pc, e); 781} 782 783static void 784emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 785{ 786 struct nv50_program_exec *e = exec(pc); 787 788 e->inst[0] |= 0xb0000000; 789 790 set_dst(pc, dst, e); 791 set_src_0(pc, src, e); 792 set_long(pc, e); 793 e->inst[1] |= (6 << 29); 794 795 emit(pc, e); 796} 797 798#define CVTOP_RN 0x01 799#define CVTOP_FLOOR 0x03 800#define CVTOP_CEIL 0x05 801#define CVTOP_TRUNC 0x07 802#define CVTOP_SAT 0x08 803#define CVTOP_ABS 0x10 804 805/* 0x04 == 32 bit */ 806/* 0x40 == dst is float */ 807/* 0x80 == src is float */ 808#define CVT_F32_F32 0xc4 809#define CVT_F32_S32 0x44 810#define CVT_F32_U32 0x64 811#define CVT_S32_F32 0x8c 812#define CVT_S32_S32 0x0c 813#define CVT_F32_F32_ROP 0xcc 814 815static void 816emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 817 int wp, unsigned cvn, unsigned fmt) 818{ 819 struct nv50_program_exec *e; 820 821 e = exec(pc); 822 set_long(pc, e); 823 824 e->inst[0] |= 0xa0000000; 825 e->inst[1] |= 0x00004000; 826 e->inst[1] |= (cvn << 16); 827 e->inst[1] |= (fmt << 24); 828 set_src_0(pc, src, e); 829 830 if (wp >= 0) 831 set_pred_wr(pc, 1, wp, e); 832 833 if (dst) 834 set_dst(pc, dst, e); 835 else { 836 e->inst[0] |= 0x000001fc; 837 e->inst[1] |= 0x00000008; 838 } 839 840 emit(pc, e); 841} 842 843/* nv50 Condition codes: 844 * 0x1 = LT 845 * 0x2 = EQ 846 * 0x3 = LE 847 * 0x4 = GT 848 * 0x5 = NE 849 * 0x6 = GE 850 * 0x7 = set condition code ? 
(used before bra.lt/le/gt/ge) 851 * 0x8 = unordered bit (allows NaN) 852 */ 853static void 854emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, 855 struct nv50_reg *src0, struct nv50_reg *src1) 856{ 857 static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 858 859 struct nv50_program_exec *e = exec(pc); 860 struct nv50_reg *rdst; 861 862 assert(ccode < 16); 863 if (check_swap_src_0_1(pc, &src0, &src1)) 864 ccode = cc_swapped[ccode & 7] | (ccode & 8); 865 866 rdst = dst; 867 if (dst && dst->type != P_TEMP) 868 dst = alloc_temp(pc, NULL); 869 870 /* set.u32 */ 871 set_long(pc, e); 872 e->inst[0] |= 0xb0000000; 873 e->inst[1] |= 0x60000000 | (ccode << 14); 874 875 /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but 876 * that doesn't seem to match what the hw actually does 877 e->inst[1] |= 0x04000000; << breaks things, u32 by default ? 878 */ 879 880 if (wp >= 0) 881 set_pred_wr(pc, 1, wp, e); 882 if (dst) 883 set_dst(pc, dst, e); 884 else { 885 e->inst[0] |= 0x000001fc; 886 e->inst[1] |= 0x00000008; 887 } 888 889 set_src_0(pc, src0, e); 890 set_src_1(pc, src1, e); 891 892 emit(pc, e); 893 894 /* cvt.f32.u32/s32 (?) 
if we didn't only write the predicate */ 895 if (rdst) 896 emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32); 897 if (rdst && rdst != dst) 898 free_temp(pc, dst); 899} 900 901static INLINE unsigned 902map_tgsi_setop_cc(unsigned op) 903{ 904 switch (op) { 905 case TGSI_OPCODE_SLT: return 0x1; 906 case TGSI_OPCODE_SGE: return 0x6; 907 case TGSI_OPCODE_SEQ: return 0x2; 908 case TGSI_OPCODE_SGT: return 0x4; 909 case TGSI_OPCODE_SLE: return 0x3; 910 case TGSI_OPCODE_SNE: return 0xd; 911 default: 912 assert(0); 913 return 0; 914 } 915} 916 917static INLINE void 918emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 919{ 920 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); 921} 922 923static void 924emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 925 struct nv50_reg *v, struct nv50_reg *e) 926{ 927 struct nv50_reg *temp = alloc_temp(pc, NULL); 928 929 emit_flop(pc, 3, temp, v); 930 emit_mul(pc, temp, temp, e); 931 emit_preex2(pc, temp, temp); 932 emit_flop(pc, 6, dst, temp); 933 934 free_temp(pc, temp); 935} 936 937static INLINE void 938emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 939{ 940 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); 941} 942 943static INLINE void 944emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 945{ 946 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); 947} 948 949static void 950emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 951 struct nv50_reg **src) 952{ 953 struct nv50_reg *one = alloc_immd(pc, 1.0); 954 struct nv50_reg *zero = alloc_immd(pc, 0.0); 955 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 956 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 957 struct nv50_reg *tmp[4]; 958 boolean allow32 = pc->allow32; 959 960 pc->allow32 = FALSE; 961 962 if (mask & (3 << 1)) { 963 tmp[0] = alloc_temp(pc, NULL); 964 emit_minmax(pc, 4, tmp[0], src[0], zero); 965 } 966 967 if (mask & (1 << 2)) { 968 set_pred_wr(pc, 1, 
0, pc->p->exec_tail); 969 970 tmp[1] = temp_temp(pc); 971 emit_minmax(pc, 4, tmp[1], src[1], zero); 972 973 tmp[3] = temp_temp(pc); 974 emit_minmax(pc, 4, tmp[3], src[3], neg128); 975 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 976 977 emit_pow(pc, dst[2], tmp[1], tmp[3]); 978 emit_mov(pc, dst[2], zero); 979 set_pred(pc, 3, 0, pc->p->exec_tail); 980 } 981 982 if (mask & (1 << 1)) 983 assimilate_temp(pc, dst[1], tmp[0]); 984 else 985 if (mask & (1 << 2)) 986 free_temp(pc, tmp[0]); 987 988 pc->allow32 = allow32; 989 990 /* do this last, in case src[i,j] == dst[0,3] */ 991 if (mask & (1 << 0)) 992 emit_mov(pc, dst[0], one); 993 994 if (mask & (1 << 3)) 995 emit_mov(pc, dst[3], one); 996 997 FREE(pos128); 998 FREE(neg128); 999 FREE(zero); 1000 FREE(one); 1001} 1002 1003static void 1004emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 1005{ 1006 struct nv50_program_exec *e = exec(pc); 1007 1008 set_long(pc, e); 1009 e->inst[0] |= 0xa0000000; /* delta */ 1010 e->inst[1] |= (7 << 29); /* delta */ 1011 e->inst[1] |= 0x04000000; /* negate arg0? probably not */ 1012 e->inst[1] |= (1 << 14); /* src .f32 */ 1013 set_dst(pc, dst, e); 1014 set_src_0(pc, src, e); 1015 1016 emit(pc, e); 1017} 1018 1019static void 1020emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 1021{ 1022 struct nv50_program_exec *e; 1023 const int r_pred = 1; 1024 1025 /* Sets predicate reg ? */ 1026 e = exec(pc); 1027 e->inst[0] = 0xa00001fd; 1028 e->inst[1] = 0xc4014788; 1029 set_src_0(pc, src, e); 1030 set_pred_wr(pc, 1, r_pred, e); 1031 if (src->neg) 1032 e->inst[1] |= 0x20000000; 1033 emit(pc, e); 1034 1035 /* This is probably KILP */ 1036 e = exec(pc); 1037 e->inst[0] = 0x000001fe; 1038 set_long(pc, e); 1039 set_pred(pc, 1 /* LT? 
*/, r_pred, e); 1040 emit(pc, e); 1041} 1042 1043static void 1044emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1045 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) 1046{ 1047 struct nv50_reg *temp, *t[4]; 1048 struct nv50_program_exec *e; 1049 1050 unsigned c, mode, dim; 1051 1052 switch (type) { 1053 case TGSI_TEXTURE_1D: 1054 dim = 1; 1055 break; 1056 case TGSI_TEXTURE_UNKNOWN: 1057 case TGSI_TEXTURE_2D: 1058 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */ 1059 case TGSI_TEXTURE_RECT: 1060 dim = 2; 1061 break; 1062 case TGSI_TEXTURE_3D: 1063 case TGSI_TEXTURE_CUBE: 1064 case TGSI_TEXTURE_SHADOW2D: 1065 case TGSI_TEXTURE_SHADOWRECT: /* XXX */ 1066 dim = 3; 1067 break; 1068 default: 1069 assert(0); 1070 break; 1071 } 1072 1073 /* some cards need t[0]'s hw index to be a multiple of 4 */ 1074 alloc_temp4(pc, t, 0); 1075 1076 if (proj) { 1077 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1078 mode = pc->interp_mode[src[0]->index]; 1079 1080 t[3]->rhw = src[3]->rhw; 1081 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1082 emit_flop(pc, 0, t[3], t[3]); 1083 1084 for (c = 0; c < dim; c++) { 1085 t[c]->rhw = src[c]->rhw; 1086 emit_interp(pc, t[c], t[3], 1087 (mode | INTERP_PERSPECTIVE)); 1088 } 1089 } else { 1090 emit_flop(pc, 0, t[3], src[3]); 1091 for (c = 0; c < dim; c++) 1092 emit_mul(pc, t[c], src[c], t[3]); 1093 1094 /* XXX: for some reason the blob sometimes uses MAD: 1095 * emit_mad(pc, t[c], src[0][c], t[3], t[3]) 1096 * pc->p->exec_tail->inst[1] |= 0x080fc000; 1097 */ 1098 } 1099 } else { 1100 if (type == TGSI_TEXTURE_CUBE) { 1101 temp = temp_temp(pc); 1102 emit_minmax(pc, 4, temp, src[0], src[1]); 1103 emit_minmax(pc, 4, temp, temp, src[2]); 1104 emit_flop(pc, 0, temp, temp); 1105 for (c = 0; c < 3; c++) 1106 emit_mul(pc, t[c], src[c], temp); 1107 } else { 1108 for (c = 0; c < dim; c++) 1109 emit_mov(pc, t[c], src[c]); 1110 } 1111 } 1112 1113 e = exec(pc); 1114 set_long(pc, e); 1115 e->inst[0] |= 0xf0000000; 1116 
e->inst[1] |= 0x00000004; 1117 set_dst(pc, t[0], e); 1118 e->inst[0] |= (unit << 9); 1119 1120 if (dim == 2) 1121 e->inst[0] |= 0x00400000; 1122 else 1123 if (dim == 3) 1124 e->inst[0] |= 0x00800000; 1125 1126 e->inst[0] |= (mask & 0x3) << 25; 1127 e->inst[1] |= (mask & 0xc) << 12; 1128 1129 emit(pc, e); 1130 1131#if 1 1132 if (mask & 1) emit_mov(pc, dst[0], t[0]); 1133 if (mask & 2) emit_mov(pc, dst[1], t[1]); 1134 if (mask & 4) emit_mov(pc, dst[2], t[2]); 1135 if (mask & 8) emit_mov(pc, dst[3], t[3]); 1136 1137 free_temp4(pc, t); 1138#else 1139 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1140 * the texture coordinates, not the fetched values: latency ? */ 1141 1142 for (c = 0; c < 4; c++) { 1143 if (mask & (1 << c)) 1144 assimilate_temp(pc, dst[c], t[c]); 1145 else 1146 free_temp(pc, t[c]); 1147 } 1148#endif 1149} 1150 1151static void 1152convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) 1153{ 1154 unsigned q = 0, m = ~0; 1155 1156 assert(!is_long(e)); 1157 1158 switch (e->inst[0] >> 28) { 1159 case 0x1: 1160 /* MOV */ 1161 q = 0x0403c000; 1162 m = 0xffff7fff; 1163 break; 1164 case 0x8: 1165 /* INTERP (move centroid, perspective and flat bits) */ 1166 m = ~0x03000100; 1167 q = (e->inst[0] & (3 << 24)) >> (24 - 16); 1168 q |= (e->inst[0] & (1 << 8)) << (18 - 8); 1169 break; 1170 case 0x9: 1171 /* RCP */ 1172 break; 1173 case 0xB: 1174 /* ADD */ 1175 m = ~(127 << 16); 1176 q = ((e->inst[0] & (~m)) >> 2); 1177 break; 1178 case 0xC: 1179 /* MUL */ 1180 m = ~0x00008000; 1181 q = ((e->inst[0] & (~m)) << 12); 1182 break; 1183 case 0xE: 1184 /* MAD (if src2 == dst) */ 1185 q = ((e->inst[0] & 0x1fc) << 12); 1186 break; 1187 default: 1188 assert(0); 1189 break; 1190 } 1191 1192 set_long(pc, e); 1193 pc->p->exec_size++; 1194 1195 e->inst[0] &= m; 1196 e->inst[1] |= q; 1197} 1198 1199static boolean 1200negate_supported(const struct tgsi_full_instruction *insn, int i) 1201{ 1202 switch (insn->Instruction.Opcode) { 1203 case 
TGSI_OPCODE_DP3: 1204 case TGSI_OPCODE_DP4: 1205 case TGSI_OPCODE_MUL: 1206 case TGSI_OPCODE_KIL: 1207 case TGSI_OPCODE_ADD: 1208 case TGSI_OPCODE_SUB: 1209 case TGSI_OPCODE_MAD: 1210 return TRUE; 1211 case TGSI_OPCODE_POW: 1212 return (i == 1) ? TRUE : FALSE; 1213 default: 1214 return FALSE; 1215 } 1216} 1217 1218/* Return a read mask for source registers deduced from opcode & write mask. */ 1219static unsigned 1220nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1221{ 1222 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; 1223 1224 switch (insn->Instruction.Opcode) { 1225 case TGSI_OPCODE_COS: 1226 case TGSI_OPCODE_SIN: 1227 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1228 case TGSI_OPCODE_DP3: 1229 return 0x7; 1230 case TGSI_OPCODE_DP4: 1231 case TGSI_OPCODE_DPH: 1232 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1233 return 0xf; 1234 case TGSI_OPCODE_DST: 1235 return mask & (c ? 0xa : 0x6); 1236 case TGSI_OPCODE_EX2: 1237 case TGSI_OPCODE_LG2: 1238 case TGSI_OPCODE_POW: 1239 case TGSI_OPCODE_RCP: 1240 case TGSI_OPCODE_RSQ: 1241 case TGSI_OPCODE_SCS: 1242 return 0x1; 1243 case TGSI_OPCODE_LIT: 1244 return 0xb; 1245 case TGSI_OPCODE_TEX: 1246 case TGSI_OPCODE_TXP: 1247 { 1248 const struct tgsi_instruction_ext_texture *tex; 1249 1250 assert(insn->Instruction.Extended); 1251 tex = &insn->InstructionExtTexture; 1252 1253 mask = 0x7; 1254 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) 1255 mask |= 0x8; 1256 1257 switch (tex->Texture) { 1258 case TGSI_TEXTURE_1D: 1259 mask &= 0x9; 1260 break; 1261 case TGSI_TEXTURE_2D: 1262 mask &= 0xb; 1263 break; 1264 default: 1265 break; 1266 } 1267 } 1268 return mask; 1269 case TGSI_OPCODE_XPD: 1270 x = 0; 1271 if (mask & 1) x |= 0x6; 1272 if (mask & 2) x |= 0x5; 1273 if (mask & 4) x |= 0x3; 1274 return x; 1275 default: 1276 break; 1277 } 1278 1279 return mask; 1280} 1281 1282static struct nv50_reg * 1283tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) 1284{ 
1285 switch (dst->DstRegister.File) { 1286 case TGSI_FILE_TEMPORARY: 1287 return &pc->temp[dst->DstRegister.Index * 4 + c]; 1288 case TGSI_FILE_OUTPUT: 1289 return &pc->result[dst->DstRegister.Index * 4 + c]; 1290 case TGSI_FILE_NULL: 1291 return NULL; 1292 default: 1293 break; 1294 } 1295 1296 return NULL; 1297} 1298 1299static struct nv50_reg * 1300tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, 1301 boolean neg) 1302{ 1303 struct nv50_reg *r = NULL; 1304 struct nv50_reg *temp; 1305 unsigned sgn, c; 1306 1307 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); 1308 1309 c = tgsi_util_get_full_src_register_extswizzle(src, chan); 1310 switch (c) { 1311 case TGSI_EXTSWIZZLE_X: 1312 case TGSI_EXTSWIZZLE_Y: 1313 case TGSI_EXTSWIZZLE_Z: 1314 case TGSI_EXTSWIZZLE_W: 1315 switch (src->SrcRegister.File) { 1316 case TGSI_FILE_INPUT: 1317 r = &pc->attr[src->SrcRegister.Index * 4 + c]; 1318 break; 1319 case TGSI_FILE_TEMPORARY: 1320 r = &pc->temp[src->SrcRegister.Index * 4 + c]; 1321 break; 1322 case TGSI_FILE_CONSTANT: 1323 r = &pc->param[src->SrcRegister.Index * 4 + c]; 1324 break; 1325 case TGSI_FILE_IMMEDIATE: 1326 r = &pc->immd[src->SrcRegister.Index * 4 + c]; 1327 break; 1328 case TGSI_FILE_SAMPLER: 1329 break; 1330 default: 1331 assert(0); 1332 break; 1333 } 1334 break; 1335 case TGSI_EXTSWIZZLE_ZERO: 1336 r = alloc_immd(pc, 0.0); 1337 return r; 1338 case TGSI_EXTSWIZZLE_ONE: 1339 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET) 1340 return alloc_immd(pc, -1.0); 1341 return alloc_immd(pc, 1.0); 1342 default: 1343 assert(0); 1344 break; 1345 } 1346 1347 switch (sgn) { 1348 case TGSI_UTIL_SIGN_KEEP: 1349 break; 1350 case TGSI_UTIL_SIGN_CLEAR: 1351 temp = temp_temp(pc); 1352 emit_abs(pc, temp, r); 1353 r = temp; 1354 break; 1355 case TGSI_UTIL_SIGN_TOGGLE: 1356 if (neg) 1357 r->neg = 1; 1358 else { 1359 temp = temp_temp(pc); 1360 emit_neg(pc, temp, r); 1361 r = temp; 1362 } 1363 break; 1364 case TGSI_UTIL_SIGN_SET: 
1365 temp = temp_temp(pc); 1366 emit_abs(pc, temp, r); 1367 if (neg) 1368 temp->neg = 1; 1369 else 1370 emit_neg(pc, temp, temp); 1371 r = temp; 1372 break; 1373 default: 1374 assert(0); 1375 break; 1376 } 1377 1378 return r; 1379} 1380 1381/* return TRUE for ops that produce only a single result */ 1382static boolean 1383is_scalar_op(unsigned op) 1384{ 1385 switch (op) { 1386 case TGSI_OPCODE_COS: 1387 case TGSI_OPCODE_DP2: 1388 case TGSI_OPCODE_DP3: 1389 case TGSI_OPCODE_DP4: 1390 case TGSI_OPCODE_DPH: 1391 case TGSI_OPCODE_EX2: 1392 case TGSI_OPCODE_LG2: 1393 case TGSI_OPCODE_POW: 1394 case TGSI_OPCODE_RCP: 1395 case TGSI_OPCODE_RSQ: 1396 case TGSI_OPCODE_SIN: 1397 /* 1398 case TGSI_OPCODE_KIL: 1399 case TGSI_OPCODE_LIT: 1400 case TGSI_OPCODE_SCS: 1401 */ 1402 return TRUE; 1403 default: 1404 return FALSE; 1405 } 1406} 1407 1408/* Returns a bitmask indicating which dst components depend 1409 * on source s, component c (reverse of nv50_tgsi_src_mask). 1410 */ 1411static unsigned 1412nv50_tgsi_dst_revdep(unsigned op, int s, int c) 1413{ 1414 if (is_scalar_op(op)) 1415 return 0x1; 1416 1417 switch (op) { 1418 case TGSI_OPCODE_DST: 1419 return (1 << c) & (s ? 
0xa : 0x6); 1420 case TGSI_OPCODE_XPD: 1421 switch (c) { 1422 case 0: return 0x6; 1423 case 1: return 0x5; 1424 case 2: return 0x3; 1425 case 3: return 0x0; 1426 default: 1427 assert(0); 1428 return 0x0; 1429 } 1430 case TGSI_OPCODE_LIT: 1431 case TGSI_OPCODE_SCS: 1432 case TGSI_OPCODE_TEX: 1433 case TGSI_OPCODE_TXP: 1434 /* these take care of dangerous swizzles themselves */ 1435 return 0x0; 1436 case TGSI_OPCODE_IF: 1437 case TGSI_OPCODE_KIL: 1438 /* don't call this function for these ops */ 1439 assert(0); 1440 return 0; 1441 default: 1442 /* linear vector instruction */ 1443 return (1 << c); 1444 } 1445} 1446 1447static boolean 1448nv50_program_tx_insn(struct nv50_pc *pc, 1449 const struct tgsi_full_instruction *inst) 1450{ 1451 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 1452 unsigned mask, sat, unit; 1453 int i, c; 1454 1455 mask = inst->FullDstRegisters[0].DstRegister.WriteMask; 1456 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 1457 1458 memset(src, 0, sizeof(src)); 1459 1460 for (c = 0; c < 4; c++) { 1461 if ((mask & (1 << c)) && !pc->r_dst[c]) 1462 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); 1463 else 1464 dst[c] = pc->r_dst[c]; 1465 rdst[c] = dst[c]; 1466 } 1467 1468 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1469 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; 1470 unsigned src_mask; 1471 boolean neg_supp; 1472 1473 src_mask = nv50_tgsi_src_mask(inst, i); 1474 neg_supp = negate_supported(inst, i); 1475 1476 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) 1477 unit = fs->SrcRegister.Index; 1478 1479 for (c = 0; c < 4; c++) 1480 if (src_mask & (1 << c)) 1481 src[i][c] = tgsi_src(pc, c, fs, neg_supp); 1482 } 1483 1484 brdc = temp = pc->r_brdc; 1485 if (brdc && brdc->type != P_TEMP) { 1486 temp = temp_temp(pc); 1487 if (sat) 1488 brdc = temp; 1489 } else 1490 if (sat) { 1491 for (c = 0; c < 4; c++) { 1492 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 1493 continue; 1494 rdst[c] = dst[c]; 
1495 dst[c] = temp_temp(pc); 1496 } 1497 } 1498 1499 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 1500 1501 switch (inst->Instruction.Opcode) { 1502 case TGSI_OPCODE_ABS: 1503 for (c = 0; c < 4; c++) { 1504 if (!(mask & (1 << c))) 1505 continue; 1506 emit_abs(pc, dst[c], src[0][c]); 1507 } 1508 break; 1509 case TGSI_OPCODE_ADD: 1510 for (c = 0; c < 4; c++) { 1511 if (!(mask & (1 << c))) 1512 continue; 1513 emit_add(pc, dst[c], src[0][c], src[1][c]); 1514 } 1515 break; 1516 case TGSI_OPCODE_CEIL: 1517 for (c = 0; c < 4; c++) { 1518 if (!(mask & (1 << c))) 1519 continue; 1520 emit_cvt(pc, dst[c], src[0][c], -1, 1521 CVTOP_CEIL, CVT_F32_F32); 1522 } 1523 break; 1524 case TGSI_OPCODE_COS: 1525 if (mask & 8) { 1526 emit_precossin(pc, temp, src[0][3]); 1527 emit_flop(pc, 5, dst[3], temp); 1528 if (!(mask &= 7)) 1529 break; 1530 if (temp == dst[3]) 1531 temp = brdc = temp_temp(pc); 1532 } 1533 emit_precossin(pc, temp, src[0][0]); 1534 emit_flop(pc, 5, brdc, temp); 1535 break; 1536 case TGSI_OPCODE_DP3: 1537 emit_mul(pc, temp, src[0][0], src[1][0]); 1538 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1539 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 1540 break; 1541 case TGSI_OPCODE_DP4: 1542 emit_mul(pc, temp, src[0][0], src[1][0]); 1543 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1544 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1545 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 1546 break; 1547 case TGSI_OPCODE_DPH: 1548 emit_mul(pc, temp, src[0][0], src[1][0]); 1549 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1550 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1551 emit_add(pc, brdc, src[1][3], temp); 1552 break; 1553 case TGSI_OPCODE_DST: 1554 if (mask & (1 << 1)) 1555 emit_mul(pc, dst[1], src[0][1], src[1][1]); 1556 if (mask & (1 << 2)) 1557 emit_mov(pc, dst[2], src[0][2]); 1558 if (mask & (1 << 3)) 1559 emit_mov(pc, dst[3], src[1][3]); 1560 if (mask & (1 << 0)) 1561 emit_mov_immdval(pc, dst[0], 1.0f); 1562 break; 1563 case 
TGSI_OPCODE_EX2: 1564 emit_preex2(pc, temp, src[0][0]); 1565 emit_flop(pc, 6, brdc, temp); 1566 break; 1567 case TGSI_OPCODE_FLR: 1568 for (c = 0; c < 4; c++) { 1569 if (!(mask & (1 << c))) 1570 continue; 1571 emit_flr(pc, dst[c], src[0][c]); 1572 } 1573 break; 1574 case TGSI_OPCODE_FRC: 1575 temp = temp_temp(pc); 1576 for (c = 0; c < 4; c++) { 1577 if (!(mask & (1 << c))) 1578 continue; 1579 emit_flr(pc, temp, src[0][c]); 1580 emit_sub(pc, dst[c], src[0][c], temp); 1581 } 1582 break; 1583 case TGSI_OPCODE_KIL: 1584 emit_kil(pc, src[0][0]); 1585 emit_kil(pc, src[0][1]); 1586 emit_kil(pc, src[0][2]); 1587 emit_kil(pc, src[0][3]); 1588 break; 1589 case TGSI_OPCODE_LIT: 1590 emit_lit(pc, &dst[0], mask, &src[0][0]); 1591 break; 1592 case TGSI_OPCODE_LG2: 1593 emit_flop(pc, 3, brdc, src[0][0]); 1594 break; 1595 case TGSI_OPCODE_LRP: 1596 temp = temp_temp(pc); 1597 for (c = 0; c < 4; c++) { 1598 if (!(mask & (1 << c))) 1599 continue; 1600 emit_sub(pc, temp, src[1][c], src[2][c]); 1601 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 1602 } 1603 break; 1604 case TGSI_OPCODE_MAD: 1605 for (c = 0; c < 4; c++) { 1606 if (!(mask & (1 << c))) 1607 continue; 1608 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 1609 } 1610 break; 1611 case TGSI_OPCODE_MAX: 1612 for (c = 0; c < 4; c++) { 1613 if (!(mask & (1 << c))) 1614 continue; 1615 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 1616 } 1617 break; 1618 case TGSI_OPCODE_MIN: 1619 for (c = 0; c < 4; c++) { 1620 if (!(mask & (1 << c))) 1621 continue; 1622 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 1623 } 1624 break; 1625 case TGSI_OPCODE_MOV: 1626 case TGSI_OPCODE_SWZ: 1627 for (c = 0; c < 4; c++) { 1628 if (!(mask & (1 << c))) 1629 continue; 1630 emit_mov(pc, dst[c], src[0][c]); 1631 } 1632 break; 1633 case TGSI_OPCODE_MUL: 1634 for (c = 0; c < 4; c++) { 1635 if (!(mask & (1 << c))) 1636 continue; 1637 emit_mul(pc, dst[c], src[0][c], src[1][c]); 1638 } 1639 break; 1640 case TGSI_OPCODE_POW: 1641 emit_pow(pc, 
brdc, src[0][0], src[1][0]); 1642 break; 1643 case TGSI_OPCODE_RCP: 1644 emit_flop(pc, 0, brdc, src[0][0]); 1645 break; 1646 case TGSI_OPCODE_RSQ: 1647 emit_flop(pc, 2, brdc, src[0][0]); 1648 break; 1649 case TGSI_OPCODE_SCS: 1650 temp = temp_temp(pc); 1651 if (mask & 3) 1652 emit_precossin(pc, temp, src[0][0]); 1653 if (mask & (1 << 0)) 1654 emit_flop(pc, 5, dst[0], temp); 1655 if (mask & (1 << 1)) 1656 emit_flop(pc, 4, dst[1], temp); 1657 if (mask & (1 << 2)) 1658 emit_mov_immdval(pc, dst[2], 0.0); 1659 if (mask & (1 << 3)) 1660 emit_mov_immdval(pc, dst[3], 1.0); 1661 break; 1662 case TGSI_OPCODE_SIN: 1663 if (mask & 8) { 1664 emit_precossin(pc, temp, src[0][3]); 1665 emit_flop(pc, 4, dst[3], temp); 1666 if (!(mask &= 7)) 1667 break; 1668 if (temp == dst[3]) 1669 temp = brdc = temp_temp(pc); 1670 } 1671 emit_precossin(pc, temp, src[0][0]); 1672 emit_flop(pc, 4, brdc, temp); 1673 break; 1674 case TGSI_OPCODE_SLT: 1675 case TGSI_OPCODE_SGE: 1676 case TGSI_OPCODE_SEQ: 1677 case TGSI_OPCODE_SGT: 1678 case TGSI_OPCODE_SLE: 1679 case TGSI_OPCODE_SNE: 1680 i = map_tgsi_setop_cc(inst->Instruction.Opcode); 1681 for (c = 0; c < 4; c++) { 1682 if (!(mask & (1 << c))) 1683 continue; 1684 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]); 1685 } 1686 break; 1687 case TGSI_OPCODE_SUB: 1688 for (c = 0; c < 4; c++) { 1689 if (!(mask & (1 << c))) 1690 continue; 1691 emit_sub(pc, dst[c], src[0][c], src[1][c]); 1692 } 1693 break; 1694 case TGSI_OPCODE_TEX: 1695 emit_tex(pc, dst, mask, src[0], unit, 1696 inst->InstructionExtTexture.Texture, FALSE); 1697 break; 1698 case TGSI_OPCODE_TXP: 1699 emit_tex(pc, dst, mask, src[0], unit, 1700 inst->InstructionExtTexture.Texture, TRUE); 1701 break; 1702 case TGSI_OPCODE_TRUNC: 1703 for (c = 0; c < 4; c++) { 1704 if (!(mask & (1 << c))) 1705 continue; 1706 emit_cvt(pc, dst[c], src[0][c], -1, 1707 CVTOP_TRUNC, CVT_F32_F32); 1708 } 1709 break; 1710 case TGSI_OPCODE_XPD: 1711 temp = temp_temp(pc); 1712 if (mask & (1 << 0)) { 1713 emit_mul(pc, 
temp, src[0][2], src[1][1]); 1714 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 1715 } 1716 if (mask & (1 << 1)) { 1717 emit_mul(pc, temp, src[0][0], src[1][2]); 1718 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 1719 } 1720 if (mask & (1 << 2)) { 1721 emit_mul(pc, temp, src[0][1], src[1][0]); 1722 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 1723 } 1724 if (mask & (1 << 3)) 1725 emit_mov_immdval(pc, dst[3], 1.0); 1726 break; 1727 case TGSI_OPCODE_END: 1728 break; 1729 default: 1730 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 1731 return FALSE; 1732 } 1733 1734 if (brdc) { 1735 if (sat) 1736 emit_sat(pc, brdc, brdc); 1737 for (c = 0; c < 4; c++) 1738 if ((mask & (1 << c)) && dst[c] != brdc) 1739 emit_mov(pc, dst[c], brdc); 1740 } else 1741 if (sat) { 1742 for (c = 0; c < 4; c++) { 1743 if (!(mask & (1 << c))) 1744 continue; 1745 /* in this case we saturate later */ 1746 if (dst[c]->type == P_TEMP && dst[c]->index < 0) 1747 continue; 1748 emit_sat(pc, rdst[c], dst[c]); 1749 } 1750 } 1751 1752 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1753 for (c = 0; c < 4; c++) { 1754 if (!src[i][c]) 1755 continue; 1756 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) 1757 FREE(src[i][c]); 1758 } 1759 } 1760 1761 kill_temp_temp(pc); 1762 return TRUE; 1763} 1764 1765static void 1766prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) 1767{ 1768 struct nv50_reg *reg = NULL; 1769 const struct tgsi_full_src_register *src; 1770 const struct tgsi_dst_register *dst; 1771 unsigned i, c, k, mask; 1772 1773 dst = &insn->FullDstRegisters[0].DstRegister; 1774 mask = dst->WriteMask; 1775 1776 if (dst->File == TGSI_FILE_TEMPORARY) 1777 reg = pc->temp; 1778 else 1779 if (dst->File == TGSI_FILE_OUTPUT) 1780 reg = pc->result; 1781 1782 if (reg) { 1783 for (c = 0; c < 4; c++) { 1784 if (!(mask & (1 << c))) 1785 continue; 1786 reg[dst->Index * 4 + c].acc = pc->insn_nr; 1787 } 1788 } 1789 1790 for (i = 0; i < 
insn->Instruction.NumSrcRegs; i++) { 1791 src = &insn->FullSrcRegisters[i]; 1792 1793 if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) 1794 reg = pc->temp; 1795 else 1796 if (src->SrcRegister.File == TGSI_FILE_INPUT) 1797 reg = pc->attr; 1798 else 1799 continue; 1800 1801 mask = nv50_tgsi_src_mask(insn, i); 1802 1803 for (c = 0; c < 4; c++) { 1804 if (!(mask & (1 << c))) 1805 continue; 1806 k = tgsi_util_get_full_src_register_extswizzle(src, c); 1807 1808 if (k > TGSI_EXTSWIZZLE_W) 1809 continue; 1810 1811 reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr; 1812 } 1813 } 1814} 1815 1816/* Returns a bitmask indicating which dst components need to be 1817 * written to temporaries first to avoid 'corrupting' sources. 1818 * 1819 * m[i] (out) indicate component to write in the i-th position 1820 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source 1821 */ 1822static unsigned 1823nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) 1824{ 1825 unsigned i, c, x, unsafe; 1826 1827 for (c = 0; c < 4; c++) 1828 m[c] = c; 1829 1830 /* Swap as long as a dst component written earlier is depended on 1831 * by one written later, but the next one isn't depended on by it. 
1832 */ 1833 for (c = 0; c < 3; c++) { 1834 if (rdep[m[c + 1]] & (1 << m[c])) 1835 continue; /* if next one is depended on by us */ 1836 for (i = c + 1; i < 4; i++) 1837 /* if we are depended on by a later one */ 1838 if (rdep[m[c]] & (1 << m[i])) 1839 break; 1840 if (i == 4) 1841 continue; 1842 /* now, swap */ 1843 x = m[c]; 1844 m[c] = m[c + 1]; 1845 m[c + 1] = x; 1846 1847 /* restart */ 1848 c = 0; 1849 } 1850 1851 /* mark dependencies that could not be resolved by reordering */ 1852 for (i = 0; i < 3; ++i) 1853 for (c = i + 1; c < 4; ++c) 1854 if (rdep[m[i]] & (1 << m[c])) 1855 unsafe |= (1 << i); 1856 1857 /* NOTE: $unsafe is with respect to order, not component */ 1858 return unsafe; 1859} 1860 1861/* Select a suitable dst register for broadcasting scalar results, 1862 * or return NULL if we have to allocate an extra TEMP. 1863 * 1864 * If e.g. only 1 component is written, we may also emit the final 1865 * result to a write-only register. 1866 */ 1867static struct nv50_reg * 1868tgsi_broadcast_dst(struct nv50_pc *pc, 1869 const struct tgsi_full_dst_register *fd, unsigned mask) 1870{ 1871 if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) { 1872 int c = ffs(~mask & fd->DstRegister.WriteMask); 1873 if (c) 1874 return tgsi_dst(pc, c - 1, fd); 1875 } else { 1876 int c = ffs(fd->DstRegister.WriteMask) - 1; 1877 if ((1 << c) == fd->DstRegister.WriteMask) 1878 return tgsi_dst(pc, c, fd); 1879 } 1880 1881 return NULL; 1882} 1883 1884/* Scan source swizzles and return a bitmask indicating dst regs that 1885 * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 
1886 */ 1887static unsigned 1888nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, 1889 unsigned rdep[4]) 1890{ 1891 const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0]; 1892 const struct tgsi_full_src_register *fs; 1893 unsigned i, deqs = 0; 1894 1895 for (i = 0; i < 4; ++i) 1896 rdep[i] = 0; 1897 1898 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 1899 unsigned chn, mask = nv50_tgsi_src_mask(insn, i); 1900 boolean neg_supp = negate_supported(insn, i); 1901 1902 fs = &insn->FullSrcRegisters[i]; 1903 if (fs->SrcRegister.File != fd->DstRegister.File || 1904 fs->SrcRegister.Index != fd->DstRegister.Index) 1905 continue; 1906 1907 for (chn = 0; chn < 4; ++chn) { 1908 unsigned s, c; 1909 1910 if (!(mask & (1 << chn))) /* src is not read */ 1911 continue; 1912 c = tgsi_util_get_full_src_register_extswizzle(fs, chn); 1913 s = tgsi_util_get_full_src_register_sign_mode(fs, chn); 1914 1915 if (c > TGSI_EXTSWIZZLE_W || 1916 !(fd->DstRegister.WriteMask & (1 << c))) 1917 continue; 1918 1919 /* no danger if src is copied to TEMP first */ 1920 if ((s != TGSI_UTIL_SIGN_KEEP) && 1921 (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp)) 1922 continue; 1923 1924 rdep[c] |= nv50_tgsi_dst_revdep( 1925 insn->Instruction.Opcode, i, chn); 1926 deqs |= (1 << c); 1927 } 1928 } 1929 1930 return deqs; 1931} 1932 1933static boolean 1934nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) 1935{ 1936 struct tgsi_full_instruction insn = tok->FullInstruction; 1937 const struct tgsi_full_dst_register *fd; 1938 unsigned i, deqs, rdep[4], m[4]; 1939 1940 fd = &tok->FullInstruction.FullDstRegisters[0]; 1941 deqs = nv50_tgsi_scan_swizzle(&insn, rdep); 1942 1943 if (is_scalar_op(insn.Instruction.Opcode)) { 1944 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); 1945 if (!pc->r_brdc) 1946 pc->r_brdc = temp_temp(pc); 1947 return nv50_program_tx_insn(pc, &insn); 1948 } 1949 pc->r_brdc = NULL; 1950 1951 if (!deqs) 1952 return nv50_program_tx_insn(pc, &insn); 1953 1954 
deqs = nv50_revdep_reorder(m, rdep); 1955 1956 for (i = 0; i < 4; ++i) { 1957 assert(pc->r_dst[m[i]] == NULL); 1958 1959 insn.FullDstRegisters[0].DstRegister.WriteMask = 1960 fd->DstRegister.WriteMask & (1 << m[i]); 1961 1962 if (!insn.FullDstRegisters[0].DstRegister.WriteMask) 1963 continue; 1964 1965 if (deqs & (1 << i)) 1966 pc->r_dst[m[i]] = alloc_temp(pc, NULL); 1967 1968 if (!nv50_program_tx_insn(pc, &insn)) 1969 return FALSE; 1970 } 1971 1972 for (i = 0; i < 4; i++) { 1973 struct nv50_reg *reg = pc->r_dst[i]; 1974 if (!reg) 1975 continue; 1976 pc->r_dst[i] = NULL; 1977 1978 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) 1979 emit_sat(pc, tgsi_dst(pc, i, fd), reg); 1980 else 1981 emit_mov(pc, tgsi_dst(pc, i, fd), reg); 1982 free_temp(pc, reg); 1983 } 1984 1985 return TRUE; 1986} 1987 1988static void 1989load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) 1990{ 1991 struct nv50_reg *iv, **ppiv; 1992 unsigned mode = pc->interp_mode[reg->index]; 1993 1994 ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p; 1995 iv = *ppiv; 1996 1997 if ((mode & INTERP_PERSPECTIVE) && !iv) { 1998 iv = *ppiv = alloc_temp(pc, NULL); 1999 iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; 2000 2001 emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); 2002 emit_flop(pc, 0, iv, iv); 2003 2004 /* XXX: when loading interpolants dynamically, move these 2005 * to the program head, or make sure it can't be skipped. 
2006 */ 2007 } 2008 2009 emit_interp(pc, reg, iv, mode); 2010} 2011 2012static boolean 2013nv50_program_tx_prep(struct nv50_pc *pc) 2014{ 2015 struct tgsi_parse_context tp; 2016 struct nv50_program *p = pc->p; 2017 boolean ret = FALSE; 2018 unsigned i, c, flat_nr = 0; 2019 2020 tgsi_parse_init(&tp, pc->p->pipe.tokens); 2021 while (!tgsi_parse_end_of_tokens(&tp)) { 2022 const union tgsi_full_token *tok = &tp.FullToken; 2023 2024 tgsi_parse_token(&tp); 2025 switch (tok->Token.Type) { 2026 case TGSI_TOKEN_TYPE_IMMEDIATE: 2027 { 2028 const struct tgsi_full_immediate *imm = 2029 &tp.FullToken.FullImmediate; 2030 2031 ctor_immd(pc, imm->u[0].Float, 2032 imm->u[1].Float, 2033 imm->u[2].Float, 2034 imm->u[3].Float); 2035 } 2036 break; 2037 case TGSI_TOKEN_TYPE_DECLARATION: 2038 { 2039 const struct tgsi_full_declaration *d; 2040 unsigned si, last, first, mode; 2041 2042 d = &tp.FullToken.FullDeclaration; 2043 first = d->DeclarationRange.First; 2044 last = d->DeclarationRange.Last; 2045 2046 switch (d->Declaration.File) { 2047 case TGSI_FILE_TEMPORARY: 2048 break; 2049 case TGSI_FILE_OUTPUT: 2050 if (!d->Declaration.Semantic || 2051 p->type == PIPE_SHADER_FRAGMENT) 2052 break; 2053 2054 si = d->Semantic.SemanticIndex; 2055 switch (d->Semantic.SemanticName) { 2056 case TGSI_SEMANTIC_BCOLOR: 2057 p->cfg.two_side[si].hw = first; 2058 if (p->cfg.io_nr > first) 2059 p->cfg.io_nr = first; 2060 break; 2061 case TGSI_SEMANTIC_PSIZE: 2062 p->cfg.psiz = first; 2063 if (p->cfg.io_nr > first) 2064 p->cfg.io_nr = first; 2065 break; 2066 /* 2067 case TGSI_SEMANTIC_CLIP_DISTANCE: 2068 p->cfg.clpd = MIN2(p->cfg.clpd, first); 2069 break; 2070 */ 2071 default: 2072 break; 2073 } 2074 break; 2075 case TGSI_FILE_INPUT: 2076 { 2077 if (p->type != PIPE_SHADER_FRAGMENT) 2078 break; 2079 2080 switch (d->Declaration.Interpolate) { 2081 case TGSI_INTERPOLATE_CONSTANT: 2082 mode = INTERP_FLAT; 2083 flat_nr++; 2084 break; 2085 case TGSI_INTERPOLATE_PERSPECTIVE: 2086 mode = INTERP_PERSPECTIVE; 2087 
p->cfg.regs[1] |= 0x08 << 24; 2088 break; 2089 default: 2090 mode = INTERP_LINEAR; 2091 break; 2092 } 2093 if (d->Declaration.Centroid) 2094 mode |= INTERP_CENTROID; 2095 2096 assert(last < 32); 2097 for (i = first; i <= last; i++) 2098 pc->interp_mode[i] = mode; 2099 } 2100 break; 2101 case TGSI_FILE_CONSTANT: 2102 break; 2103 case TGSI_FILE_SAMPLER: 2104 break; 2105 default: 2106 NOUVEAU_ERR("bad decl file %d\n", 2107 d->Declaration.File); 2108 goto out_err; 2109 } 2110 } 2111 break; 2112 case TGSI_TOKEN_TYPE_INSTRUCTION: 2113 pc->insn_nr++; 2114 prep_inspect_insn(pc, &tok->FullInstruction); 2115 break; 2116 default: 2117 break; 2118 } 2119 } 2120 2121 if (p->type == PIPE_SHADER_VERTEX) { 2122 int rid = 0; 2123 2124 for (i = 0; i < pc->attr_nr * 4; ++i) { 2125 if (pc->attr[i].acc) { 2126 pc->attr[i].hw = rid++; 2127 p->cfg.attr[i / 32] |= 1 << (i % 32); 2128 } 2129 } 2130 2131 for (i = 0, rid = 0; i < pc->result_nr; ++i) { 2132 p->cfg.io[i].hw = rid; 2133 p->cfg.io[i].id_vp = i; 2134 2135 for (c = 0; c < 4; ++c) { 2136 int n = i * 4 + c; 2137 if (!pc->result[n].acc) 2138 continue; 2139 pc->result[n].hw = rid++; 2140 p->cfg.io[i].mask |= 1 << c; 2141 } 2142 } 2143 2144 for (c = 0; c < 2; ++c) 2145 if (p->cfg.two_side[c].hw < 0x40) 2146 p->cfg.two_side[c] = p->cfg.io[ 2147 p->cfg.two_side[c].hw]; 2148 2149 if (p->cfg.psiz < 0x40) 2150 p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw; 2151 } else 2152 if (p->type == PIPE_SHADER_FRAGMENT) { 2153 int rid, aid; 2154 unsigned n = 0, m = pc->attr_nr - flat_nr; 2155 2156 int base = (TGSI_SEMANTIC_POSITION == 2157 p->info.input_semantic_name[0]) ? 
0 : 1; 2158 2159 /* non-flat interpolants have to be mapped to 2160 * the lower hardware IDs, so sort them: 2161 */ 2162 for (i = 0; i < pc->attr_nr; i++) { 2163 if (pc->interp_mode[i] == INTERP_FLAT) { 2164 p->cfg.io[m].id_vp = i + base; 2165 p->cfg.io[m++].id_fp = i; 2166 } else { 2167 if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) 2168 p->cfg.io[n].linear = TRUE; 2169 p->cfg.io[n].id_vp = i + base; 2170 p->cfg.io[n++].id_fp = i; 2171 } 2172 } 2173 2174 if (!base) /* set w-coordinate mask from perspective interp */ 2175 p->cfg.io[0].mask |= p->cfg.regs[1] >> 24; 2176 2177 aid = popcnt4( /* if fcrd isn't contained in cfg.io */ 2178 base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask); 2179 2180 for (n = 0; n < pc->attr_nr; ++n) { 2181 p->cfg.io[n].hw = rid = aid; 2182 i = p->cfg.io[n].id_fp; 2183 2184 for (c = 0; c < 4; ++c) { 2185 if (!pc->attr[i * 4 + c].acc) 2186 continue; 2187 pc->attr[i * 4 + c].rhw = rid++; 2188 p->cfg.io[n].mask |= 1 << c; 2189 2190 load_interpolant(pc, &pc->attr[i * 4 + c]); 2191 } 2192 aid += popcnt4(p->cfg.io[n].mask); 2193 } 2194 2195 if (!base) 2196 p->cfg.regs[1] |= p->cfg.io[0].mask << 24; 2197 2198 m = popcnt4(p->cfg.regs[1] >> 24); 2199 2200 /* set count of non-position inputs and of non-flat 2201 * non-position inputs for FP_INTERPOLANT_CTRL 2202 */ 2203 p->cfg.regs[1] |= aid - m; 2204 2205 if (flat_nr) { 2206 i = p->cfg.io[pc->attr_nr - flat_nr].hw; 2207 p->cfg.regs[1] |= (i - m) << 16; 2208 } else 2209 p->cfg.regs[1] |= p->cfg.regs[1] << 16; 2210 2211 /* mark color semantic for light-twoside */ 2212 n = 0x40; 2213 for (i = 0; i < pc->attr_nr; i++) { 2214 ubyte si, sn; 2215 2216 sn = p->info.input_semantic_name[p->cfg.io[i].id_fp]; 2217 si = p->info.input_semantic_index[p->cfg.io[i].id_fp]; 2218 2219 if (sn == TGSI_SEMANTIC_COLOR) { 2220 p->cfg.two_side[si] = p->cfg.io[i]; 2221 2222 /* increase colour count */ 2223 p->cfg.regs[0] += popcnt4( 2224 p->cfg.two_side[si].mask) << 16; 2225 2226 n = MIN2(n, p->cfg.io[i].hw - m); 2227 } 
2228 } 2229 if (n < 0x40) 2230 p->cfg.regs[0] += n; 2231 2232 /* Initialize FP results: 2233 * FragDepth is always first TGSI and last hw output 2234 */ 2235 i = p->info.writes_z ? 4 : 0; 2236 for (rid = 0; i < pc->result_nr * 4; i++) 2237 pc->result[i].rhw = rid++; 2238 if (p->info.writes_z) 2239 pc->result[2].rhw = rid; 2240 } 2241 2242 if (pc->immd_nr) { 2243 int rid = 0; 2244 2245 pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); 2246 if (!pc->immd) 2247 goto out_err; 2248 2249 for (i = 0; i < pc->immd_nr; i++) { 2250 for (c = 0; c < 4; c++, rid++) 2251 ctor_reg(&pc->immd[rid], P_IMMD, i, rid); 2252 } 2253 } 2254 2255 ret = TRUE; 2256out_err: 2257 if (pc->iv_p) 2258 free_temp(pc, pc->iv_p); 2259 if (pc->iv_c) 2260 free_temp(pc, pc->iv_c); 2261 2262 tgsi_parse_free(&tp); 2263 return ret; 2264} 2265 2266static void 2267free_nv50_pc(struct nv50_pc *pc) 2268{ 2269 if (pc->immd) 2270 FREE(pc->immd); 2271 if (pc->param) 2272 FREE(pc->param); 2273 if (pc->result) 2274 FREE(pc->result); 2275 if (pc->attr) 2276 FREE(pc->attr); 2277 if (pc->temp) 2278 FREE(pc->temp); 2279 2280 FREE(pc); 2281} 2282 2283static boolean 2284ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) 2285{ 2286 int i, c; 2287 unsigned rtype[2] = { P_ATTR, P_RESULT }; 2288 2289 pc->p = p; 2290 pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; 2291 pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; 2292 pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; 2293 pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; 2294 2295 p->cfg.high_temp = 4; 2296 2297 p->cfg.two_side[0].hw = 0x40; 2298 p->cfg.two_side[1].hw = 0x40; 2299 2300 switch (p->type) { 2301 case PIPE_SHADER_VERTEX: 2302 p->cfg.psiz = 0x40; 2303 p->cfg.clpd = 0x40; 2304 p->cfg.io_nr = pc->result_nr; 2305 break; 2306 case PIPE_SHADER_FRAGMENT: 2307 rtype[0] = rtype[1] = P_TEMP; 2308 2309 p->cfg.regs[0] = 0x01000004; 2310 p->cfg.io_nr = pc->attr_nr; 2311 2312 if (p->info.writes_z) { 2313 p->cfg.regs[2] |= 
0x00000100; 2314 p->cfg.regs[3] |= 0x00000011; 2315 } 2316 if (p->info.uses_kill) 2317 p->cfg.regs[2] |= 0x00100000; 2318 break; 2319 } 2320 2321 if (pc->temp_nr) { 2322 pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); 2323 if (!pc->temp) 2324 return FALSE; 2325 2326 for (i = 0; i < pc->temp_nr * 4; ++i) 2327 ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); 2328 } 2329 2330 if (pc->attr_nr) { 2331 pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); 2332 if (!pc->attr) 2333 return FALSE; 2334 2335 for (i = 0; i < pc->attr_nr * 4; ++i) 2336 ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); 2337 } 2338 2339 if (pc->result_nr) { 2340 unsigned nr = pc->result_nr * 4; 2341 2342 pc->result = MALLOC(nr * sizeof(struct nv50_reg)); 2343 if (!pc->result) 2344 return FALSE; 2345 2346 for (i = 0; i < nr; ++i) 2347 ctor_reg(&pc->result[i], rtype[1], i / 4, -1); 2348 } 2349 2350 if (pc->param_nr) { 2351 int rid = 0; 2352 2353 pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); 2354 if (!pc->param) 2355 return FALSE; 2356 2357 for (i = 0; i < pc->param_nr; ++i) 2358 for (c = 0; c < 4; ++c, ++rid) 2359 ctor_reg(&pc->param[rid], P_CONST, i, rid); 2360 } 2361 2362 return TRUE; 2363} 2364 2365static boolean 2366nv50_program_tx(struct nv50_program *p) 2367{ 2368 struct tgsi_parse_context parse; 2369 struct nv50_pc *pc; 2370 unsigned k; 2371 boolean ret; 2372 2373 pc = CALLOC_STRUCT(nv50_pc); 2374 if (!pc) 2375 return FALSE; 2376 2377 ret = ctor_nv50_pc(pc, p); 2378 if (ret == FALSE) 2379 goto out_cleanup; 2380 2381 ret = nv50_program_tx_prep(pc); 2382 if (ret == FALSE) 2383 goto out_cleanup; 2384 2385 tgsi_parse_init(&parse, pc->p->pipe.tokens); 2386 while (!tgsi_parse_end_of_tokens(&parse)) { 2387 const union tgsi_full_token *tok = &parse.FullToken; 2388 2389 /* don't allow half insn/immd on first and last instruction */ 2390 pc->allow32 = TRUE; 2391 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) 2392 pc->allow32 = FALSE; 2393 2394 
tgsi_parse_token(&parse); 2395 2396 switch (tok->Token.Type) { 2397 case TGSI_TOKEN_TYPE_INSTRUCTION: 2398 ++pc->insn_cur; 2399 ret = nv50_tgsi_insn(pc, tok); 2400 if (ret == FALSE) 2401 goto out_err; 2402 break; 2403 default: 2404 break; 2405 } 2406 } 2407 2408 if (p->type == PIPE_SHADER_FRAGMENT) { 2409 struct nv50_reg out; 2410 ctor_reg(&out, P_TEMP, -1, -1); 2411 2412 for (k = 0; k < pc->result_nr * 4; k++) { 2413 if (pc->result[k].rhw == -1) 2414 continue; 2415 if (pc->result[k].hw != pc->result[k].rhw) { 2416 out.hw = pc->result[k].rhw; 2417 emit_mov(pc, &out, &pc->result[k]); 2418 } 2419 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1)) 2420 pc->p->cfg.high_result = pc->result[k].rhw + 1; 2421 } 2422 } 2423 2424 /* look for single half instructions and make them long */ 2425 struct nv50_program_exec *e, *e_prev; 2426 2427 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { 2428 if (!is_long(e)) 2429 k++; 2430 2431 if (!e->next || is_long(e->next)) { 2432 if (k & 1) 2433 convert_to_long(pc, e); 2434 k = 0; 2435 } 2436 2437 if (e->next) 2438 e_prev = e; 2439 } 2440 2441 if (!is_long(pc->p->exec_tail)) { 2442 /* this may occur if moving FP results */ 2443 assert(e_prev && !is_long(e_prev)); 2444 convert_to_long(pc, e_prev); 2445 convert_to_long(pc, pc->p->exec_tail); 2446 } 2447 2448 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); 2449 pc->p->exec_tail->inst[1] |= 0x00000001; 2450 2451 p->param_nr = pc->param_nr * 4; 2452 p->immd_nr = pc->immd_nr * 4; 2453 p->immd = pc->immd_buf; 2454 2455out_err: 2456 tgsi_parse_free(&parse); 2457 2458out_cleanup: 2459 free_nv50_pc(pc); 2460 return ret; 2461} 2462 2463static void 2464nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) 2465{ 2466 if (nv50_program_tx(p) == FALSE) 2467 assert(0); 2468 p->translated = TRUE; 2469} 2470 2471static void 2472nv50_program_upload_data(struct nv50_context *nv50, float *map, 2473 unsigned start, unsigned count, unsigned cbuf) 2474{ 
2475 struct nouveau_channel *chan = nv50->screen->base.channel; 2476 struct nouveau_grobj *tesla = nv50->screen->tesla; 2477 2478 while (count) { 2479 unsigned nr = count > 2047 ? 2047 : count; 2480 2481 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2482 OUT_RING (chan, (cbuf << 0) | (start << 8)); 2483 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2484 OUT_RINGp (chan, map, nr); 2485 2486 map += nr; 2487 start += nr; 2488 count -= nr; 2489 } 2490} 2491 2492static void 2493nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 2494{ 2495 struct pipe_screen *pscreen = nv50->pipe.screen; 2496 2497 if (!p->data[0] && p->immd_nr) { 2498 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 2499 2500 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 2501 while (heap->next && heap->size < p->immd_nr) { 2502 struct nv50_program *evict = heap->next->priv; 2503 nouveau_resource_free(&evict->data[0]); 2504 } 2505 2506 if (nouveau_resource_alloc(heap, p->immd_nr, p, 2507 &p->data[0])) 2508 assert(0); 2509 } 2510 2511 /* immediates only need to be uploaded again when freed */ 2512 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 2513 p->immd_nr, NV50_CB_PMISC); 2514 } 2515 2516 assert(p->param_nr <= 128); 2517 2518 if (p->param_nr) { 2519 unsigned cb; 2520 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 2521 PIPE_BUFFER_USAGE_CPU_READ); 2522 2523 if (p->type == PIPE_SHADER_VERTEX) 2524 cb = NV50_CB_PVP; 2525 else 2526 cb = NV50_CB_PFP; 2527 2528 nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); 2529 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 2530 } 2531} 2532 2533static void 2534nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 2535{ 2536 struct nouveau_channel *chan = nv50->screen->base.channel; 2537 struct nouveau_grobj *tesla = nv50->screen->tesla; 2538 struct nv50_program_exec *e; 2539 struct nouveau_stateobj *so; 2540 const unsigned flags = 
NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; 2541 unsigned start, count, *up, *ptr; 2542 boolean upload = FALSE; 2543 2544 if (!p->bo) { 2545 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 2546 p->exec_size * 4, &p->bo); 2547 upload = TRUE; 2548 } 2549 2550 if (p->data[0] && p->data[0]->start != p->data_start[0]) 2551 upload = TRUE; 2552 2553 if (!upload) 2554 return; 2555 2556 for (e = p->exec_head; e; e = e->next) { 2557 unsigned ei, ci, bs; 2558 2559 if (e->param.index < 0) 2560 continue; 2561 bs = (e->inst[1] >> 22) & 0x07; 2562 assert(bs < 2); 2563 ei = e->param.shift >> 5; 2564 ci = e->param.index; 2565 if (bs == 0) 2566 ci += p->data[bs]->start; 2567 2568 e->inst[ei] &= ~e->param.mask; 2569 e->inst[ei] |= (ci << e->param.shift); 2570 } 2571 2572 if (p->data[0]) 2573 p->data_start[0] = p->data[0]->start; 2574 2575#ifdef NV50_PROGRAM_DUMP 2576 NOUVEAU_ERR("-------\n"); 2577 for (e = p->exec_head; e; e = e->next) { 2578 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 2579 if (is_long(e)) 2580 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 2581 } 2582#endif 2583 2584 up = ptr = MALLOC(p->exec_size * 4); 2585 for (e = p->exec_head; e; e = e->next) { 2586 *(ptr++) = e->inst[0]; 2587 if (is_long(e)) 2588 *(ptr++) = e->inst[1]; 2589 } 2590 2591 so = so_new(4,2); 2592 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); 2593 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); 2594 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); 2595 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); 2596 2597 start = 0; count = p->exec_size; 2598 while (count) { 2599 struct nouveau_channel *chan = nv50->screen->base.channel; 2600 unsigned nr; 2601 2602 so_emit(chan, so); 2603 2604 nr = MIN2(count, 2047); 2605 nr = MIN2(chan->pushbuf->remaining, nr); 2606 if (chan->pushbuf->remaining < (nr + 3)) { 2607 FIRE_RING(chan); 2608 continue; 2609 } 2610 2611 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2612 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); 2613 
BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2614 OUT_RINGp (chan, up + start, nr); 2615 2616 start += nr; 2617 count -= nr; 2618 } 2619 2620 FREE(up); 2621 so_ref(NULL, &so); 2622} 2623 2624void 2625nv50_vertprog_validate(struct nv50_context *nv50) 2626{ 2627 struct nouveau_grobj *tesla = nv50->screen->tesla; 2628 struct nv50_program *p = nv50->vertprog; 2629 struct nouveau_stateobj *so; 2630 2631 if (!p->translated) { 2632 nv50_program_validate(nv50, p); 2633 if (!p->translated) 2634 assert(0); 2635 } 2636 2637 nv50_program_validate_data(nv50, p); 2638 nv50_program_validate_code(nv50, p); 2639 2640 so = so_new(13, 2); 2641 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 2642 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2643 NOUVEAU_BO_HIGH, 0, 0); 2644 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2645 NOUVEAU_BO_LOW, 0, 0); 2646 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 2647 so_data (so, p->cfg.attr[0]); 2648 so_data (so, p->cfg.attr[1]); 2649 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 2650 so_data (so, p->cfg.high_result); 2651 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 2652 so_data (so, p->cfg.high_result); //8); 2653 so_data (so, p->cfg.high_temp); 2654 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 2655 so_data (so, 0); /* program start offset */ 2656 so_ref(so, &nv50->state.vertprog); 2657 so_ref(NULL, &so); 2658} 2659 2660void 2661nv50_fragprog_validate(struct nv50_context *nv50) 2662{ 2663 struct nouveau_grobj *tesla = nv50->screen->tesla; 2664 struct nv50_program *p = nv50->fragprog; 2665 struct nouveau_stateobj *so; 2666 2667 if (!p->translated) { 2668 nv50_program_validate(nv50, p); 2669 if (!p->translated) 2670 assert(0); 2671 } 2672 2673 nv50_program_validate_data(nv50, p); 2674 nv50_program_validate_code(nv50, p); 2675 2676 so = so_new(64, 2); 2677 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 2678 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2679 NOUVEAU_BO_HIGH, 
0, 0); 2680 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2681 NOUVEAU_BO_LOW, 0, 0); 2682 so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); 2683 so_data (so, p->cfg.high_temp); 2684 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 2685 so_data (so, p->cfg.high_result); 2686 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1); 2687 so_data (so, p->cfg.regs[2]); 2688 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 2689 so_data (so, p->cfg.regs[3]); 2690 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 2691 so_data (so, 0); /* program start offset */ 2692 so_ref(so, &nv50->state.fragprog); 2693 so_ref(NULL, &so); 2694} 2695 2696static void 2697nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) 2698{ 2699 struct nv50_program *fp = nv50->fragprog; 2700 struct nv50_program *vp = nv50->vertprog; 2701 unsigned i, c, m = base; 2702 2703 /* XXX: This can't work correctly in all cases yet, we either 2704 * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has 2705 * to be per FP input instead of per VP output 2706 */ 2707 memset(pntc, 0, 8 * sizeof(uint32_t)); 2708 2709 for (i = 0; i < fp->cfg.io_nr; i++) { 2710 uint8_t sn, si; 2711 uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp; 2712 unsigned n = popcnt4(fp->cfg.io[i].mask); 2713 2714 if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { 2715 m += n; 2716 continue; 2717 } 2718 2719 sn = vp->info.input_semantic_name[j]; 2720 si = vp->info.input_semantic_index[j]; 2721 2722 if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) { 2723 ubyte mode = 2724 nv50->rasterizer->pipe.sprite_coord_mode[si]; 2725 2726 if (mode == PIPE_SPRITE_COORD_NONE) { 2727 m += n; 2728 continue; 2729 } 2730 } 2731 2732 /* this is either PointCoord or replaced by sprite coords */ 2733 for (c = 0; c < 4; c++) { 2734 if (!(fp->cfg.io[i].mask & (1 << c))) 2735 continue; 2736 pntc[m / 8] |= (c + 1) << ((m % 8) * 4); 2737 ++m; 2738 } 2739 } 2740} 2741 2742static int 2743nv50_sreg4_map(uint32_t 
*p_map, int mid, uint32_t lin[4], 2744 struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) 2745{ 2746 int c; 2747 uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; 2748 uint8_t *map = (uint8_t *)p_map; 2749 2750 for (c = 0; c < 4; ++c) { 2751 if (mf & 1) { 2752 if (fpi->linear == TRUE) 2753 lin[mid / 32] |= 1 << (mid % 32); 2754 map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40); 2755 } 2756 2757 oid += mv & 1; 2758 mf >>= 1; 2759 mv >>= 1; 2760 } 2761 2762 return mid; 2763} 2764 2765void 2766nv50_linkage_validate(struct nv50_context *nv50) 2767{ 2768 struct nouveau_grobj *tesla = nv50->screen->tesla; 2769 struct nv50_program *vp = nv50->vertprog; 2770 struct nv50_program *fp = nv50->fragprog; 2771 struct nouveau_stateobj *so; 2772 struct nv50_sreg4 dummy, *vpo; 2773 int i, n, c, m = 0; 2774 uint32_t map[16], lin[4], reg[5], pcrd[8]; 2775 2776 memset(map, 0, sizeof(map)); 2777 memset(lin, 0, sizeof(lin)); 2778 2779 reg[1] = 0x00000004; /* low and high clip distance map ids */ 2780 reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ 2781 reg[3] = 0x00000000; /* point size map id & enable */ 2782 reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ 2783 reg[4] = fp->cfg.regs[1]; /* interpolant info */ 2784 2785 dummy.linear = FALSE; 2786 dummy.mask = 0xf; /* map all components of HPOS */ 2787 m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); 2788 2789 dummy.mask = 0x0; 2790 2791 if (vp->cfg.clpd < 0x40) { 2792 for (c = 0; c < vp->cfg.clpd_nr; ++c) 2793 map[m++] = vp->cfg.clpd + c; 2794 reg[1] = (m << 8); 2795 } 2796 2797 reg[0] |= m << 8; /* adjust BFC0 id */ 2798 2799 /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ 2800 if (nv50->rasterizer->pipe.light_twoside) { 2801 vpo = &vp->cfg.two_side[0]; 2802 2803 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); 2804 m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); 2805 } 2806 2807 reg[0] += m - 4; /* adjust FFC0 id */ 2808 reg[4] |= m << 8; /* set 
mid where 'normal' FP inputs start */ 2809 2810 i = 0; 2811 if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) 2812 i = 1; 2813 for (; i < fp->cfg.io_nr; i++) { 2814 ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp]; 2815 ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp]; 2816 2817 n = fp->cfg.io[i].id_vp; 2818 if (n >= vp->cfg.io_nr || 2819 vp->info.output_semantic_name[n] != sn || 2820 vp->info.output_semantic_index[n] != si) 2821 vpo = &dummy; 2822 else 2823 vpo = &vp->cfg.io[n]; 2824 2825 m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); 2826 } 2827 2828 if (nv50->rasterizer->pipe.point_size_per_vertex) { 2829 map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); 2830 reg[3] = (m++ << 4) | 1; 2831 } 2832 2833 /* now fill the stateobj */ 2834 so = so_new(64, 0); 2835 2836 n = (m + 3) / 4; 2837 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); 2838 so_data (so, m); 2839 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); 2840 so_datap (so, map, n); 2841 2842 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 2843 so_datap (so, reg, 4); 2844 2845 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); 2846 so_data (so, reg[4]); 2847 2848 so_method(so, tesla, 0x1540, 4); 2849 so_datap (so, lin, 4); 2850 2851 if (nv50->rasterizer->pipe.point_sprite) { 2852 nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff); 2853 2854 so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); 2855 so_datap (so, pcrd, 8); 2856 } 2857 2858 so_ref(so, &nv50->state.programs); 2859 so_ref(NULL, &so); 2860} 2861 2862void 2863nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 2864{ 2865 while (p->exec_head) { 2866 struct nv50_program_exec *e = p->exec_head; 2867 2868 p->exec_head = e->next; 2869 FREE(e); 2870 } 2871 p->exec_tail = NULL; 2872 p->exec_size = 0; 2873 2874 nouveau_bo_ref(NULL, &p->bo); 2875 2876 nouveau_resource_free(&p->data[0]); 2877 2878 p->translated = 0; 2879} 2880