nv50_program.c revision 81de711fc864247419221d700bd045addf22cb52
1/* 2 * Copyright 2008 Ben Skeggs 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "pipe/p_context.h" 24#include "pipe/p_defines.h" 25#include "pipe/p_state.h" 26#include "pipe/p_inlines.h" 27 28#include "pipe/p_shader_tokens.h" 29#include "tgsi/tgsi_parse.h" 30#include "tgsi/tgsi_util.h" 31 32#include "nv50_context.h" 33 34#define NV50_SU_MAX_TEMP 64 35//#define NV50_PROGRAM_DUMP 36 37/* ARL - gallium craps itself on progs/vp/arl.txt 38 * 39 * MSB - Like MAD, but MUL+SUB 40 * - Fuck it off, introduce a way to negate args for ops that 41 * support it. 42 * 43 * Look into inlining IMMD for ops other than MOV (make it general?) 44 * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 45 * but can emit to P_TEMP first - then MOV later. NVIDIA does this 46 * 47 * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 48 * case, if the emit_src() causes the inst to suddenly become long. 49 * 50 * Verify half-insns work where expected - and force disable them where they 51 * don't work - MUL has it forcibly disabled atm as it fixes POW.. 52 * 53 * FUCK! watch dst==src vectors, can overwrite components that are needed. 54 * ie. SUB R0, R0.yzxw, R0 55 * 56 * Things to check with renouveau: 57 * FP attr/result assignment - how? 58 * attrib 59 * - 0x16bc maps vp output onto fp hpos 60 * - 0x16c0 maps vp output onto fp col0 61 * result 62 * - colr always 0-3 63 * - depr always 4 64 * 0x16bc->0x16e8 --> some binding between vp/fp regs 65 * 0x16b8 --> VP output count 66 * 67 * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 68 * "MOV rcol.x, fcol.y" = 0x00000004 69 * 0x19a8 --> as above but 0x00000100 and 0x00000000 70 * - 0x00100000 used when KIL used 71 * 0x196c --> as above but 0x00000011 and 0x00000000 72 * 73 * 0x1988 --> 0xXXNNNNNN 74 * - XX == FP high something 75 */ 76struct nv50_reg { 77 enum { 78 P_TEMP, 79 P_ATTR, 80 P_RESULT, 81 P_CONST, 82 P_IMMD 83 } type; 84 int index; 85 86 int hw; 87 int neg; 88 89 int rhw; /* result hw for FP outputs, or interpolant index */ 90 int acc; /* instruction where this reg is last read (first insn == 1) */ 91}; 92 93struct nv50_pc { 94 struct nv50_program *p; 95 96 /* hw resources */ 97 struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 98 99 /* tgsi resources */ 100 struct nv50_reg *temp; 101 int temp_nr; 102 struct nv50_reg *attr; 103 int attr_nr; 104 struct nv50_reg *result; 105 int result_nr; 106 struct nv50_reg *param; 107 int param_nr; 108 struct nv50_reg *immd; 109 float *immd_buf; 110 int immd_nr; 111 112 struct nv50_reg *temp_temp[16]; 113 unsigned temp_temp_nr; 114 115 /* broadcast and destination replacement regs */ 116 struct nv50_reg *r_brdc; 117 struct nv50_reg *r_dst[4]; 118 119 unsigned interp_mode[32]; 120 /* perspective interpolation registers */ 121 struct nv50_reg *iv_p; 122 struct nv50_reg *iv_c; 123 124 /* current instruction and total number of insns */ 125 unsigned insn_cur; 126 unsigned insn_nr; 127 128 boolean allow32; 129}; 130 131static void 132alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 133{ 134 int i = 0; 135 136 if (reg->type == P_RESULT) { 137 if (pc->p->cfg.high_result < (reg->hw + 1)) 138 pc->p->cfg.high_result = reg->hw + 1; 139 } 140 141 if (reg->type != P_TEMP) 142 return; 143 144 if (reg->hw >= 0) { 145 /*XXX: do this here too to catch FP temp-as-attr usage.. 146 * not clean, but works */ 147 if (pc->p->cfg.high_temp < (reg->hw + 1)) 148 pc->p->cfg.high_temp = reg->hw + 1; 149 return; 150 } 151 152 if (reg->rhw != -1) { 153 /* try to allocate temporary with index rhw first */ 154 if (!(pc->r_temp[reg->rhw])) { 155 pc->r_temp[reg->rhw] = reg; 156 reg->hw = reg->rhw; 157 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 158 pc->p->cfg.high_temp = reg->rhw + 1; 159 return; 160 } 161 /* make sure we don't get things like $r0 needs to go 162 * in $r1 and $r1 in $r0 163 */ 164 i = pc->result_nr * 4; 165 } 166 167 for (; i < NV50_SU_MAX_TEMP; i++) { 168 if (!(pc->r_temp[i])) { 169 pc->r_temp[i] = reg; 170 reg->hw = i; 171 if (pc->p->cfg.high_temp < (i + 1)) 172 pc->p->cfg.high_temp = i + 1; 173 return; 174 } 175 } 176 177 assert(0); 178} 179 180static struct nv50_reg * 181alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 182{ 183 struct nv50_reg *r; 184 int i; 185 186 if (dst && dst->type == P_TEMP && dst->hw == -1) 187 return dst; 188 189 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 190 if (!pc->r_temp[i]) { 191 r = CALLOC_STRUCT(nv50_reg); 192 r->type = P_TEMP; 193 r->index = -1; 194 r->hw = i; 195 r->rhw = -1; 196 pc->r_temp[i] = r; 197 return r; 198 } 199 } 200 201 assert(0); 202 return NULL; 203} 204 205/* Assign the hw of the discarded temporary register src 206 * to the tgsi register dst and free src. 207 */ 208static void 209assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 210{ 211 assert(src->index == -1 && src->hw != -1); 212 213 if (dst->hw != -1) 214 pc->r_temp[dst->hw] = NULL; 215 pc->r_temp[src->hw] = dst; 216 dst->hw = src->hw; 217 218 FREE(src); 219} 220 221/* release the hardware resource held by r */ 222static void 223release_hw(struct nv50_pc *pc, struct nv50_reg *r) 224{ 225 assert(r->type == P_TEMP); 226 if (r->hw == -1) 227 return; 228 229 assert(pc->r_temp[r->hw] == r); 230 pc->r_temp[r->hw] = NULL; 231 232 r->acc = 0; 233 if (r->index == -1) 234 FREE(r); 235} 236 237static void 238free_temp(struct nv50_pc *pc, struct nv50_reg *r) 239{ 240 if (r->index == -1) { 241 unsigned hw = r->hw; 242 243 FREE(pc->r_temp[hw]); 244 pc->r_temp[hw] = NULL; 245 } 246} 247 248static int 249alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 250{ 251 int i; 252 253 if ((idx + 4) >= NV50_SU_MAX_TEMP) 254 return 1; 255 256 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 257 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 258 return alloc_temp4(pc, dst, idx + 4); 259 260 for (i = 0; i < 4; i++) { 261 dst[i] = CALLOC_STRUCT(nv50_reg); 262 dst[i]->type = P_TEMP; 263 dst[i]->index = -1; 264 dst[i]->hw = idx + i; 265 pc->r_temp[idx + i] = dst[i]; 266 } 267 268 return 0; 269} 270 271static void 272free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 273{ 274 int i; 275 276 for (i = 0; i < 4; i++) 277 free_temp(pc, reg[i]); 278} 279 280static struct nv50_reg * 281temp_temp(struct nv50_pc *pc) 282{ 283 if (pc->temp_temp_nr >= 16) 284 assert(0); 285 286 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 287 return pc->temp_temp[pc->temp_temp_nr++]; 288} 289 290static void 291kill_temp_temp(struct nv50_pc *pc) 292{ 293 int i; 294 295 for (i = 0; i < pc->temp_temp_nr; i++) 296 free_temp(pc, pc->temp_temp[i]); 297 pc->temp_temp_nr = 0; 298} 299 300static int 301ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) 302{ 303 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)), 304 (pc->immd_nr + 1) * 4 * sizeof(float)); 305 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 306 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 307 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 308 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 309 310 return pc->immd_nr++; 311} 312 313static struct nv50_reg * 314alloc_immd(struct nv50_pc *pc, float f) 315{ 316 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); 317 unsigned hw; 318 319 for (hw = 0; hw < pc->immd_nr * 4; hw++) 320 if (pc->immd_buf[hw] == f) 321 break; 322 323 if (hw == pc->immd_nr * 4) 324 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; 325 326 r->type = P_IMMD; 327 r->hw = hw; 328 r->index = -1; 329 return r; 330} 331 332static struct nv50_program_exec * 333exec(struct nv50_pc *pc) 334{ 335 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 336 337 e->param.index = -1; 338 return e; 339} 340 341static void 342emit(struct nv50_pc *pc, struct nv50_program_exec *e) 343{ 344 struct nv50_program *p = pc->p; 345 346 if (p->exec_tail) 347 p->exec_tail->next = e; 348 if (!p->exec_head) 349 p->exec_head = e; 350 p->exec_tail = e; 351 p->exec_size += (e->inst[0] & 1) ? 2 : 1; 352} 353 354static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 355 356static boolean 357is_long(struct nv50_program_exec *e) 358{ 359 if (e->inst[0] & 1) 360 return TRUE; 361 return FALSE; 362} 363 364static boolean 365is_immd(struct nv50_program_exec *e) 366{ 367 if (is_long(e) && (e->inst[1] & 3) == 3) 368 return TRUE; 369 return FALSE; 370} 371 372static INLINE void 373set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 374 struct nv50_program_exec *e) 375{ 376 set_long(pc, e); 377 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 378 e->inst[1] |= (pred << 7) | (idx << 12); 379} 380 381static INLINE void 382set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 383 struct nv50_program_exec *e) 384{ 385 set_long(pc, e); 386 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 387 e->inst[1] |= (idx << 4) | (on << 6); 388} 389 390static INLINE void 391set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 392{ 393 if (is_long(e)) 394 return; 395 396 e->inst[0] |= 1; 397 set_pred(pc, 0xf, 0, e); 398 set_pred_wr(pc, 0, 0, e); 399} 400 401static INLINE void 402set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 403{ 404 if (dst->type == P_RESULT) { 405 set_long(pc, e); 406 e->inst[1] |= 0x00000008; 407 } 408 409 alloc_reg(pc, dst); 410 e->inst[0] |= (dst->hw << 2); 411} 412 413static INLINE void 414set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 415{ 416 float f = pc->immd_buf[imm->hw]; 417 unsigned val = fui(imm->neg ? -f : f); 418 419 set_long(pc, e); 420 /*XXX: can't be predicated - bits overlap.. catch cases where both 421 * are required and avoid them. */ 422 set_pred(pc, 0, 0, e); 423 set_pred_wr(pc, 0, 0, e); 424 425 e->inst[1] |= 0x00000002 | 0x00000001; 426 e->inst[0] |= (val & 0x3f) << 16; 427 e->inst[1] |= (val >> 6) << 2; 428} 429 430 431#define INTERP_LINEAR 0 432#define INTERP_FLAT 1 433#define INTERP_PERSPECTIVE 2 434#define INTERP_CENTROID 4 435 436/* interpolant index has been stored in dst->rhw */ 437static void 438emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 439 unsigned mode) 440{ 441 assert(dst->rhw != -1); 442 struct nv50_program_exec *e = exec(pc); 443 444 e->inst[0] |= 0x80000000; 445 set_dst(pc, dst, e); 446 e->inst[0] |= (dst->rhw << 16); 447 448 if (mode & INTERP_FLAT) { 449 e->inst[0] |= (1 << 8); 450 } else { 451 if (mode & INTERP_PERSPECTIVE) { 452 e->inst[0] |= (1 << 25); 453 alloc_reg(pc, iv); 454 e->inst[0] |= (iv->hw << 9); 455 } 456 457 if (mode & INTERP_CENTROID) 458 e->inst[0] |= (1 << 24); 459 } 460 461 emit(pc, e); 462} 463 464static void 465set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 466 struct nv50_program_exec *e) 467{ 468 set_long(pc, e); 469 470 e->param.index = src->hw; 471 e->param.shift = s; 472 e->param.mask = m << (s % 32); 473 474 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 475} 476 477static void 478emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 479{ 480 struct nv50_program_exec *e = exec(pc); 481 482 e->inst[0] |= 0x10000000; 483 484 set_dst(pc, dst, e); 485 486 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { 487 set_immd(pc, src, e); 488 /*XXX: 32-bit, but steals part of "half" reg space - need to 489 * catch and handle this case if/when we do half-regs 490 */ 491 } else 492 if (src->type == P_IMMD || src->type == P_CONST) { 493 set_long(pc, e); 494 set_data(pc, src, 0x7f, 9, e); 495 e->inst[1] |= 0x20000000; /* src0 const? */ 496 } else { 497 if (src->type == P_ATTR) { 498 set_long(pc, e); 499 e->inst[1] |= 0x00200000; 500 } 501 502 alloc_reg(pc, src); 503 e->inst[0] |= (src->hw << 9); 504 } 505 506 if (is_long(e) && !is_immd(e)) { 507 e->inst[1] |= 0x04000000; /* 32-bit */ 508 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */ 509 if (!(e->inst[1] & 0x20000000)) 510 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */ 511 } else 512 e->inst[0] |= 0x00008000; 513 514 emit(pc, e); 515} 516 517static INLINE void 518emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 519{ 520 struct nv50_reg *imm = alloc_immd(pc, f); 521 emit_mov(pc, dst, imm); 522 FREE(imm); 523} 524 525static boolean 526check_swap_src_0_1(struct nv50_pc *pc, 527 struct nv50_reg **s0, struct nv50_reg **s1) 528{ 529 struct nv50_reg *src0 = *s0, *src1 = *s1; 530 531 if (src0->type == P_CONST) { 532 if (src1->type != P_CONST) { 533 *s0 = src1; 534 *s1 = src0; 535 return TRUE; 536 } 537 } else 538 if (src1->type == P_ATTR) { 539 if (src0->type != P_ATTR) { 540 *s0 = src1; 541 *s1 = src0; 542 return TRUE; 543 } 544 } 545 546 return FALSE; 547} 548 549static void 550set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 551{ 552 if (src->type == P_ATTR) { 553 set_long(pc, e); 554 e->inst[1] |= 0x00200000; 555 } else 556 if (src->type == P_CONST || src->type == P_IMMD) { 557 struct nv50_reg *temp = temp_temp(pc); 558 559 emit_mov(pc, temp, src); 560 src = temp; 561 } 562 563 alloc_reg(pc, src); 564 e->inst[0] |= (src->hw << 9); 565} 566 567static void 568set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 569{ 570 if (src->type == P_ATTR) { 571 struct nv50_reg *temp = temp_temp(pc); 572 573 emit_mov(pc, temp, src); 574 src = temp; 575 } else 576 if (src->type == P_CONST || src->type == P_IMMD) { 577 assert(!(e->inst[0] & 0x00800000)); 578 if (e->inst[0] & 0x01000000) { 579 struct nv50_reg *temp = temp_temp(pc); 580 581 emit_mov(pc, temp, src); 582 src = temp; 583 } else { 584 set_data(pc, src, 0x7f, 16, e); 585 e->inst[0] |= 0x00800000; 586 } 587 } 588 589 alloc_reg(pc, src); 590 e->inst[0] |= (src->hw << 16); 591} 592 593static void 594set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 595{ 596 set_long(pc, e); 597 598 if (src->type == P_ATTR) { 599 struct nv50_reg *temp = temp_temp(pc); 600 601 emit_mov(pc, temp, src); 602 src = temp; 603 } else 604 if (src->type == P_CONST || src->type == P_IMMD) { 605 assert(!(e->inst[0] & 0x01000000)); 606 if (e->inst[0] & 0x00800000) { 607 struct nv50_reg *temp = temp_temp(pc); 608 609 emit_mov(pc, temp, src); 610 src = temp; 611 } else { 612 set_data(pc, src, 0x7f, 32+14, e); 613 e->inst[0] |= 0x01000000; 614 } 615 } 616 617 alloc_reg(pc, src); 618 e->inst[1] |= (src->hw << 14); 619} 620 621static void 622emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 623 struct nv50_reg *src1) 624{ 625 struct nv50_program_exec *e = exec(pc); 626 627 e->inst[0] |= 0xc0000000; 628 629 if (!pc->allow32) 630 set_long(pc, e); 631 632 check_swap_src_0_1(pc, &src0, &src1); 633 set_dst(pc, dst, e); 634 set_src_0(pc, src0, e); 635 if (src1->type == P_IMMD && !is_long(e)) { 636 if (src0->neg) 637 e->inst[0] |= 0x00008000; 638 set_immd(pc, src1, e); 639 } else { 640 set_src_1(pc, src1, e); 641 if (src0->neg ^ src1->neg) { 642 if (is_long(e)) 643 e->inst[1] |= 0x08000000; 644 else 645 e->inst[0] |= 0x00008000; 646 } 647 } 648 649 emit(pc, e); 650} 651 652static void 653emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 654 struct nv50_reg *src0, struct nv50_reg *src1) 655{ 656 struct nv50_program_exec *e = exec(pc); 657 658 e->inst[0] |= 0xb0000000; 659 660 check_swap_src_0_1(pc, &src0, &src1); 661 662 if (!pc->allow32 || src0->neg || src1->neg) { 663 set_long(pc, e); 664 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27); 665 } 666 667 set_dst(pc, dst, e); 668 set_src_0(pc, src0, e); 669 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 670 set_src_2(pc, src1, e); 671 else 672 if (src1->type == P_IMMD) 673 set_immd(pc, src1, e); 674 else 675 set_src_1(pc, src1, e); 676 677 emit(pc, e); 678} 679 680static void 681emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 682 struct nv50_reg *src0, struct nv50_reg *src1) 683{ 684 struct nv50_program_exec *e = exec(pc); 685 686 set_long(pc, e); 687 e->inst[0] |= 0xb0000000; 688 e->inst[1] |= (sub << 29); 689 690 check_swap_src_0_1(pc, &src0, &src1); 691 set_dst(pc, dst, e); 692 set_src_0(pc, src0, e); 693 set_src_1(pc, src1, e); 694 695 emit(pc, e); 696} 697 698static INLINE void 699emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 700 struct nv50_reg *src1) 701{ 702 src1->neg ^= 1; 703 emit_add(pc, dst, src0, src1); 704 src1->neg ^= 1; 705} 706 707static void 708emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 709 struct nv50_reg *src1, struct nv50_reg *src2) 710{ 711 struct nv50_program_exec *e = exec(pc); 712 713 e->inst[0] |= 0xe0000000; 714 715 check_swap_src_0_1(pc, &src0, &src1); 716 set_dst(pc, dst, e); 717 set_src_0(pc, src0, e); 718 set_src_1(pc, src1, e); 719 set_src_2(pc, src2, e); 720 721 if (src0->neg ^ src1->neg) 722 e->inst[1] |= 0x04000000; 723 if (src2->neg) 724 e->inst[1] |= 0x08000000; 725 726 emit(pc, e); 727} 728 729static INLINE void 730emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 731 struct nv50_reg *src1, struct nv50_reg *src2) 732{ 733 src2->neg ^= 1; 734 emit_mad(pc, dst, src0, src1, src2); 735 src2->neg ^= 1; 736} 737 738static void 739emit_flop(struct nv50_pc *pc, unsigned sub, 740 struct nv50_reg *dst, struct nv50_reg *src) 741{ 742 struct nv50_program_exec *e = exec(pc); 743 744 e->inst[0] |= 0x90000000; 745 if (sub) { 746 set_long(pc, e); 747 e->inst[1] |= (sub << 29); 748 } 749 750 set_dst(pc, dst, e); 751 set_src_0(pc, src, e); 752 753 emit(pc, e); 754} 755 756static void 757emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 758{ 759 struct nv50_program_exec *e = exec(pc); 760 761 e->inst[0] |= 0xb0000000; 762 763 set_dst(pc, dst, e); 764 set_src_0(pc, src, e); 765 set_long(pc, e); 766 e->inst[1] |= (6 << 29) | 0x00004000; 767 768 emit(pc, e); 769} 770 771static void 772emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 773{ 774 struct nv50_program_exec *e = exec(pc); 775 776 e->inst[0] |= 0xb0000000; 777 778 set_dst(pc, dst, e); 779 set_src_0(pc, src, e); 780 set_long(pc, e); 781 e->inst[1] |= (6 << 29); 782 783 emit(pc, e); 784} 785 786#define CVTOP_RN 0x01 787#define CVTOP_FLOOR 0x03 788#define CVTOP_CEIL 0x05 789#define CVTOP_TRUNC 0x07 790#define CVTOP_SAT 0x08 791#define CVTOP_ABS 0x10 792 793#define CVT_F32_F32 0xc4 794#define CVT_F32_S32 0x44 795#define CVT_F32_U32 0x64 796#define CVT_S32_F32 0x8c 797#define CVT_S32_S32 0x0c 798#define CVT_F32_F32_ROP 0xcc 799 800static void 801emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 802 int wp, unsigned cop, unsigned fmt) 803{ 804 struct nv50_program_exec *e; 805 806 e = exec(pc); 807 set_long(pc, e); 808 809 e->inst[0] |= 0xa0000000; 810 e->inst[1] |= 0x00004000; 811 e->inst[1] |= (cop << 16); 812 e->inst[1] |= (fmt << 24); 813 set_src_0(pc, src, e); 814 815 if (wp >= 0) 816 set_pred_wr(pc, 1, wp, e); 817 818 if (dst) 819 set_dst(pc, dst, e); 820 else { 821 e->inst[0] |= 0x000001fc; 822 e->inst[1] |= 0x00000008; 823 } 824 825 emit(pc, e); 826} 827 828static void 829emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst, 830 struct nv50_reg *src0, struct nv50_reg *src1) 831{ 832 struct nv50_program_exec *e = exec(pc); 833 unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; 834 struct nv50_reg *rdst; 835 836 assert(c_op <= 7); 837 if (check_swap_src_0_1(pc, &src0, &src1)) 838 c_op = inv_cop[c_op]; 839 840 rdst = dst; 841 if (dst->type != P_TEMP) 842 dst = alloc_temp(pc, NULL); 843 844 /* set.u32 */ 845 set_long(pc, e); 846 e->inst[0] |= 0xb0000000; 847 e->inst[1] |= (3 << 29); 848 e->inst[1] |= (c_op << 14); 849 /*XXX: breaks things, .u32 by default? 850 * decuda will disasm as .u16 and use .lo/.hi regs, but this 851 * doesn't seem to match what the hw actually does. 852 inst[1] |= 0x04000000; << breaks things.. .u32 by default? 853 */ 854 set_dst(pc, dst, e); 855 set_src_0(pc, src0, e); 856 set_src_1(pc, src1, e); 857 emit(pc, e); 858 859 /* cvt.f32.u32 */ 860 e = exec(pc); 861 e->inst[0] = 0xa0000001; 862 e->inst[1] = 0x64014780; 863 set_dst(pc, rdst, e); 864 set_src_0(pc, dst, e); 865 emit(pc, e); 866 867 if (dst != rdst) 868 free_temp(pc, dst); 869} 870 871static INLINE void 872emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 873{ 874 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); 875} 876 877static void 878emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 879 struct nv50_reg *v, struct nv50_reg *e) 880{ 881 struct nv50_reg *temp = alloc_temp(pc, NULL); 882 883 emit_flop(pc, 3, temp, v); 884 emit_mul(pc, temp, temp, e); 885 emit_preex2(pc, temp, temp); 886 emit_flop(pc, 6, dst, temp); 887 888 free_temp(pc, temp); 889} 890 891static INLINE void 892emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 893{ 894 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); 895} 896 897static INLINE void 898emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 899{ 900 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); 901} 902 903static void 904emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 905 struct nv50_reg **src) 906{ 907 struct nv50_reg *one = alloc_immd(pc, 1.0); 908 struct nv50_reg *zero = alloc_immd(pc, 0.0); 909 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 910 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 911 struct nv50_reg *tmp[4]; 912 boolean allow32 = pc->allow32; 913 914 pc->allow32 = FALSE; 915 916 if (mask & (3 << 1)) { 917 tmp[0] = alloc_temp(pc, NULL); 918 emit_minmax(pc, 4, tmp[0], src[0], zero); 919 } 920 921 if (mask & (1 << 2)) { 922 set_pred_wr(pc, 1, 0, pc->p->exec_tail); 923 924 tmp[1] = temp_temp(pc); 925 emit_minmax(pc, 4, tmp[1], src[1], zero); 926 927 tmp[3] = temp_temp(pc); 928 emit_minmax(pc, 4, tmp[3], src[3], neg128); 929 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 930 931 emit_pow(pc, dst[2], tmp[1], tmp[3]); 932 emit_mov(pc, dst[2], zero); 933 set_pred(pc, 3, 0, pc->p->exec_tail); 934 } 935 936 if (mask & (1 << 1)) 937 assimilate_temp(pc, dst[1], tmp[0]); 938 else 939 if (mask & (1 << 2)) 940 free_temp(pc, tmp[0]); 941 942 pc->allow32 = allow32; 943 944 /* do this last, in case src[i,j] == dst[0,3] */ 945 if (mask & (1 << 0)) 946 emit_mov(pc, dst[0], one); 947 948 if (mask & (1 << 3)) 949 emit_mov(pc, dst[3], one); 950 951 FREE(pos128); 952 FREE(neg128); 953 FREE(zero); 954 FREE(one); 955} 956 957static void 958emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 959{ 960 struct nv50_program_exec *e = exec(pc); 961 962 set_long(pc, e); 963 e->inst[0] |= 0xa0000000; /* delta */ 964 e->inst[1] |= (7 << 29); /* delta */ 965 e->inst[1] |= 0x04000000; /* negate arg0? probably not */ 966 e->inst[1] |= (1 << 14); /* src .f32 */ 967 set_dst(pc, dst, e); 968 set_src_0(pc, src, e); 969 970 emit(pc, e); 971} 972 973static void 974emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 975{ 976 struct nv50_program_exec *e; 977 const int r_pred = 1; 978 979 /* Sets predicate reg ? */ 980 e = exec(pc); 981 e->inst[0] = 0xa00001fd; 982 e->inst[1] = 0xc4014788; 983 set_src_0(pc, src, e); 984 set_pred_wr(pc, 1, r_pred, e); 985 if (src->neg) 986 e->inst[1] |= 0x20000000; 987 emit(pc, e); 988 989 /* This is probably KILP */ 990 e = exec(pc); 991 e->inst[0] = 0x000001fe; 992 set_long(pc, e); 993 set_pred(pc, 1 /* LT? */, r_pred, e); 994 emit(pc, e); 995} 996 997static void 998emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 999 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) 1000{ 1001 struct nv50_reg *temp, *t[4]; 1002 struct nv50_program_exec *e; 1003 1004 unsigned c, mode, dim; 1005 1006 switch (type) { 1007 case TGSI_TEXTURE_1D: 1008 dim = 1; 1009 break; 1010 case TGSI_TEXTURE_UNKNOWN: 1011 case TGSI_TEXTURE_2D: 1012 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */ 1013 case TGSI_TEXTURE_RECT: 1014 dim = 2; 1015 break; 1016 case TGSI_TEXTURE_3D: 1017 case TGSI_TEXTURE_CUBE: 1018 case TGSI_TEXTURE_SHADOW2D: 1019 case TGSI_TEXTURE_SHADOWRECT: /* XXX */ 1020 dim = 3; 1021 break; 1022 default: 1023 assert(0); 1024 break; 1025 } 1026 1027 /* some cards need t[0]'s hw index to be a multiple of 4 */ 1028 alloc_temp4(pc, t, 0); 1029 1030 if (proj) { 1031 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1032 mode = pc->interp_mode[src[0]->index]; 1033 1034 t[3]->rhw = src[3]->rhw; 1035 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1036 emit_flop(pc, 0, t[3], t[3]); 1037 1038 for (c = 0; c < dim; c++) { 1039 t[c]->rhw = src[c]->rhw; 1040 emit_interp(pc, t[c], t[3], 1041 (mode | INTERP_PERSPECTIVE)); 1042 } 1043 } else { 1044 emit_flop(pc, 0, t[3], src[3]); 1045 for (c = 0; c < dim; c++) 1046 emit_mul(pc, t[c], src[c], t[3]); 1047 1048 /* XXX: for some reason the blob sometimes uses MAD: 1049 * emit_mad(pc, t[c], src[0][c], t[3], t[3]) 1050 * pc->p->exec_tail->inst[1] |= 0x080fc000; 1051 */ 1052 } 1053 } else { 1054 if (type == TGSI_TEXTURE_CUBE) { 1055 temp = temp_temp(pc); 1056 emit_minmax(pc, 4, temp, src[0], src[1]); 1057 emit_minmax(pc, 4, temp, temp, src[2]); 1058 emit_flop(pc, 0, temp, temp); 1059 for (c = 0; c < 3; c++) 1060 emit_mul(pc, t[c], src[c], temp); 1061 } else { 1062 for (c = 0; c < dim; c++) 1063 emit_mov(pc, t[c], src[c]); 1064 } 1065 } 1066 1067 e = exec(pc); 1068 set_long(pc, e); 1069 e->inst[0] |= 0xf0000000; 1070 e->inst[1] |= 0x00000004; 1071 set_dst(pc, t[0], e); 1072 e->inst[0] |= (unit << 9); 1073 1074 if (dim == 2) 1075 e->inst[0] |= 0x00400000; 1076 else 1077 if (dim == 3) 1078 e->inst[0] |= 0x00800000; 1079 1080 e->inst[0] |= (mask & 0x3) << 25; 1081 e->inst[1] |= (mask & 0xc) << 12; 1082 1083 emit(pc, e); 1084 1085#if 1 1086 if (mask & 1) emit_mov(pc, dst[0], t[0]); 1087 if (mask & 2) emit_mov(pc, dst[1], t[1]); 1088 if (mask & 4) emit_mov(pc, dst[2], t[2]); 1089 if (mask & 8) emit_mov(pc, dst[3], t[3]); 1090 1091 free_temp4(pc, t); 1092#else 1093 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1094 * the texture coordinates, not the fetched values: latency ? */ 1095 1096 for (c = 0; c < 4; c++) { 1097 if (mask & (1 << c)) 1098 assimilate_temp(pc, dst[c], t[c]); 1099 else 1100 free_temp(pc, t[c]); 1101 } 1102#endif 1103} 1104 1105static void 1106convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) 1107{ 1108 unsigned q = 0, m = ~0; 1109 1110 assert(!is_long(e)); 1111 1112 switch (e->inst[0] >> 28) { 1113 case 0x1: 1114 /* MOV */ 1115 q = 0x0403c000; 1116 m = 0xffff7fff; 1117 break; 1118 case 0x8: 1119 /* INTERP (move centroid, perspective and flat bits) */ 1120 m = ~0x03000100; 1121 q = (e->inst[0] & (3 << 24)) >> (24 - 16); 1122 q |= (e->inst[0] & (1 << 8)) << (18 - 8); 1123 break; 1124 case 0x9: 1125 /* RCP */ 1126 break; 1127 case 0xB: 1128 /* ADD */ 1129 m = ~(127 << 16); 1130 q = ((e->inst[0] & (~m)) >> 2); 1131 break; 1132 case 0xC: 1133 /* MUL */ 1134 m = ~0x00008000; 1135 q = ((e->inst[0] & (~m)) << 12); 1136 break; 1137 case 0xE: 1138 /* MAD (if src2 == dst) */ 1139 q = ((e->inst[0] & 0x1fc) << 12); 1140 break; 1141 default: 1142 assert(0); 1143 break; 1144 } 1145 1146 set_long(pc, e); 1147 pc->p->exec_size++; 1148 1149 e->inst[0] &= m; 1150 e->inst[1] |= q; 1151} 1152 1153static boolean 1154negate_supported(const struct tgsi_full_instruction *insn, int i) 1155{ 1156 switch (insn->Instruction.Opcode) { 1157 case TGSI_OPCODE_DP3: 1158 case TGSI_OPCODE_DP4: 1159 case TGSI_OPCODE_MUL: 1160 case TGSI_OPCODE_KIL: 1161 case TGSI_OPCODE_ADD: 1162 case TGSI_OPCODE_SUB: 1163 case TGSI_OPCODE_MAD: 1164 return TRUE; 1165 case TGSI_OPCODE_POW: 1166 return (i == 1) ? TRUE : FALSE; 1167 default: 1168 return FALSE; 1169 } 1170} 1171 1172/* Return a read mask for source registers deduced from opcode & write mask. */ 1173static unsigned 1174nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1175{ 1176 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; 1177 1178 switch (insn->Instruction.Opcode) { 1179 case TGSI_OPCODE_COS: 1180 case TGSI_OPCODE_SIN: 1181 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1182 case TGSI_OPCODE_DP3: 1183 return 0x7; 1184 case TGSI_OPCODE_DP4: 1185 case TGSI_OPCODE_DPH: 1186 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1187 return 0xf; 1188 case TGSI_OPCODE_DST: 1189 return mask & (c ? 0xa : 0x6); 1190 case TGSI_OPCODE_EX2: 1191 case TGSI_OPCODE_LG2: 1192 case TGSI_OPCODE_POW: 1193 case TGSI_OPCODE_RCP: 1194 case TGSI_OPCODE_RSQ: 1195 case TGSI_OPCODE_SCS: 1196 return 0x1; 1197 case TGSI_OPCODE_LIT: 1198 return 0xb; 1199 case TGSI_OPCODE_TEX: 1200 case TGSI_OPCODE_TXP: 1201 { 1202 const struct tgsi_instruction_ext_texture *tex; 1203 1204 assert(insn->Instruction.Extended); 1205 tex = &insn->InstructionExtTexture; 1206 1207 mask = 0x7; 1208 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) 1209 mask |= 0x8; 1210 1211 switch (tex->Texture) { 1212 case TGSI_TEXTURE_1D: 1213 mask &= 0x9; 1214 break; 1215 case TGSI_TEXTURE_2D: 1216 mask &= 0xb; 1217 break; 1218 default: 1219 break; 1220 } 1221 } 1222 return mask; 1223 case TGSI_OPCODE_XPD: 1224 x = 0; 1225 if (mask & 1) x |= 0x6; 1226 if (mask & 2) x |= 0x5; 1227 if (mask & 4) x |= 0x3; 1228 return x; 1229 default: 1230 break; 1231 } 1232 1233 return mask; 1234} 1235 1236static struct nv50_reg * 1237tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) 1238{ 1239 switch (dst->DstRegister.File) { 1240 case TGSI_FILE_TEMPORARY: 1241 return &pc->temp[dst->DstRegister.Index * 4 + c]; 1242 case TGSI_FILE_OUTPUT: 1243 return &pc->result[dst->DstRegister.Index * 4 + c]; 1244 case TGSI_FILE_NULL: 1245 return NULL; 1246 default: 1247 break; 1248 } 1249 1250 return NULL; 1251} 1252 1253static struct nv50_reg * 1254tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, 1255 boolean neg) 1256{ 1257 struct nv50_reg *r = NULL; 1258 struct nv50_reg *temp; 1259 unsigned sgn, c; 1260 1261 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); 1262 1263 c = tgsi_util_get_full_src_register_extswizzle(src, chan); 1264 switch (c) { 1265 case TGSI_EXTSWIZZLE_X: 1266 case TGSI_EXTSWIZZLE_Y: 1267 case TGSI_EXTSWIZZLE_Z: 1268 case TGSI_EXTSWIZZLE_W: 1269 switch (src->SrcRegister.File) { 1270 case TGSI_FILE_INPUT: 1271 r = &pc->attr[src->SrcRegister.Index * 4 + c]; 1272 break; 1273 case TGSI_FILE_TEMPORARY: 1274 r = &pc->temp[src->SrcRegister.Index * 4 + c]; 1275 break; 1276 case TGSI_FILE_CONSTANT: 1277 r = &pc->param[src->SrcRegister.Index * 4 + c]; 1278 break; 1279 case TGSI_FILE_IMMEDIATE: 1280 r = &pc->immd[src->SrcRegister.Index * 4 + c]; 1281 break; 1282 case TGSI_FILE_SAMPLER: 1283 break; 1284 default: 1285 assert(0); 1286 break; 1287 } 1288 break; 1289 case TGSI_EXTSWIZZLE_ZERO: 1290 r = alloc_immd(pc, 0.0); 1291 return r; 1292 case TGSI_EXTSWIZZLE_ONE: 1293 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET) 1294 return alloc_immd(pc, -1.0); 1295 return alloc_immd(pc, 1.0); 1296 default: 1297 assert(0); 1298 break; 1299 } 1300 1301 switch (sgn) { 1302 case TGSI_UTIL_SIGN_KEEP: 1303 break; 1304 case TGSI_UTIL_SIGN_CLEAR: 1305 temp = temp_temp(pc); 1306 emit_abs(pc, temp, r); 1307 r = temp; 1308 break; 1309 case TGSI_UTIL_SIGN_TOGGLE: 1310 if (neg) 1311 r->neg = 1; 1312 else { 1313 temp = temp_temp(pc); 1314 emit_neg(pc, temp, r); 1315 r = temp; 1316 } 1317 break; 1318 case TGSI_UTIL_SIGN_SET: 1319 temp = temp_temp(pc); 1320 emit_abs(pc, temp, r); 1321 if (neg) 1322 temp->neg = 1; 1323 else 1324 emit_neg(pc, temp, temp); 1325 r = temp; 1326 break; 1327 default: 1328 assert(0); 1329 break; 1330 } 1331 1332 return r; 1333} 1334 1335/* return TRUE for ops that produce only a single result */ 1336static boolean 1337is_scalar_op(unsigned op) 1338{ 1339 switch (op) { 1340 case TGSI_OPCODE_DP2: 1341 case TGSI_OPCODE_DP3: 1342 case TGSI_OPCODE_DP4: 1343 case TGSI_OPCODE_DPH: 1344 case TGSI_OPCODE_EX2: 1345 case TGSI_OPCODE_LG2: 1346 case TGSI_OPCODE_POW: 1347 case TGSI_OPCODE_RCP: 1348 case TGSI_OPCODE_RSQ: 1349 /* 1350 case TGSI_OPCODE_COS: 1351 case TGSI_OPCODE_KIL: 1352 case TGSI_OPCODE_LIT: 1353 case TGSI_OPCODE_SCS: 1354 case TGSI_OPCODE_SIN: 1355 */ 1356 return TRUE; 1357 default: 1358 return FALSE; 1359 } 1360} 1361 1362/* Returns a bitmask indicating which dst components depend 1363 * on source s, component c (reverse of nv50_tgsi_src_mask). 1364 */ 1365static unsigned 1366nv50_tgsi_dst_revdep(unsigned op, int s, int c) 1367{ 1368 if (is_scalar_op(op)) 1369 return 0x1; 1370 1371 switch (op) { 1372 case TGSI_OPCODE_DST: 1373 return (1 << c) & (s ? 0xa : 0x6); 1374 case TGSI_OPCODE_XPD: 1375 switch (c) { 1376 case 0: return 0x6; 1377 case 1: return 0x5; 1378 case 2: return 0x3; 1379 case 3: return 0x0; 1380 default: 1381 assert(0); 1382 return 0x0; 1383 } 1384 case TGSI_OPCODE_LIT: 1385 case TGSI_OPCODE_SCS: 1386 case TGSI_OPCODE_TEX: 1387 case TGSI_OPCODE_TXP: 1388 /* these take care of dangerous swizzles themselves */ 1389 return 0x0; 1390 case TGSI_OPCODE_IF: 1391 case TGSI_OPCODE_KIL: 1392 /* don't call this function for these ops */ 1393 assert(0); 1394 return 0; 1395 default: 1396 /* linear vector instruction */ 1397 return (1 << c); 1398 } 1399} 1400 1401static boolean 1402nv50_program_tx_insn(struct nv50_pc *pc, 1403 const struct tgsi_full_instruction *inst) 1404{ 1405 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 1406 unsigned mask, sat, unit; 1407 int i, c; 1408 1409 mask = inst->FullDstRegisters[0].DstRegister.WriteMask; 1410 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 1411 1412 memset(src, 0, sizeof(src)); 1413 1414 for (c = 0; c < 4; c++) { 1415 if ((mask & (1 << c)) && !pc->r_dst[c]) 1416 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); 1417 else 1418 dst[c] = pc->r_dst[c]; 1419 rdst[c] = dst[c]; 1420 } 1421 1422 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1423 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; 1424 unsigned src_mask; 1425 boolean neg_supp; 1426 1427 src_mask = nv50_tgsi_src_mask(inst, i); 1428 neg_supp = negate_supported(inst, i); 1429 1430 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) 1431 unit = fs->SrcRegister.Index; 1432 1433 for (c = 0; c < 4; c++) 1434 if (src_mask & (1 << c)) 1435 src[i][c] = tgsi_src(pc, c, fs, neg_supp); 1436 } 1437 1438 brdc = temp = pc->r_brdc; 1439 if (brdc && brdc->type != P_TEMP) { 1440 temp = temp_temp(pc); 1441 if (sat) 1442 brdc = temp; 1443 } else 1444 if (sat) { 1445 for (c = 0; c < 4; c++) { 1446 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 1447 continue; 1448 rdst[c] = dst[c]; 1449 dst[c] = temp_temp(pc); 1450 } 1451 } 1452 1453 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 1454 1455 switch (inst->Instruction.Opcode) { 1456 case TGSI_OPCODE_ABS: 1457 for (c = 0; c < 4; c++) { 1458 if (!(mask & (1 << c))) 1459 continue; 1460 emit_abs(pc, dst[c], src[0][c]); 1461 } 1462 break; 1463 case TGSI_OPCODE_ADD: 1464 for (c = 0; c < 4; c++) { 1465 if (!(mask & (1 << c))) 1466 continue; 1467 emit_add(pc, dst[c], src[0][c], src[1][c]); 1468 } 1469 break; 1470 case TGSI_OPCODE_COS: 1471 temp = temp_temp(pc); 1472 emit_precossin(pc, temp, src[0][0]); 1473 emit_flop(pc, 5, temp, temp); 1474 for (c = 0; c < 4; c++) { 1475 if (!(mask & (1 << c))) 1476 continue; 1477 emit_mov(pc, dst[c], temp); 1478 } 1479 break; 1480 case TGSI_OPCODE_DP3: 1481 emit_mul(pc, temp, src[0][0], src[1][0]); 1482 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1483 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 1484 break; 1485 case TGSI_OPCODE_DP4: 1486 emit_mul(pc, temp, src[0][0], src[1][0]); 1487 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1488 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1489 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 1490 break; 1491 case TGSI_OPCODE_DPH: 1492 emit_mul(pc, temp, src[0][0], src[1][0]); 1493 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1494 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1495 emit_add(pc, brdc, src[1][3], temp); 1496 break; 1497 case TGSI_OPCODE_DST: 1498 if (mask & (1 << 1)) 1499 emit_mul(pc, dst[1], src[0][1], src[1][1]); 1500 if (mask & (1 << 2)) 1501 emit_mov(pc, dst[2], src[0][2]); 1502 if (mask & (1 << 3)) 1503 emit_mov(pc, dst[3], src[1][3]); 1504 if (mask & (1 << 0)) 1505 emit_mov_immdval(pc, dst[0], 1.0f); 1506 break; 1507 case TGSI_OPCODE_EX2: 1508 emit_preex2(pc, temp, src[0][0]); 1509 emit_flop(pc, 6, brdc, temp); 1510 break; 1511 case TGSI_OPCODE_FLR: 1512 for (c = 0; c < 4; c++) { 1513 if (!(mask & (1 << c))) 1514 continue; 1515 emit_flr(pc, dst[c], src[0][c]); 1516 } 1517 break; 1518 case TGSI_OPCODE_FRC: 1519 temp = temp_temp(pc); 1520 for (c = 0; c < 4; c++) { 1521 if (!(mask & (1 << c))) 1522 continue; 1523 emit_flr(pc, temp, src[0][c]); 1524 emit_sub(pc, dst[c], src[0][c], temp); 1525 } 1526 break; 1527 case TGSI_OPCODE_KIL: 1528 emit_kil(pc, src[0][0]); 1529 emit_kil(pc, src[0][1]); 1530 emit_kil(pc, src[0][2]); 1531 emit_kil(pc, src[0][3]); 1532 pc->p->cfg.fp.regs[2] |= 0x00100000; 1533 break; 1534 case TGSI_OPCODE_LIT: 1535 emit_lit(pc, &dst[0], mask, &src[0][0]); 1536 break; 1537 case TGSI_OPCODE_LG2: 1538 emit_flop(pc, 3, brdc, src[0][0]); 1539 break; 1540 case TGSI_OPCODE_LRP: 1541 temp = temp_temp(pc); 1542 for (c = 0; c < 4; c++) { 1543 if (!(mask & (1 << c))) 1544 continue; 1545 emit_sub(pc, temp, src[1][c], src[2][c]); 1546 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 1547 } 1548 break; 1549 case TGSI_OPCODE_MAD: 1550 for (c = 0; c < 4; c++) { 1551 if (!(mask & (1 << c))) 1552 continue; 1553 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 1554 } 1555 break; 1556 case TGSI_OPCODE_MAX: 1557 for (c = 0; c < 4; c++) { 1558 if (!(mask & (1 << c))) 1559 continue; 1560 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 1561 } 1562 break; 1563 case TGSI_OPCODE_MIN: 1564 for (c = 0; c < 4; c++) { 1565 if (!(mask & (1 << c))) 1566 continue; 1567 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 1568 } 1569 break; 1570 case TGSI_OPCODE_MOV: 1571 case TGSI_OPCODE_SWZ: 1572 for (c = 0; c < 4; c++) { 1573 if (!(mask & (1 << c))) 1574 continue; 1575 emit_mov(pc, dst[c], src[0][c]); 1576 } 1577 break; 1578 case TGSI_OPCODE_MUL: 1579 for (c = 0; c < 4; c++) { 1580 if (!(mask & (1 << c))) 1581 continue; 1582 emit_mul(pc, dst[c], src[0][c], src[1][c]); 1583 } 1584 break; 1585 case TGSI_OPCODE_POW: 1586 emit_pow(pc, brdc, src[0][0], src[1][0]); 1587 break; 1588 case TGSI_OPCODE_RCP: 1589 emit_flop(pc, 0, brdc, src[0][0]); 1590 break; 1591 case TGSI_OPCODE_RSQ: 1592 emit_flop(pc, 2, brdc, src[0][0]); 1593 break; 1594 case TGSI_OPCODE_SCS: 1595 temp = temp_temp(pc); 1596 if (mask & 3) 1597 emit_precossin(pc, temp, src[0][0]); 1598 if (mask & (1 << 0)) 1599 emit_flop(pc, 5, dst[0], temp); 1600 if (mask & (1 << 1)) 1601 emit_flop(pc, 4, dst[1], temp); 1602 if (mask & (1 << 2)) 1603 emit_mov_immdval(pc, dst[2], 0.0); 1604 if (mask & (1 << 3)) 1605 emit_mov_immdval(pc, dst[3], 1.0); 1606 break; 1607 case TGSI_OPCODE_SGE: 1608 for (c = 0; c < 4; c++) { 1609 if (!(mask & (1 << c))) 1610 continue; 1611 emit_set(pc, 6, dst[c], src[0][c], src[1][c]); 1612 } 1613 break; 1614 case TGSI_OPCODE_SIN: 1615 temp = temp_temp(pc); 1616 emit_precossin(pc, temp, src[0][0]); 1617 emit_flop(pc, 4, temp, temp); 1618 for (c = 0; c < 4; c++) { 1619 if (!(mask & (1 << c))) 1620 continue; 1621 emit_mov(pc, dst[c], temp); 1622 } 1623 break; 1624 case TGSI_OPCODE_SLT: 1625 for (c = 0; c < 4; c++) { 1626 if (!(mask & (1 << c))) 1627 continue; 1628 emit_set(pc, 1, dst[c], src[0][c], src[1][c]); 1629 } 1630 break; 1631 case TGSI_OPCODE_SUB: 1632 for (c = 0; c < 4; c++) { 1633 if (!(mask & (1 << c))) 1634 continue; 1635 emit_sub(pc, dst[c], src[0][c], src[1][c]); 1636 } 1637 break; 1638 case TGSI_OPCODE_TEX: 1639 emit_tex(pc, dst, mask, src[0], unit, 1640 inst->InstructionExtTexture.Texture, FALSE); 1641 break; 1642 case TGSI_OPCODE_TXP: 1643 emit_tex(pc, dst, mask, src[0], unit, 1644 inst->InstructionExtTexture.Texture, TRUE); 1645 break; 1646 case TGSI_OPCODE_XPD: 1647 temp = temp_temp(pc); 1648 if (mask & (1 << 0)) { 1649 emit_mul(pc, temp, src[0][2], src[1][1]); 1650 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 1651 } 1652 if (mask & (1 << 1)) { 1653 emit_mul(pc, temp, src[0][0], src[1][2]); 1654 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 1655 } 1656 if (mask & (1 << 2)) { 1657 emit_mul(pc, temp, src[0][1], src[1][0]); 1658 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 1659 } 1660 if (mask & (1 << 3)) 1661 emit_mov_immdval(pc, dst[3], 1.0); 1662 break; 1663 case TGSI_OPCODE_END: 1664 break; 1665 default: 1666 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 1667 return FALSE; 1668 } 1669 1670 if (brdc) { 1671 if (sat) 1672 emit_sat(pc, brdc, brdc); 1673 for (c = 0; c < 4; c++) 1674 if ((mask & (1 << c)) && dst[c] != brdc) 1675 emit_mov(pc, dst[c], brdc); 1676 } else 1677 if (sat) { 1678 for (c = 0; c < 4; c++) { 1679 if (!(mask & (1 << c))) 1680 continue; 1681 /* in this case we saturate later */ 1682 if (dst[c]->type == P_TEMP && dst[c]->index < 0) 1683 continue; 1684 emit_sat(pc, rdst[c], dst[c]); 1685 } 1686 } 1687 1688 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1689 for (c = 0; c < 4; c++) { 1690 if (!src[i][c]) 1691 continue; 1692 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) 1693 FREE(src[i][c]); 1694 } 1695 } 1696 1697 kill_temp_temp(pc); 1698 return TRUE; 1699} 1700 1701static void 1702prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok, 1703 unsigned *r_usage[2]) 1704{ 1705 const struct tgsi_full_instruction *insn; 1706 const struct tgsi_full_src_register *src; 1707 const struct tgsi_dst_register *dst; 1708 1709 unsigned i, c, k, n, mask, *acc_p; 1710 1711 insn = &tok->FullInstruction; 1712 dst = &insn->FullDstRegisters[0].DstRegister; 1713 mask = dst->WriteMask; 1714 1715 if (!r_usage[0]) 1716 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned)); 1717 if (!r_usage[1]) 1718 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned)); 1719 1720 if (dst->File == TGSI_FILE_TEMPORARY) { 1721 for (c = 0; c < 4; c++) { 1722 if (!(mask & (1 << c))) 1723 continue; 1724 r_usage[0][dst->Index * 4 + c] = pc->insn_nr; 1725 } 1726 } 1727 1728 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 1729 src = &insn->FullSrcRegisters[i]; 1730 1731 switch (src->SrcRegister.File) { 1732 case TGSI_FILE_TEMPORARY: 1733 acc_p = r_usage[0]; 1734 break; 1735 case TGSI_FILE_INPUT: 1736 acc_p = r_usage[1]; 1737 break; 1738 default: 1739 continue; 1740 } 1741 1742 mask = nv50_tgsi_src_mask(insn, i); 1743 1744 for (c = 0; c < 4; c++) { 1745 if (!(mask & (1 << c))) 1746 continue; 1747 1748 k = tgsi_util_get_full_src_register_extswizzle(src, c); 1749 switch (k) { 1750 case TGSI_EXTSWIZZLE_X: 1751 case TGSI_EXTSWIZZLE_Y: 1752 case TGSI_EXTSWIZZLE_Z: 1753 case TGSI_EXTSWIZZLE_W: 1754 n = src->SrcRegister.Index * 4 + k; 1755 acc_p[n] = pc->insn_nr; 1756 break; 1757 default: 1758 break; 1759 } 1760 } 1761 } 1762} 1763 1764/* Returns a bitmask indicating which dst components need to be 1765 * written to temporaries first to avoid 'corrupting' sources. 1766 * 1767 * m[i] (out) indicate component to write in the i-th position 1768 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source 1769 */ 1770static unsigned 1771nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) 1772{ 1773 unsigned i, c, x, unsafe; 1774 1775 for (c = 0; c < 4; c++) 1776 m[c] = c; 1777 1778 /* Swap as long as a dst component written earlier is depended on 1779 * by one written later, but the next one isn't depended on by it. 1780 */ 1781 for (c = 0; c < 3; c++) { 1782 if (rdep[m[c + 1]] & (1 << m[c])) 1783 continue; /* if next one is depended on by us */ 1784 for (i = c + 1; i < 4; i++) 1785 /* if we are depended on by a later one */ 1786 if (rdep[m[c]] & (1 << m[i])) 1787 break; 1788 if (i == 4) 1789 continue; 1790 /* now, swap */ 1791 x = m[c]; 1792 m[c] = m[c + 1]; 1793 m[c + 1] = x; 1794 1795 /* restart */ 1796 c = 0; 1797 } 1798 1799 /* mark dependencies that could not be resolved by reordering */ 1800 for (i = 0; i < 3; ++i) 1801 for (c = i + 1; c < 4; ++c) 1802 if (rdep[m[i]] & (1 << m[c])) 1803 unsafe |= (1 << i); 1804 1805 /* NOTE: $unsafe is with respect to order, not component */ 1806 return unsafe; 1807} 1808 1809/* Select a suitable dst register for broadcasting scalar results, 1810 * or return NULL if we have to allocate an extra TEMP. 1811 * 1812 * If e.g. only 1 component is written, we may also emit the final 1813 * result to a write-only register. 1814 */ 1815static struct nv50_reg * 1816tgsi_broadcast_dst(struct nv50_pc *pc, 1817 const struct tgsi_full_dst_register *fd, unsigned mask) 1818{ 1819 if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) { 1820 int c = ffs(~mask & fd->DstRegister.WriteMask); 1821 if (c) 1822 return tgsi_dst(pc, c - 1, fd); 1823 } else { 1824 int c = ffs(fd->DstRegister.WriteMask) - 1; 1825 if ((1 << c) == fd->DstRegister.WriteMask) 1826 return tgsi_dst(pc, c, fd); 1827 } 1828 1829 return NULL; 1830} 1831 1832/* Scan source swizzles and return a bitmask indicating dst regs that 1833 * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 1834 */ 1835static unsigned 1836nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, 1837 unsigned rdep[4]) 1838{ 1839 const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0]; 1840 const struct tgsi_full_src_register *fs; 1841 unsigned i, deqs = 0; 1842 1843 for (i = 0; i < 4; ++i) 1844 rdep[i] = 0; 1845 1846 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 1847 unsigned chn, mask = nv50_tgsi_src_mask(insn, i); 1848 boolean neg_supp = negate_supported(insn, i); 1849 1850 fs = &insn->FullSrcRegisters[i]; 1851 if (fs->SrcRegister.File != fd->DstRegister.File || 1852 fs->SrcRegister.Index != fd->DstRegister.Index) 1853 continue; 1854 1855 for (chn = 0; chn < 4; ++chn) { 1856 unsigned s, c; 1857 1858 if (!(mask & (1 << chn))) /* src is not read */ 1859 continue; 1860 c = tgsi_util_get_full_src_register_extswizzle(fs, chn); 1861 s = tgsi_util_get_full_src_register_sign_mode(fs, chn); 1862 1863 if (c > TGSI_EXTSWIZZLE_W || 1864 !(fd->DstRegister.WriteMask & (1 << c))) 1865 continue; 1866 1867 /* no danger if src is copied to TEMP first */ 1868 if ((s != TGSI_UTIL_SIGN_KEEP) && 1869 (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp)) 1870 continue; 1871 1872 rdep[c] |= nv50_tgsi_dst_revdep( 1873 insn->Instruction.Opcode, i, chn); 1874 deqs |= (1 << c); 1875 } 1876 } 1877 1878 return deqs; 1879} 1880 1881static boolean 1882nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) 1883{ 1884 struct tgsi_full_instruction insn = tok->FullInstruction; 1885 const struct tgsi_full_dst_register *fd; 1886 unsigned i, deqs, rdep[4], m[4]; 1887 1888 fd = &tok->FullInstruction.FullDstRegisters[0]; 1889 deqs = nv50_tgsi_scan_swizzle(&insn, rdep); 1890 1891 if (is_scalar_op(insn.Instruction.Opcode)) { 1892 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); 1893 if (!pc->r_brdc) 1894 pc->r_brdc = temp_temp(pc); 1895 return nv50_program_tx_insn(pc, &insn); 1896 } 1897 pc->r_brdc = NULL; 1898 1899 if (!deqs) 1900 return nv50_program_tx_insn(pc, &insn); 1901 1902 deqs = nv50_revdep_reorder(m, rdep); 1903 1904 for (i = 0; i < 4; ++i) { 1905 assert(pc->r_dst[m[i]] == NULL); 1906 1907 insn.FullDstRegisters[0].DstRegister.WriteMask = 1908 fd->DstRegister.WriteMask & (1 << m[i]); 1909 1910 if (!insn.FullDstRegisters[0].DstRegister.WriteMask) 1911 continue; 1912 1913 if (deqs & (1 << i)) 1914 pc->r_dst[m[i]] = alloc_temp(pc, NULL); 1915 1916 if (!nv50_program_tx_insn(pc, &insn)) 1917 return FALSE; 1918 } 1919 1920 for (i = 0; i < 4; i++) { 1921 struct nv50_reg *reg = pc->r_dst[i]; 1922 if (!reg) 1923 continue; 1924 pc->r_dst[i] = NULL; 1925 1926 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) 1927 emit_sat(pc, tgsi_dst(pc, i, fd), reg); 1928 else 1929 emit_mov(pc, tgsi_dst(pc, i, fd), reg); 1930 free_temp(pc, reg); 1931 } 1932 1933 return TRUE; 1934} 1935 1936static unsigned 1937load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid, 1938 int *aid, int *p_oid) 1939{ 1940 struct nv50_reg *iv; 1941 int oid, c, n; 1942 unsigned mask = 0; 1943 1944 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p; 1945 1946 for (c = 0, n = i * 4; c < 4; c++, n++) { 1947 oid = (*p_oid)++; 1948 pc->attr[n].type = P_TEMP; 1949 pc->attr[n].index = i; 1950 1951 if (pc->attr[n].acc == acc[n]) 1952 continue; 1953 mask |= (1 << c); 1954 1955 pc->attr[n].acc = acc[n]; 1956 pc->attr[n].rhw = pc->attr[n].hw = -1; 1957 alloc_reg(pc, &pc->attr[n]); 1958 1959 pc->attr[n].rhw = (*aid)++; 1960 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]); 1961 1962 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4)); 1963 (*mid)++; 1964 pc->p->cfg.fp.regs[1] += 0x00010001; 1965 } 1966 1967 return mask; 1968} 1969 1970static boolean 1971nv50_program_tx_prep(struct nv50_pc *pc) 1972{ 1973 struct tgsi_parse_context p; 1974 boolean ret = FALSE; 1975 unsigned i, c; 1976 unsigned fcol, bcol, fcrd, depr; 1977 1978 /* count (centroid) perspective interpolations */ 1979 unsigned centroid_loads = 0; 1980 unsigned perspect_loads = 0; 1981 1982 /* track register access for temps and attrs */ 1983 unsigned *r_usage[2]; 1984 r_usage[0] = NULL; 1985 r_usage[1] = NULL; 1986 1987 depr = fcol = bcol = fcrd = 0xffff; 1988 1989 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 1990 pc->p->cfg.fp.regs[0] = 0x01000404; 1991 pc->p->cfg.fp.regs[1] = 0x00000400; 1992 } 1993 1994 tgsi_parse_init(&p, pc->p->pipe.tokens); 1995 while (!tgsi_parse_end_of_tokens(&p)) { 1996 const union tgsi_full_token *tok = &p.FullToken; 1997 1998 tgsi_parse_token(&p); 1999 switch (tok->Token.Type) { 2000 case TGSI_TOKEN_TYPE_IMMEDIATE: 2001 { 2002 const struct tgsi_full_immediate *imm = 2003 &p.FullToken.FullImmediate; 2004 2005 ctor_immd(pc, imm->u[0].Float, 2006 imm->u[1].Float, 2007 imm->u[2].Float, 2008 imm->u[3].Float); 2009 } 2010 break; 2011 case TGSI_TOKEN_TYPE_DECLARATION: 2012 { 2013 const struct tgsi_full_declaration *d; 2014 unsigned last, first, mode; 2015 2016 d = &p.FullToken.FullDeclaration; 2017 first = d->DeclarationRange.First; 2018 last = d->DeclarationRange.Last; 2019 2020 switch (d->Declaration.File) { 2021 case TGSI_FILE_TEMPORARY: 2022 if (pc->temp_nr < (last + 1)) 2023 pc->temp_nr = last + 1; 2024 break; 2025 case TGSI_FILE_OUTPUT: 2026 if (pc->result_nr < (last + 1)) 2027 pc->result_nr = last + 1; 2028 2029 if (!d->Declaration.Semantic) 2030 break; 2031 2032 switch (d->Semantic.SemanticName) { 2033 case TGSI_SEMANTIC_POSITION: 2034 depr = first; 2035 pc->p->cfg.fp.regs[2] |= 0x00000100; 2036 pc->p->cfg.fp.regs[3] |= 0x00000011; 2037 break; 2038 default: 2039 break; 2040 } 2041 2042 break; 2043 case TGSI_FILE_INPUT: 2044 { 2045 if (pc->attr_nr < (last + 1)) 2046 pc->attr_nr = last + 1; 2047 2048 if (pc->p->type != PIPE_SHADER_FRAGMENT) 2049 break; 2050 2051 switch (d->Declaration.Interpolate) { 2052 case TGSI_INTERPOLATE_CONSTANT: 2053 mode = INTERP_FLAT; 2054 break; 2055 case TGSI_INTERPOLATE_PERSPECTIVE: 2056 mode = INTERP_PERSPECTIVE; 2057 break; 2058 default: 2059 mode = INTERP_LINEAR; 2060 break; 2061 } 2062 2063 if (d->Declaration.Semantic) { 2064 switch (d->Semantic.SemanticName) { 2065 case TGSI_SEMANTIC_POSITION: 2066 fcrd = first; 2067 break; 2068 case TGSI_SEMANTIC_COLOR: 2069 fcol = first; 2070 mode = INTERP_PERSPECTIVE; 2071 break; 2072 case TGSI_SEMANTIC_BCOLOR: 2073 bcol = first; 2074 mode = INTERP_PERSPECTIVE; 2075 break; 2076 } 2077 } 2078 2079 if (d->Declaration.Centroid) { 2080 mode |= INTERP_CENTROID; 2081 if (mode & INTERP_PERSPECTIVE) 2082 centroid_loads++; 2083 } else 2084 if (mode & INTERP_PERSPECTIVE) 2085 perspect_loads++; 2086 2087 assert(last < 32); 2088 for (i = first; i <= last; i++) 2089 pc->interp_mode[i] = mode; 2090 } 2091 break; 2092 case TGSI_FILE_CONSTANT: 2093 if (pc->param_nr < (last + 1)) 2094 pc->param_nr = last + 1; 2095 break; 2096 case TGSI_FILE_SAMPLER: 2097 break; 2098 default: 2099 NOUVEAU_ERR("bad decl file %d\n", 2100 d->Declaration.File); 2101 goto out_err; 2102 } 2103 } 2104 break; 2105 case TGSI_TOKEN_TYPE_INSTRUCTION: 2106 pc->insn_nr++; 2107 prep_inspect_insn(pc, tok, r_usage); 2108 break; 2109 default: 2110 break; 2111 } 2112 } 2113 2114 if (pc->temp_nr) { 2115 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); 2116 if (!pc->temp) 2117 goto out_err; 2118 2119 for (i = 0; i < pc->temp_nr; i++) { 2120 for (c = 0; c < 4; c++) { 2121 pc->temp[i*4+c].type = P_TEMP; 2122 pc->temp[i*4+c].hw = -1; 2123 pc->temp[i*4+c].rhw = -1; 2124 pc->temp[i*4+c].index = i; 2125 pc->temp[i*4+c].acc = r_usage[0][i*4+c]; 2126 } 2127 } 2128 } 2129 2130 if (pc->attr_nr) { 2131 int oid = 4, mid = 4, aid = 0; 2132 /* oid = VP output id 2133 * aid = FP attribute/interpolant id 2134 * mid = VP output mapping field ID 2135 */ 2136 2137 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); 2138 if (!pc->attr) 2139 goto out_err; 2140 2141 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 2142 /* position should be loaded first */ 2143 if (fcrd != 0xffff) { 2144 unsigned mask; 2145 mid = 0; 2146 mask = load_fp_attrib(pc, fcrd, r_usage[1], 2147 &mid, &aid, &oid); 2148 oid = 0; 2149 pc->p->cfg.fp.regs[1] |= (mask << 24); 2150 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd; 2151 } 2152 pc->p->cfg.fp.map[0] += 0x03020100; 2153 2154 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */ 2155 2156 if (perspect_loads) { 2157 pc->iv_p = alloc_temp(pc, NULL); 2158 2159 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) { 2160 pc->p->cfg.fp.regs[1] |= 0x08000000; 2161 pc->iv_p->rhw = aid++; 2162 emit_interp(pc, pc->iv_p, NULL, 2163 INTERP_LINEAR); 2164 emit_flop(pc, 0, pc->iv_p, pc->iv_p); 2165 } else { 2166 pc->iv_p->rhw = aid - 1; 2167 emit_flop(pc, 0, pc->iv_p, 2168 &pc->attr[fcrd * 4 + 3]); 2169 } 2170 } 2171 2172 if (centroid_loads) { 2173 pc->iv_c = alloc_temp(pc, NULL); 2174 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++; 2175 emit_interp(pc, pc->iv_c, NULL, 2176 INTERP_CENTROID); 2177 emit_flop(pc, 0, pc->iv_c, pc->iv_c); 2178 pc->p->cfg.fp.regs[1] |= 0x08000000; 2179 } 2180 2181 for (c = 0; c < 4; c++) { 2182 /* I don't know what these values do, but 2183 * let's set them like the blob does: 2184 */ 2185 if (fcol != 0xffff && r_usage[1][fcol * 4 + c]) 2186 pc->p->cfg.fp.regs[0] += 0x00010000; 2187 if (bcol != 0xffff && r_usage[1][bcol * 4 + c]) 2188 pc->p->cfg.fp.regs[0] += 0x00010000; 2189 } 2190 2191 for (i = 0; i < pc->attr_nr; i++) 2192 load_fp_attrib(pc, i, r_usage[1], 2193 &mid, &aid, &oid); 2194 2195 if (pc->iv_p) 2196 free_temp(pc, pc->iv_p); 2197 if (pc->iv_c) 2198 free_temp(pc, pc->iv_c); 2199 2200 pc->p->cfg.fp.high_map = (mid / 4); 2201 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0); 2202 } else { 2203 /* vertex program */ 2204 for (i = 0; i < pc->attr_nr * 4; i++) { 2205 pc->p->cfg.vp.attr[aid / 32] |= 2206 (1 << (aid % 32)); 2207 pc->attr[i].type = P_ATTR; 2208 pc->attr[i].hw = aid++; 2209 pc->attr[i].index = i / 4; 2210 } 2211 } 2212 } 2213 2214 if (pc->result_nr) { 2215 int rid = 0; 2216 2217 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); 2218 if (!pc->result) 2219 goto out_err; 2220 2221 for (i = 0; i < pc->result_nr; i++) { 2222 for (c = 0; c < 4; c++) { 2223 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 2224 pc->result[i*4+c].type = P_TEMP; 2225 pc->result[i*4+c].hw = -1; 2226 pc->result[i*4+c].rhw = (i == depr) ? 2227 -1 : rid++; 2228 } else { 2229 pc->result[i*4+c].type = P_RESULT; 2230 pc->result[i*4+c].hw = rid++; 2231 } 2232 pc->result[i*4+c].index = i; 2233 } 2234 2235 if (pc->p->type == PIPE_SHADER_FRAGMENT && 2236 depr != 0xffff) { 2237 pc->result[depr * 4 + 2].rhw = 2238 (pc->result_nr - 1) * 4; 2239 } 2240 } 2241 } 2242 2243 if (pc->param_nr) { 2244 int rid = 0; 2245 2246 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); 2247 if (!pc->param) 2248 goto out_err; 2249 2250 for (i = 0; i < pc->param_nr; i++) { 2251 for (c = 0; c < 4; c++) { 2252 pc->param[i*4+c].type = P_CONST; 2253 pc->param[i*4+c].hw = rid++; 2254 pc->param[i*4+c].index = i; 2255 } 2256 } 2257 } 2258 2259 if (pc->immd_nr) { 2260 int rid = 0; 2261 2262 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); 2263 if (!pc->immd) 2264 goto out_err; 2265 2266 for (i = 0; i < pc->immd_nr; i++) { 2267 for (c = 0; c < 4; c++) { 2268 pc->immd[i*4+c].type = P_IMMD; 2269 pc->immd[i*4+c].hw = rid++; 2270 pc->immd[i*4+c].index = i; 2271 } 2272 } 2273 } 2274 2275 ret = TRUE; 2276out_err: 2277 if (r_usage[0]) 2278 FREE(r_usage[0]); 2279 if (r_usage[1]) 2280 FREE(r_usage[1]); 2281 2282 tgsi_parse_free(&p); 2283 return ret; 2284} 2285 2286static void 2287free_nv50_pc(struct nv50_pc *pc) 2288{ 2289 if (pc->immd) 2290 FREE(pc->immd); 2291 if (pc->param) 2292 FREE(pc->param); 2293 if (pc->result) 2294 FREE(pc->result); 2295 if (pc->attr) 2296 FREE(pc->attr); 2297 if (pc->temp) 2298 FREE(pc->temp); 2299 2300 FREE(pc); 2301} 2302 2303static boolean 2304nv50_program_tx(struct nv50_program *p) 2305{ 2306 struct tgsi_parse_context parse; 2307 struct nv50_pc *pc; 2308 unsigned k; 2309 boolean ret; 2310 2311 pc = CALLOC_STRUCT(nv50_pc); 2312 if (!pc) 2313 return FALSE; 2314 pc->p = p; 2315 pc->p->cfg.high_temp = 4; 2316 2317 ret = nv50_program_tx_prep(pc); 2318 if (ret == FALSE) 2319 goto out_cleanup; 2320 2321 tgsi_parse_init(&parse, pc->p->pipe.tokens); 2322 while (!tgsi_parse_end_of_tokens(&parse)) { 2323 const union tgsi_full_token *tok = &parse.FullToken; 2324 2325 /* don't allow half insn/immd on first and last instruction */ 2326 pc->allow32 = TRUE; 2327 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) 2328 pc->allow32 = FALSE; 2329 2330 tgsi_parse_token(&parse); 2331 2332 switch (tok->Token.Type) { 2333 case TGSI_TOKEN_TYPE_INSTRUCTION: 2334 ++pc->insn_cur; 2335 ret = nv50_tgsi_insn(pc, tok); 2336 if (ret == FALSE) 2337 goto out_err; 2338 break; 2339 default: 2340 break; 2341 } 2342 } 2343 2344 if (p->type == PIPE_SHADER_FRAGMENT) { 2345 struct nv50_reg out; 2346 2347 out.type = P_TEMP; 2348 for (k = 0; k < pc->result_nr * 4; k++) { 2349 if (pc->result[k].rhw == -1) 2350 continue; 2351 if (pc->result[k].hw != pc->result[k].rhw) { 2352 out.hw = pc->result[k].rhw; 2353 emit_mov(pc, &out, &pc->result[k]); 2354 } 2355 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1)) 2356 pc->p->cfg.high_result = pc->result[k].rhw + 1; 2357 } 2358 } 2359 2360 /* look for single half instructions and make them long */ 2361 struct nv50_program_exec *e, *e_prev; 2362 2363 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { 2364 if (!is_long(e)) 2365 k++; 2366 2367 if (!e->next || is_long(e->next)) { 2368 if (k & 1) 2369 convert_to_long(pc, e); 2370 k = 0; 2371 } 2372 2373 if (e->next) 2374 e_prev = e; 2375 } 2376 2377 if (!is_long(pc->p->exec_tail)) { 2378 /* this may occur if moving FP results */ 2379 assert(e_prev && !is_long(e_prev)); 2380 convert_to_long(pc, e_prev); 2381 convert_to_long(pc, pc->p->exec_tail); 2382 } 2383 2384 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); 2385 pc->p->exec_tail->inst[1] |= 0x00000001; 2386 2387 p->param_nr = pc->param_nr * 4; 2388 p->immd_nr = pc->immd_nr * 4; 2389 p->immd = pc->immd_buf; 2390 2391out_err: 2392 tgsi_parse_free(&parse); 2393 2394out_cleanup: 2395 free_nv50_pc(pc); 2396 return ret; 2397} 2398 2399static void 2400nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) 2401{ 2402 if (nv50_program_tx(p) == FALSE) 2403 assert(0); 2404 p->translated = TRUE; 2405} 2406 2407static void 2408nv50_program_upload_data(struct nv50_context *nv50, float *map, 2409 unsigned start, unsigned count, unsigned cbuf) 2410{ 2411 struct nouveau_channel *chan = nv50->screen->base.channel; 2412 struct nouveau_grobj *tesla = nv50->screen->tesla; 2413 2414 while (count) { 2415 unsigned nr = count > 2047 ? 2047 : count; 2416 2417 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2418 OUT_RING (chan, (cbuf << 0) | (start << 8)); 2419 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2420 OUT_RINGp (chan, map, nr); 2421 2422 map += nr; 2423 start += nr; 2424 count -= nr; 2425 } 2426} 2427 2428static void 2429nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 2430{ 2431 struct pipe_screen *pscreen = nv50->pipe.screen; 2432 2433 if (!p->data[0] && p->immd_nr) { 2434 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 2435 2436 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 2437 while (heap->next && heap->size < p->immd_nr) { 2438 struct nv50_program *evict = heap->next->priv; 2439 nouveau_resource_free(&evict->data[0]); 2440 } 2441 2442 if (nouveau_resource_alloc(heap, p->immd_nr, p, 2443 &p->data[0])) 2444 assert(0); 2445 } 2446 2447 /* immediates only need to be uploaded again when freed */ 2448 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 2449 p->immd_nr, NV50_CB_PMISC); 2450 } 2451 2452 if (!p->data[1] && p->param_nr) { 2453 struct nouveau_resource *heap = 2454 nv50->screen->parm_heap[p->type]; 2455 2456 if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) { 2457 while (heap->next && heap->size < p->param_nr) { 2458 struct nv50_program *evict = heap->next->priv; 2459 nouveau_resource_free(&evict->data[1]); 2460 } 2461 2462 if (nouveau_resource_alloc(heap, p->param_nr, p, 2463 &p->data[1])) 2464 assert(0); 2465 } 2466 } 2467 2468 if (p->param_nr) { 2469 unsigned cbuf = NV50_CB_PVP; 2470 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 2471 PIPE_BUFFER_USAGE_CPU_READ); 2472 if (p->type == PIPE_SHADER_FRAGMENT) 2473 cbuf = NV50_CB_PFP; 2474 nv50_program_upload_data(nv50, map, p->data[1]->start, 2475 p->param_nr, cbuf); 2476 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 2477 } 2478} 2479 2480static void 2481nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 2482{ 2483 struct nouveau_channel *chan = nv50->screen->base.channel; 2484 struct nouveau_grobj *tesla = nv50->screen->tesla; 2485 struct nv50_program_exec *e; 2486 struct nouveau_stateobj *so; 2487 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; 2488 unsigned start, count, *up, *ptr; 2489 boolean upload = FALSE; 2490 2491 if (!p->bo) { 2492 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 2493 p->exec_size * 4, &p->bo); 2494 upload = TRUE; 2495 } 2496 2497 if ((p->data[0] && p->data[0]->start != p->data_start[0]) || 2498 (p->data[1] && p->data[1]->start != p->data_start[1])) { 2499 for (e = p->exec_head; e; e = e->next) { 2500 unsigned ei, ci, bs; 2501 2502 if (e->param.index < 0) 2503 continue; 2504 bs = (e->inst[1] >> 22) & 0x07; 2505 assert(bs < 2); 2506 ei = e->param.shift >> 5; 2507 ci = e->param.index + p->data[bs]->start; 2508 2509 e->inst[ei] &= ~e->param.mask; 2510 e->inst[ei] |= (ci << e->param.shift); 2511 } 2512 2513 if (p->data[0]) 2514 p->data_start[0] = p->data[0]->start; 2515 if (p->data[1]) 2516 p->data_start[1] = p->data[1]->start; 2517 2518 upload = TRUE; 2519 } 2520 2521 if (!upload) 2522 return; 2523 2524#ifdef NV50_PROGRAM_DUMP 2525 NOUVEAU_ERR("-------\n"); 2526 for (e = p->exec_head; e; e = e->next) { 2527 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 2528 if (is_long(e)) 2529 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 2530 } 2531#endif 2532 2533 up = ptr = MALLOC(p->exec_size * 4); 2534 for (e = p->exec_head; e; e = e->next) { 2535 *(ptr++) = e->inst[0]; 2536 if (is_long(e)) 2537 *(ptr++) = e->inst[1]; 2538 } 2539 2540 so = so_new(4,2); 2541 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); 2542 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); 2543 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); 2544 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); 2545 2546 start = 0; count = p->exec_size; 2547 while (count) { 2548 struct nouveau_channel *chan = nv50->screen->base.channel; 2549 unsigned nr; 2550 2551 so_emit(chan, so); 2552 2553 nr = MIN2(count, 2047); 2554 nr = MIN2(chan->pushbuf->remaining, nr); 2555 if (chan->pushbuf->remaining < (nr + 3)) { 2556 FIRE_RING(chan); 2557 continue; 2558 } 2559 2560 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2561 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); 2562 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2563 OUT_RINGp (chan, up + start, nr); 2564 2565 start += nr; 2566 count -= nr; 2567 } 2568 2569 FREE(up); 2570 so_ref(NULL, &so); 2571} 2572 2573void 2574nv50_vertprog_validate(struct nv50_context *nv50) 2575{ 2576 struct nouveau_grobj *tesla = nv50->screen->tesla; 2577 struct nv50_program *p = nv50->vertprog; 2578 struct nouveau_stateobj *so; 2579 2580 if (!p->translated) { 2581 nv50_program_validate(nv50, p); 2582 if (!p->translated) 2583 assert(0); 2584 } 2585 2586 nv50_program_validate_data(nv50, p); 2587 nv50_program_validate_code(nv50, p); 2588 2589 so = so_new(13, 2); 2590 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 2591 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2592 NOUVEAU_BO_HIGH, 0, 0); 2593 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2594 NOUVEAU_BO_LOW, 0, 0); 2595 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 2596 so_data (so, p->cfg.vp.attr[0]); 2597 so_data (so, p->cfg.vp.attr[1]); 2598 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 2599 so_data (so, p->cfg.high_result); 2600 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 2601 so_data (so, p->cfg.high_result); //8); 2602 so_data (so, p->cfg.high_temp); 2603 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 2604 so_data (so, 0); /* program start offset */ 2605 so_ref(so, &nv50->state.vertprog); 2606 so_ref(NULL, &so); 2607} 2608 2609void 2610nv50_fragprog_validate(struct nv50_context *nv50) 2611{ 2612 struct nouveau_grobj *tesla = nv50->screen->tesla; 2613 struct nv50_program *p = nv50->fragprog; 2614 struct nouveau_stateobj *so; 2615 unsigned i; 2616 2617 if (!p->translated) { 2618 nv50_program_validate(nv50, p); 2619 if (!p->translated) 2620 assert(0); 2621 } 2622 2623 nv50_program_validate_data(nv50, p); 2624 nv50_program_validate_code(nv50, p); 2625 2626 so = so_new(64, 2); 2627 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 2628 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2629 NOUVEAU_BO_HIGH, 0, 0); 2630 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2631 NOUVEAU_BO_LOW, 0, 0); 2632 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 2633 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */ 2634 so_data (so, 0x00000004); 2635 so_data (so, 0x00000000); 2636 so_data (so, 0x00000000); 2637 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map); 2638 for (i = 0; i < p->cfg.fp.high_map; i++) 2639 so_data(so, p->cfg.fp.map[i]); 2640 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2); 2641 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */ 2642 so_data (so, p->cfg.high_temp); 2643 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 2644 so_data (so, p->cfg.high_result); 2645 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1); 2646 so_data (so, p->cfg.fp.regs[2]); 2647 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 2648 so_data (so, p->cfg.fp.regs[3]); 2649 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 2650 so_data (so, 0); /* program start offset */ 2651 so_ref(so, &nv50->state.fragprog); 2652 so_ref(NULL, &so); 2653} 2654 2655void 2656nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 2657{ 2658 while (p->exec_head) { 2659 struct nv50_program_exec *e = p->exec_head; 2660 2661 p->exec_head = e->next; 2662 FREE(e); 2663 } 2664 p->exec_tail = NULL; 2665 p->exec_size = 0; 2666 2667 nouveau_bo_ref(NULL, &p->bo); 2668 2669 nouveau_resource_free(&p->data[0]); 2670 nouveau_resource_free(&p->data[1]); 2671 2672 p->translated = 0; 2673} 2674